Message ID | 20210604163506.2103900-1-nsaenzju@redhat.com (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
Series | mm: mmap_lock: Use local locks instead of disabling preemption | expand |
I applied this patch and tried it out (*without* PREEMPT_RT), and it works fine. Reading through the code carefully, it looks correct to me. I'm not as familiar with testing PREEMPT_RT kernels though, so likely someone with more familiarity should take a look. Feel free to take: Reviewed-by: Axel Rasmussen <axelrasmussen@google.com> On Fri, Jun 4, 2021 at 9:35 AM Nicolas Saenz Julienne <nsaenzju@redhat.com> wrote: > > mmap_lock will explicitly disable/enable preemption upon manipulating > its local CPU variables. This is to be expected, but in this case, it > doesn't play well with PREEMPT_RT. The preemption disabled code section > also takes a spin-lock. Spin-locks in RT systems will try to schedule, > which is exactly what we're trying to avoid. > > To mitigate this, convert the explicit preemption handling to > local_locks. Which are RT aware, and will disable migration instead of > preemption when PREEMPT_RT=y. > > The faulty call trace looks like the following: > __mmap_lock_do_trace_*() > preempt_disable() > get_mm_memcg_path() > cgroup_path() > kernfs_path_from_node() > spin_lock_irqsave() /* Scheduling while atomic! */ > > Fixes: 2b5067a8143e3 ("mm: mmap_lock: add tracepoints around lock acquisition ") > Signed-off-by: Nicolas Saenz Julienne <nsaenzju@redhat.com> > --- > mm/mmap_lock.c | 33 ++++++++++++++++++++++----------- > 1 file changed, 22 insertions(+), 11 deletions(-) > > diff --git a/mm/mmap_lock.c b/mm/mmap_lock.c > index dcdde4f722a4..2ae3f33b85b1 100644 > --- a/mm/mmap_lock.c > +++ b/mm/mmap_lock.c > @@ -11,6 +11,7 @@ > #include <linux/rcupdate.h> > #include <linux/smp.h> > #include <linux/trace_events.h> > +#include <linux/local_lock.h> > > EXPORT_TRACEPOINT_SYMBOL(mmap_lock_start_locking); > EXPORT_TRACEPOINT_SYMBOL(mmap_lock_acquire_returned); > @@ -39,21 +40,30 @@ static int reg_refcount; /* Protected by reg_lock. */ > */ > #define CONTEXT_COUNT 4 > > -static DEFINE_PER_CPU(char __rcu *, memcg_path_buf); > +struct memcg_path { > + local_lock_t lock; > + char __rcu *buf; > + local_t buf_idx; > +}; > +static DEFINE_PER_CPU(struct memcg_path, memcg_paths) = { > + .lock = INIT_LOCAL_LOCK(lock), > + .buf_idx = LOCAL_INIT(0), > +}; > + > static char **tmp_bufs; > -static DEFINE_PER_CPU(int, memcg_path_buf_idx); > > /* Called with reg_lock held. */ > static void free_memcg_path_bufs(void) > { > + struct memcg_path *memcg_path; > int cpu; > char **old = tmp_bufs; > > for_each_possible_cpu(cpu) { > - *(old++) = rcu_dereference_protected( > - per_cpu(memcg_path_buf, cpu), > + memcg_path = per_cpu_ptr(&memcg_paths, cpu); > + *(old++) = rcu_dereference_protected(memcg_path->buf, > lockdep_is_held(®_lock)); > - rcu_assign_pointer(per_cpu(memcg_path_buf, cpu), NULL); > + rcu_assign_pointer(memcg_path->buf, NULL); > } > > /* Wait for inflight memcg_path_buf users to finish. */ > @@ -88,7 +98,7 @@ int trace_mmap_lock_reg(void) > new = kmalloc(MEMCG_PATH_BUF_SIZE * CONTEXT_COUNT, GFP_KERNEL); > if (new == NULL) > goto out_fail_free; > - rcu_assign_pointer(per_cpu(memcg_path_buf, cpu), new); > + rcu_assign_pointer(per_cpu_ptr(&memcg_paths, cpu)->buf, new); > /* Don't need to wait for inflights, they'd have gotten NULL. */ > } > > @@ -122,23 +132,24 @@ void trace_mmap_lock_unreg(void) > > static inline char *get_memcg_path_buf(void) > { > + struct memcg_path *memcg_path = this_cpu_ptr(&memcg_paths); > char *buf; > int idx; > > rcu_read_lock(); > - buf = rcu_dereference(*this_cpu_ptr(&memcg_path_buf)); > + buf = rcu_dereference(memcg_path->buf); > if (buf == NULL) { > rcu_read_unlock(); > return NULL; > } > - idx = this_cpu_add_return(memcg_path_buf_idx, MEMCG_PATH_BUF_SIZE) - > + idx = local_add_return(MEMCG_PATH_BUF_SIZE, &memcg_path->buf_idx) - > MEMCG_PATH_BUF_SIZE; > return &buf[idx]; > } > > static inline void put_memcg_path_buf(void) > { > - this_cpu_sub(memcg_path_buf_idx, MEMCG_PATH_BUF_SIZE); > + local_sub(MEMCG_PATH_BUF_SIZE, &this_cpu_ptr(&memcg_paths)->buf_idx); > rcu_read_unlock(); > } > > @@ -179,14 +190,14 @@ static const char *get_mm_memcg_path(struct mm_struct *mm) > #define TRACE_MMAP_LOCK_EVENT(type, mm, ...) \ > do { \ > const char *memcg_path; \ > - preempt_disable(); \ > + local_lock(&memcg_paths.lock); \ > memcg_path = get_mm_memcg_path(mm); \ > trace_mmap_lock_##type(mm, \ > memcg_path != NULL ? memcg_path : "", \ > ##__VA_ARGS__); \ > if (likely(memcg_path != NULL)) \ > put_memcg_path_buf(); \ > - preempt_enable(); \ > + local_unlock(&memcg_paths.lock); \ > } while (0) > > #else /* !CONFIG_MEMCG */ > -- > 2.31.1 >
diff --git a/mm/mmap_lock.c b/mm/mmap_lock.c index dcdde4f722a4..2ae3f33b85b1 100644 --- a/mm/mmap_lock.c +++ b/mm/mmap_lock.c @@ -11,6 +11,7 @@ #include <linux/rcupdate.h> #include <linux/smp.h> #include <linux/trace_events.h> +#include <linux/local_lock.h> EXPORT_TRACEPOINT_SYMBOL(mmap_lock_start_locking); EXPORT_TRACEPOINT_SYMBOL(mmap_lock_acquire_returned); @@ -39,21 +40,30 @@ static int reg_refcount; /* Protected by reg_lock. */ */ #define CONTEXT_COUNT 4 -static DEFINE_PER_CPU(char __rcu *, memcg_path_buf); +struct memcg_path { + local_lock_t lock; + char __rcu *buf; + local_t buf_idx; +}; +static DEFINE_PER_CPU(struct memcg_path, memcg_paths) = { + .lock = INIT_LOCAL_LOCK(lock), + .buf_idx = LOCAL_INIT(0), +}; + static char **tmp_bufs; -static DEFINE_PER_CPU(int, memcg_path_buf_idx); /* Called with reg_lock held. */ static void free_memcg_path_bufs(void) { + struct memcg_path *memcg_path; int cpu; char **old = tmp_bufs; for_each_possible_cpu(cpu) { - *(old++) = rcu_dereference_protected( - per_cpu(memcg_path_buf, cpu), + memcg_path = per_cpu_ptr(&memcg_paths, cpu); + *(old++) = rcu_dereference_protected(memcg_path->buf, lockdep_is_held(®_lock)); - rcu_assign_pointer(per_cpu(memcg_path_buf, cpu), NULL); + rcu_assign_pointer(memcg_path->buf, NULL); } /* Wait for inflight memcg_path_buf users to finish. */ @@ -88,7 +98,7 @@ int trace_mmap_lock_reg(void) new = kmalloc(MEMCG_PATH_BUF_SIZE * CONTEXT_COUNT, GFP_KERNEL); if (new == NULL) goto out_fail_free; - rcu_assign_pointer(per_cpu(memcg_path_buf, cpu), new); + rcu_assign_pointer(per_cpu_ptr(&memcg_paths, cpu)->buf, new); /* Don't need to wait for inflights, they'd have gotten NULL. */ } @@ -122,23 +132,24 @@ void trace_mmap_lock_unreg(void) static inline char *get_memcg_path_buf(void) { + struct memcg_path *memcg_path = this_cpu_ptr(&memcg_paths); char *buf; int idx; rcu_read_lock(); - buf = rcu_dereference(*this_cpu_ptr(&memcg_path_buf)); + buf = rcu_dereference(memcg_path->buf); if (buf == NULL) { rcu_read_unlock(); return NULL; } - idx = this_cpu_add_return(memcg_path_buf_idx, MEMCG_PATH_BUF_SIZE) - + idx = local_add_return(MEMCG_PATH_BUF_SIZE, &memcg_path->buf_idx) - MEMCG_PATH_BUF_SIZE; return &buf[idx]; } static inline void put_memcg_path_buf(void) { - this_cpu_sub(memcg_path_buf_idx, MEMCG_PATH_BUF_SIZE); + local_sub(MEMCG_PATH_BUF_SIZE, &this_cpu_ptr(&memcg_paths)->buf_idx); rcu_read_unlock(); } @@ -179,14 +190,14 @@ static const char *get_mm_memcg_path(struct mm_struct *mm) #define TRACE_MMAP_LOCK_EVENT(type, mm, ...) \ do { \ const char *memcg_path; \ - preempt_disable(); \ + local_lock(&memcg_paths.lock); \ memcg_path = get_mm_memcg_path(mm); \ trace_mmap_lock_##type(mm, \ memcg_path != NULL ? memcg_path : "", \ ##__VA_ARGS__); \ if (likely(memcg_path != NULL)) \ put_memcg_path_buf(); \ - preempt_enable(); \ + local_unlock(&memcg_paths.lock); \ } while (0) #else /* !CONFIG_MEMCG */
mmap_lock will explicitly disable/enable preemption upon manipulating its local CPU variables. This is to be expected, but in this case, it doesn't play well with PREEMPT_RT. The preemption disabled code section also takes a spin-lock. Spin-locks in RT systems will try to schedule, which is exactly what we're trying to avoid. To mitigate this, convert the explicit preemption handling to local_locks. Which are RT aware, and will disable migration instead of preemption when PREEMPT_RT=y. The faulty call trace looks like the following: __mmap_lock_do_trace_*() preempt_disable() get_mm_memcg_path() cgroup_path() kernfs_path_from_node() spin_lock_irqsave() /* Scheduling while atomic! */ Fixes: 2b5067a8143e3 ("mm: mmap_lock: add tracepoints around lock acquisition ") Signed-off-by: Nicolas Saenz Julienne <nsaenzju@redhat.com> --- mm/mmap_lock.c | 33 ++++++++++++++++++++++----------- 1 file changed, 22 insertions(+), 11 deletions(-)