Message ID | 20241120000826.335387-3-surenb@google.com (mailing list archive) |
---|---|
State | New |
Headers | show |
Series | move per-vma lock into vm_area_struct | expand |
On Tue, Nov 19, 2024 at 04:08:23PM -0800, Suren Baghdasaryan wrote: > Back when per-vma locks were introduces, vm_lock was moved out of > vm_area_struct in [1] because of the performance regression caused by > false cacheline sharing. Recent investigation [2] revealed that the > regressions is limited to a rather old Broadwell microarchitecture and > even there it can be mitigated by disabling adjacent cacheline > prefetching, see [3]. > Splitting single logical structure into multiple ones leads to more > complicated management, extra pointer dereferences and overall less > maintainable code. When that split-away part is a lock, it complicates > things even further. With no performance benefits, there are no reasons > for this split. Merging the vm_lock back into vm_area_struct also allows > vm_area_struct to use SLAB_TYPESAFE_BY_RCU later in this patchset. > Move vm_lock back into vm_area_struct, aligning it at the cacheline > boundary and changing the cache to be cacheline-aligned as well. > With kernel compiled using defconfig, this causes VMA memory consumption > to grow from 160 (vm_area_struct) + 40 (vm_lock) bytes to 256 bytes: > > slabinfo before: > <name> ... <objsize> <objperslab> <pagesperslab> : ... > vma_lock ... 40 102 1 : ... > vm_area_struct ... 160 51 2 : ... > > slabinfo after moving vm_lock: > <name> ... <objsize> <objperslab> <pagesperslab> : ... > vm_area_struct ... 256 32 2 : ... > > Aggregate VMA memory consumption per 1000 VMAs grows from 50 to 64 pages, > which is 5.5MB per 100000 VMAs. Note that the size of this structure is > dependent on the kernel configuration and typically the original size is > higher than 160 bytes. Therefore these calculations are close to the > worst case scenario. A more realistic vm_area_struct usage before this > change is: > > <name> ... <objsize> <objperslab> <pagesperslab> : ... > vma_lock ... 40 102 1 : ... > vm_area_struct ... 176 46 2 : ... 
> > Aggregate VMA memory consumption per 1000 VMAs grows from 54 to 64 pages, > which is 3.9MB per 100000 VMAs. > This memory consumption growth can be addressed later by optimizing the > vm_lock. > > [1] https://lore.kernel.org/all/20230227173632.3292573-34-surenb@google.com/ > [2] https://lore.kernel.org/all/ZsQyI%2F087V34JoIt@xsang-OptiPlex-9020/ > [3] https://lore.kernel.org/all/CAJuCfpEisU8Lfe96AYJDZ+OM4NoPmnw9bP53cT_kbfP_pR+-2g@mail.gmail.com/ > > Signed-off-by: Suren Baghdasaryan <surenb@google.com> > Reviewed-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com> Reviewed-by: Shakeel Butt <shakeel.butt@linux.dev> One question below. > --- a/include/linux/mm_types.h > +++ b/include/linux/mm_types.h > @@ -716,8 +716,6 @@ struct vm_area_struct { > * slowpath. > */ > unsigned int vm_lock_seq; > - /* Unstable RCU readers are allowed to read this. */ > - struct vma_lock *vm_lock; > #endif > > /* > @@ -770,6 +768,10 @@ struct vm_area_struct { > struct vma_numab_state *numab_state; /* NUMA Balancing state */ > #endif > struct vm_userfaultfd_ctx vm_userfaultfd_ctx; > +#ifdef CONFIG_PER_VMA_LOCK > + /* Unstable RCU readers are allowed to read this. */ > + struct vma_lock vm_lock ____cacheline_aligned_in_smp; > +#endif > } __randomize_layout; Do we just want 'struct vm_area_struct' to be cacheline aligned or do we want 'struct vma_lock vm_lock' to be on a separate cacheline as well?
On Wed, Nov 20, 2024 at 3:33 PM Shakeel Butt <shakeel.butt@linux.dev> wrote: > > On Tue, Nov 19, 2024 at 04:08:23PM -0800, Suren Baghdasaryan wrote: > > Back when per-vma locks were introduces, vm_lock was moved out of > > vm_area_struct in [1] because of the performance regression caused by > > false cacheline sharing. Recent investigation [2] revealed that the > > regressions is limited to a rather old Broadwell microarchitecture and > > even there it can be mitigated by disabling adjacent cacheline > > prefetching, see [3]. > > Splitting single logical structure into multiple ones leads to more > > complicated management, extra pointer dereferences and overall less > > maintainable code. When that split-away part is a lock, it complicates > > things even further. With no performance benefits, there are no reasons > > for this split. Merging the vm_lock back into vm_area_struct also allows > > vm_area_struct to use SLAB_TYPESAFE_BY_RCU later in this patchset. > > Move vm_lock back into vm_area_struct, aligning it at the cacheline > > boundary and changing the cache to be cacheline-aligned as well. > > With kernel compiled using defconfig, this causes VMA memory consumption > > to grow from 160 (vm_area_struct) + 40 (vm_lock) bytes to 256 bytes: > > > > slabinfo before: > > <name> ... <objsize> <objperslab> <pagesperslab> : ... > > vma_lock ... 40 102 1 : ... > > vm_area_struct ... 160 51 2 : ... > > > > slabinfo after moving vm_lock: > > <name> ... <objsize> <objperslab> <pagesperslab> : ... > > vm_area_struct ... 256 32 2 : ... > > > > Aggregate VMA memory consumption per 1000 VMAs grows from 50 to 64 pages, > > which is 5.5MB per 100000 VMAs. Note that the size of this structure is > > dependent on the kernel configuration and typically the original size is > > higher than 160 bytes. Therefore these calculations are close to the > > worst case scenario. A more realistic vm_area_struct usage before this > > change is: > > > > <name> ... 
<objsize> <objperslab> <pagesperslab> : ... > > vma_lock ... 40 102 1 : ... > > vm_area_struct ... 176 46 2 : ... > > > > Aggregate VMA memory consumption per 1000 VMAs grows from 54 to 64 pages, > > which is 3.9MB per 100000 VMAs. > > This memory consumption growth can be addressed later by optimizing the > > vm_lock. > > > > [1] https://lore.kernel.org/all/20230227173632.3292573-34-surenb@google.com/ > > [2] https://lore.kernel.org/all/ZsQyI%2F087V34JoIt@xsang-OptiPlex-9020/ > > [3] https://lore.kernel.org/all/CAJuCfpEisU8Lfe96AYJDZ+OM4NoPmnw9bP53cT_kbfP_pR+-2g@mail.gmail.com/ > > > > Signed-off-by: Suren Baghdasaryan <surenb@google.com> > > Reviewed-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com> > > Reviewed-by: Shakeel Butt <shakeel.butt@linux.dev> Thanks! > > > One question below. > > > --- a/include/linux/mm_types.h > > +++ b/include/linux/mm_types.h > > @@ -716,8 +716,6 @@ struct vm_area_struct { > > * slowpath. > > */ > > unsigned int vm_lock_seq; > > - /* Unstable RCU readers are allowed to read this. */ > > - struct vma_lock *vm_lock; > > #endif > > > > /* > > @@ -770,6 +768,10 @@ struct vm_area_struct { > > struct vma_numab_state *numab_state; /* NUMA Balancing state */ > > #endif > > struct vm_userfaultfd_ctx vm_userfaultfd_ctx; > > +#ifdef CONFIG_PER_VMA_LOCK > > + /* Unstable RCU readers are allowed to read this. */ > > + struct vma_lock vm_lock ____cacheline_aligned_in_smp; > > +#endif > > } __randomize_layout; > > Do we just want 'struct vm_area_struct' to be cacheline aligned or do we > want 'struct vma_lock vm_lock' to be on a separate cacheline as well? We want both to minimize cacheline sharing. >
On Wed, Nov 20, 2024 at 03:44:29PM -0800, Suren Baghdasaryan wrote: > On Wed, Nov 20, 2024 at 3:33 PM Shakeel Butt <shakeel.butt@linux.dev> wrote: > > > > On Tue, Nov 19, 2024 at 04:08:23PM -0800, Suren Baghdasaryan wrote: > > > Back when per-vma locks were introduces, vm_lock was moved out of > > > vm_area_struct in [1] because of the performance regression caused by > > > false cacheline sharing. Recent investigation [2] revealed that the > > > regressions is limited to a rather old Broadwell microarchitecture and > > > even there it can be mitigated by disabling adjacent cacheline > > > prefetching, see [3]. > > > Splitting single logical structure into multiple ones leads to more > > > complicated management, extra pointer dereferences and overall less > > > maintainable code. When that split-away part is a lock, it complicates > > > things even further. With no performance benefits, there are no reasons > > > for this split. Merging the vm_lock back into vm_area_struct also allows > > > vm_area_struct to use SLAB_TYPESAFE_BY_RCU later in this patchset. > > > Move vm_lock back into vm_area_struct, aligning it at the cacheline > > > boundary and changing the cache to be cacheline-aligned as well. > > > With kernel compiled using defconfig, this causes VMA memory consumption > > > to grow from 160 (vm_area_struct) + 40 (vm_lock) bytes to 256 bytes: > > > > > > slabinfo before: > > > <name> ... <objsize> <objperslab> <pagesperslab> : ... > > > vma_lock ... 40 102 1 : ... > > > vm_area_struct ... 160 51 2 : ... > > > > > > slabinfo after moving vm_lock: > > > <name> ... <objsize> <objperslab> <pagesperslab> : ... > > > vm_area_struct ... 256 32 2 : ... > > > > > > Aggregate VMA memory consumption per 1000 VMAs grows from 50 to 64 pages, > > > which is 5.5MB per 100000 VMAs. Note that the size of this structure is > > > dependent on the kernel configuration and typically the original size is > > > higher than 160 bytes. 
Therefore these calculations are close to the > > > worst case scenario. A more realistic vm_area_struct usage before this > > > change is: > > > > > > <name> ... <objsize> <objperslab> <pagesperslab> : ... > > > vma_lock ... 40 102 1 : ... > > > vm_area_struct ... 176 46 2 : ... > > > > > > Aggregate VMA memory consumption per 1000 VMAs grows from 54 to 64 pages, > > > which is 3.9MB per 100000 VMAs. > > > This memory consumption growth can be addressed later by optimizing the > > > vm_lock. > > > > > > [1] https://lore.kernel.org/all/20230227173632.3292573-34-surenb@google.com/ > > > [2] https://lore.kernel.org/all/ZsQyI%2F087V34JoIt@xsang-OptiPlex-9020/ > > > [3] https://lore.kernel.org/all/CAJuCfpEisU8Lfe96AYJDZ+OM4NoPmnw9bP53cT_kbfP_pR+-2g@mail.gmail.com/ > > > > > > Signed-off-by: Suren Baghdasaryan <surenb@google.com> > > > Reviewed-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com> > > > > Reviewed-by: Shakeel Butt <shakeel.butt@linux.dev> > > Thanks! > > > > > > > One question below. > > > > > --- a/include/linux/mm_types.h > > > +++ b/include/linux/mm_types.h > > > @@ -716,8 +716,6 @@ struct vm_area_struct { > > > * slowpath. > > > */ > > > unsigned int vm_lock_seq; > > > - /* Unstable RCU readers are allowed to read this. */ > > > - struct vma_lock *vm_lock; > > > #endif > > > > > > /* > > > @@ -770,6 +768,10 @@ struct vm_area_struct { > > > struct vma_numab_state *numab_state; /* NUMA Balancing state */ > > > #endif > > > struct vm_userfaultfd_ctx vm_userfaultfd_ctx; > > > +#ifdef CONFIG_PER_VMA_LOCK > > > + /* Unstable RCU readers are allowed to read this. */ > > > + struct vma_lock vm_lock ____cacheline_aligned_in_smp; > > > +#endif > > > } __randomize_layout; > > > > Do we just want 'struct vm_area_struct' to be cacheline aligned or do we > > want 'struct vma_lock vm_lock' to be on a separate cacheline as well? > > We want both to minimize cacheline sharing. 
> For later, you will need to add a pad after vm_lock as well, so any future addition will not share the cacheline with vm_lock. I would do something like below. This is a nit and can be done later. diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 7654c766cbe2..5cc4fff163a0 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -751,10 +751,12 @@ struct vm_area_struct { #endif struct vm_userfaultfd_ctx vm_userfaultfd_ctx; #ifdef CONFIG_PER_VMA_LOCK + CACHELINE_PADDING(__pad1__); /* Unstable RCU readers are allowed to read this. */ - struct vma_lock vm_lock ____cacheline_aligned_in_smp; + struct vma_lock vm_lock; + CACHELINE_PADDING(__pad2__); #endif -} __randomize_layout; +} __randomize_layout ____cacheline_aligned_in_smp; #ifdef CONFIG_NUMA #define vma_policy(vma) ((vma)->vm_policy)
On Wed, Nov 20, 2024 at 4:05 PM Shakeel Butt <shakeel.butt@linux.dev> wrote: > > On Wed, Nov 20, 2024 at 03:44:29PM -0800, Suren Baghdasaryan wrote: > > On Wed, Nov 20, 2024 at 3:33 PM Shakeel Butt <shakeel.butt@linux.dev> wrote: > > > > > > On Tue, Nov 19, 2024 at 04:08:23PM -0800, Suren Baghdasaryan wrote: > > > > Back when per-vma locks were introduces, vm_lock was moved out of > > > > vm_area_struct in [1] because of the performance regression caused by > > > > false cacheline sharing. Recent investigation [2] revealed that the > > > > regressions is limited to a rather old Broadwell microarchitecture and > > > > even there it can be mitigated by disabling adjacent cacheline > > > > prefetching, see [3]. > > > > Splitting single logical structure into multiple ones leads to more > > > > complicated management, extra pointer dereferences and overall less > > > > maintainable code. When that split-away part is a lock, it complicates > > > > things even further. With no performance benefits, there are no reasons > > > > for this split. Merging the vm_lock back into vm_area_struct also allows > > > > vm_area_struct to use SLAB_TYPESAFE_BY_RCU later in this patchset. > > > > Move vm_lock back into vm_area_struct, aligning it at the cacheline > > > > boundary and changing the cache to be cacheline-aligned as well. > > > > With kernel compiled using defconfig, this causes VMA memory consumption > > > > to grow from 160 (vm_area_struct) + 40 (vm_lock) bytes to 256 bytes: > > > > > > > > slabinfo before: > > > > <name> ... <objsize> <objperslab> <pagesperslab> : ... > > > > vma_lock ... 40 102 1 : ... > > > > vm_area_struct ... 160 51 2 : ... > > > > > > > > slabinfo after moving vm_lock: > > > > <name> ... <objsize> <objperslab> <pagesperslab> : ... > > > > vm_area_struct ... 256 32 2 : ... > > > > > > > > Aggregate VMA memory consumption per 1000 VMAs grows from 50 to 64 pages, > > > > which is 5.5MB per 100000 VMAs. 
Note that the size of this structure is > > > > dependent on the kernel configuration and typically the original size is > > > > higher than 160 bytes. Therefore these calculations are close to the > > > > worst case scenario. A more realistic vm_area_struct usage before this > > > > change is: > > > > > > > > <name> ... <objsize> <objperslab> <pagesperslab> : ... > > > > vma_lock ... 40 102 1 : ... > > > > vm_area_struct ... 176 46 2 : ... > > > > > > > > Aggregate VMA memory consumption per 1000 VMAs grows from 54 to 64 pages, > > > > which is 3.9MB per 100000 VMAs. > > > > This memory consumption growth can be addressed later by optimizing the > > > > vm_lock. > > > > > > > > [1] https://lore.kernel.org/all/20230227173632.3292573-34-surenb@google.com/ > > > > [2] https://lore.kernel.org/all/ZsQyI%2F087V34JoIt@xsang-OptiPlex-9020/ > > > > [3] https://lore.kernel.org/all/CAJuCfpEisU8Lfe96AYJDZ+OM4NoPmnw9bP53cT_kbfP_pR+-2g@mail.gmail.com/ > > > > > > > > Signed-off-by: Suren Baghdasaryan <surenb@google.com> > > > > Reviewed-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com> > > > > > > Reviewed-by: Shakeel Butt <shakeel.butt@linux.dev> > > > > Thanks! > > > > > > > > > > > One question below. > > > > > > > --- a/include/linux/mm_types.h > > > > +++ b/include/linux/mm_types.h > > > > @@ -716,8 +716,6 @@ struct vm_area_struct { > > > > * slowpath. > > > > */ > > > > unsigned int vm_lock_seq; > > > > - /* Unstable RCU readers are allowed to read this. */ > > > > - struct vma_lock *vm_lock; > > > > #endif > > > > > > > > /* > > > > @@ -770,6 +768,10 @@ struct vm_area_struct { > > > > struct vma_numab_state *numab_state; /* NUMA Balancing state */ > > > > #endif > > > > struct vm_userfaultfd_ctx vm_userfaultfd_ctx; > > > > +#ifdef CONFIG_PER_VMA_LOCK > > > > + /* Unstable RCU readers are allowed to read this. 
*/ > > > > + struct vma_lock vm_lock ____cacheline_aligned_in_smp; > > > > +#endif > > > > } __randomize_layout; > > > > > > Do we just want 'struct vm_area_struct' to be cacheline aligned or do we > > > want 'struct vma_lock vm_lock' to be on a separate cacheline as well? > > > > We want both to minimize cacheline sharing. > > > > For later, you will need to add a pad after vm_lock as well, so any > future addition will not share the cacheline with vm_lock. I would do > something like below. This is a nit and can be done later. > > diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h > index 7654c766cbe2..5cc4fff163a0 100644 > --- a/include/linux/mm_types.h > +++ b/include/linux/mm_types.h > @@ -751,10 +751,12 @@ struct vm_area_struct { > #endif > struct vm_userfaultfd_ctx vm_userfaultfd_ctx; > #ifdef CONFIG_PER_VMA_LOCK > + CACHELINE_PADDING(__pad1__); > /* Unstable RCU readers are allowed to read this. */ > - struct vma_lock vm_lock ____cacheline_aligned_in_smp; > + struct vma_lock vm_lock; > + CACHELINE_PADDING(__pad2__); > #endif > -} __randomize_layout; > +} __randomize_layout ____cacheline_aligned_in_smp; I thought SLAB_HWCACHE_ALIGN for vm_area_cachep added in this patch would have the same result, no? > > #ifdef CONFIG_NUMA > #define vma_policy(vma) ((vma)->vm_policy)
On Wed, Nov 20, 2024 at 04:33:37PM -0800, Suren Baghdasaryan wrote: [...] > > > > > > > > Do we just want 'struct vm_area_struct' to be cacheline aligned or do we > > > > want 'struct vma_lock vm_lock' to be on a separate cacheline as well? > > > > > > We want both to minimize cacheline sharing. > > > > > > > For later, you will need to add a pad after vm_lock as well, so any > > future addition will not share the cacheline with vm_lock. I would do > > something like below. This is a nit and can be done later. > > > > diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h > > index 7654c766cbe2..5cc4fff163a0 100644 > > --- a/include/linux/mm_types.h > > +++ b/include/linux/mm_types.h > > @@ -751,10 +751,12 @@ struct vm_area_struct { > > #endif > > struct vm_userfaultfd_ctx vm_userfaultfd_ctx; > > #ifdef CONFIG_PER_VMA_LOCK > > + CACHELINE_PADDING(__pad1__); > > /* Unstable RCU readers are allowed to read this. */ > > - struct vma_lock vm_lock ____cacheline_aligned_in_smp; > > + struct vma_lock vm_lock; > > + CACHELINE_PADDING(__pad2__); > > #endif > > -} __randomize_layout; > > +} __randomize_layout ____cacheline_aligned_in_smp; > > I thought SLAB_HWCACHE_ALIGN for vm_area_cachep added in this patch > would have the same result, no? > SLAB_HWCACHE_ALIGN is about the slub allocator returning cacheline-aligned memory for each object. It does not say anything about the internal layout of the struct for which the kmem_cache is being created. The ____cacheline_aligned_in_smp tag in your patch makes sure that the field vm_lock starts on a new cacheline, and there can be a hole between vm_lock and the previous field if the previous field does not end at a cacheline boundary. Please note that if you add a new field after vm_lock (without a cacheline alignment tag), it will be on the same cacheline as vm_lock. So, your code achieves the goal of starting vm_lock on its own cacheline, but it does not make vm_lock the only field on that cacheline.
On Wed, Nov 20, 2024 at 11:02 PM Shakeel Butt <shakeel.butt@linux.dev> wrote: > > On Wed, Nov 20, 2024 at 04:33:37PM -0800, Suren Baghdasaryan wrote: > [...] > > > > > > > > > > Do we just want 'struct vm_area_struct' to be cacheline aligned or do we > > > > > want 'struct vma_lock vm_lock' to be on a separate cacheline as well? > > > > > > > > We want both to minimize cacheline sharing. > > > > > > > > > > For later, you will need to add a pad after vm_lock as well, so any > > > future addition will not share the cacheline with vm_lock. I would do > > > something like below. This is a nit and can be done later. > > > > > > diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h > > > index 7654c766cbe2..5cc4fff163a0 100644 > > > --- a/include/linux/mm_types.h > > > +++ b/include/linux/mm_types.h > > > @@ -751,10 +751,12 @@ struct vm_area_struct { > > > #endif > > > struct vm_userfaultfd_ctx vm_userfaultfd_ctx; > > > #ifdef CONFIG_PER_VMA_LOCK > > > + CACHELINE_PADDING(__pad1__); > > > /* Unstable RCU readers are allowed to read this. */ > > > - struct vma_lock vm_lock ____cacheline_aligned_in_smp; > > > + struct vma_lock vm_lock; > > > + CACHELINE_PADDING(__pad2__); > > > #endif > > > -} __randomize_layout; > > > +} __randomize_layout ____cacheline_aligned_in_smp; > > > > I thought SLAB_HWCACHE_ALIGN for vm_area_cachep added in this patch > > would have the same result, no? > > > > SLAB_HWCACHE_ALIGN is more about the slub allocator allocating cache > aligned memory. It does not say anything about the internals of the > struct for which the kmem_cache is being created. The > ____cacheline_aligned_in_smp tag in your patch made sure that the field > vm_lock will be put in a new cacheline and there can be a hole between > vm_lock and the previous field if the previous field is not ending at > the cacheline boundary. Please note that if you add a new field after > vm_lock (without cacheline alignment tag), it will be on the same > cacheline as vm_lock. 
So, your code is achieving the vm_lock on its own > cacheline goal but vm_lock being the only field on that cacheline is not > being achieved. Sorry, I should have been more clear. It's ok if some fields which are rarely accessed in the pagefault path are placed in the same cacheline as vm_lock. In fact I've done that to pack them better in the previous version of this patchset here: https://lore.kernel.org/all/20241111205506.3404479-5-surenb@google.com/ (removed for now based on the feedback). So, vm_lock being the only field on the cacheline is not my goal. After this patchset I'm planning to try packing the members better and save some memory. >
On Thu, Nov 21, 2024 at 09:05:21AM -0800, Suren Baghdasaryan wrote: [...] > > Sorry, I should have been more clear. It's ok if some fields which are > rarely accessed in the pagefault path are placed in the same cacheline > with vm_lock. In fact I've done that to pack them better in the > previous version of this patchset here: > https://lore.kernel.org/all/20241111205506.3404479-5-surenb@google.com/ > (removed for now based on the feedback). So, vm_lock being the only > field on the cacheline is not my goal. After this patchset I'm > planning to try packing the members better and save some memory. > Nah, my bad, somehow I thought you wanted vm_lock to be on a cacheline alone.
diff --git a/include/linux/mm.h b/include/linux/mm.h index 1ba2e480ae63..737c003b0a1e 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -684,6 +684,12 @@ static inline void vma_numab_state_free(struct vm_area_struct *vma) {} #endif /* CONFIG_NUMA_BALANCING */ #ifdef CONFIG_PER_VMA_LOCK +static inline void vma_lock_init(struct vm_area_struct *vma) +{ + init_rwsem(&vma->vm_lock.lock); + vma->vm_lock_seq = UINT_MAX; +} + /* * Try to read-lock a vma. The function is allowed to occasionally yield false * locked result to avoid performance overhead, in which case we fall back to @@ -701,7 +707,7 @@ static inline bool vma_start_read(struct vm_area_struct *vma) if (READ_ONCE(vma->vm_lock_seq) == READ_ONCE(vma->vm_mm->mm_lock_seq.sequence)) return false; - if (unlikely(down_read_trylock(&vma->vm_lock->lock) == 0)) + if (unlikely(down_read_trylock(&vma->vm_lock.lock) == 0)) return false; /* @@ -716,7 +722,7 @@ static inline bool vma_start_read(struct vm_area_struct *vma) * This pairs with RELEASE semantics in vma_end_write_all(). 
*/ if (unlikely(vma->vm_lock_seq == raw_read_seqcount(&vma->vm_mm->mm_lock_seq))) { - up_read(&vma->vm_lock->lock); + up_read(&vma->vm_lock.lock); return false; } return true; @@ -731,7 +737,7 @@ static inline bool vma_start_read(struct vm_area_struct *vma) static inline void vma_start_read_locked_nested(struct vm_area_struct *vma, int subclass) { mmap_assert_locked(vma->vm_mm); - down_read_nested(&vma->vm_lock->lock, subclass); + down_read_nested(&vma->vm_lock.lock, subclass); } /* @@ -743,13 +749,13 @@ static inline void vma_start_read_locked_nested(struct vm_area_struct *vma, int static inline void vma_start_read_locked(struct vm_area_struct *vma) { mmap_assert_locked(vma->vm_mm); - down_read(&vma->vm_lock->lock); + down_read(&vma->vm_lock.lock); } static inline void vma_end_read(struct vm_area_struct *vma) { rcu_read_lock(); /* keeps vma alive till the end of up_read */ - up_read(&vma->vm_lock->lock); + up_read(&vma->vm_lock.lock); rcu_read_unlock(); } @@ -778,7 +784,7 @@ static inline void vma_start_write(struct vm_area_struct *vma) if (__is_vma_write_locked(vma, &mm_lock_seq)) return; - down_write(&vma->vm_lock->lock); + down_write(&vma->vm_lock.lock); /* * We should use WRITE_ONCE() here because we can have concurrent reads * from the early lockless pessimistic check in vma_start_read(). @@ -786,7 +792,7 @@ static inline void vma_start_write(struct vm_area_struct *vma) * we should use WRITE_ONCE() for cleanliness and to keep KCSAN happy. 
*/ WRITE_ONCE(vma->vm_lock_seq, mm_lock_seq); - up_write(&vma->vm_lock->lock); + up_write(&vma->vm_lock.lock); } static inline void vma_assert_write_locked(struct vm_area_struct *vma) @@ -798,7 +804,7 @@ static inline void vma_assert_write_locked(struct vm_area_struct *vma) static inline void vma_assert_locked(struct vm_area_struct *vma) { - if (!rwsem_is_locked(&vma->vm_lock->lock)) + if (!rwsem_is_locked(&vma->vm_lock.lock)) vma_assert_write_locked(vma); } @@ -831,6 +837,7 @@ struct vm_area_struct *lock_vma_under_rcu(struct mm_struct *mm, #else /* CONFIG_PER_VMA_LOCK */ +static inline void vma_lock_init(struct vm_area_struct *vma) {} static inline bool vma_start_read(struct vm_area_struct *vma) { return false; } static inline void vma_end_read(struct vm_area_struct *vma) {} @@ -865,10 +872,6 @@ static inline void assert_fault_locked(struct vm_fault *vmf) extern const struct vm_operations_struct vma_dummy_vm_ops; -/* - * WARNING: vma_init does not initialize vma->vm_lock. - * Use vm_area_alloc()/vm_area_free() if vma needs locking. - */ static inline void vma_init(struct vm_area_struct *vma, struct mm_struct *mm) { memset(vma, 0, sizeof(*vma)); @@ -877,6 +880,7 @@ static inline void vma_init(struct vm_area_struct *vma, struct mm_struct *mm) INIT_LIST_HEAD(&vma->anon_vma_chain); vma_mark_detached(vma, false); vma_numab_state_init(vma); + vma_lock_init(vma); } /* Use when VMA is not part of the VMA tree and needs no locking */ diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 80fef38d9d64..5c4bfdcfac72 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -716,8 +716,6 @@ struct vm_area_struct { * slowpath. */ unsigned int vm_lock_seq; - /* Unstable RCU readers are allowed to read this. 
*/ - struct vma_lock *vm_lock; #endif /* @@ -770,6 +768,10 @@ struct vm_area_struct { struct vma_numab_state *numab_state; /* NUMA Balancing state */ #endif struct vm_userfaultfd_ctx vm_userfaultfd_ctx; +#ifdef CONFIG_PER_VMA_LOCK + /* Unstable RCU readers are allowed to read this. */ + struct vma_lock vm_lock ____cacheline_aligned_in_smp; +#endif } __randomize_layout; #ifdef CONFIG_NUMA diff --git a/kernel/fork.c b/kernel/fork.c index 0061cf2450ef..7823797e31d2 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -436,35 +436,6 @@ static struct kmem_cache *vm_area_cachep; /* SLAB cache for mm_struct structures (tsk->mm) */ static struct kmem_cache *mm_cachep; -#ifdef CONFIG_PER_VMA_LOCK - -/* SLAB cache for vm_area_struct.lock */ -static struct kmem_cache *vma_lock_cachep; - -static bool vma_lock_alloc(struct vm_area_struct *vma) -{ - vma->vm_lock = kmem_cache_alloc(vma_lock_cachep, GFP_KERNEL); - if (!vma->vm_lock) - return false; - - init_rwsem(&vma->vm_lock->lock); - vma->vm_lock_seq = UINT_MAX; - - return true; -} - -static inline void vma_lock_free(struct vm_area_struct *vma) -{ - kmem_cache_free(vma_lock_cachep, vma->vm_lock); -} - -#else /* CONFIG_PER_VMA_LOCK */ - -static inline bool vma_lock_alloc(struct vm_area_struct *vma) { return true; } -static inline void vma_lock_free(struct vm_area_struct *vma) {} - -#endif /* CONFIG_PER_VMA_LOCK */ - struct vm_area_struct *vm_area_alloc(struct mm_struct *mm) { struct vm_area_struct *vma; @@ -474,10 +445,6 @@ struct vm_area_struct *vm_area_alloc(struct mm_struct *mm) return NULL; vma_init(vma, mm); - if (!vma_lock_alloc(vma)) { - kmem_cache_free(vm_area_cachep, vma); - return NULL; - } return vma; } @@ -496,10 +463,7 @@ struct vm_area_struct *vm_area_dup(struct vm_area_struct *orig) * will be reinitialized. 
*/ data_race(memcpy(new, orig, sizeof(*new))); - if (!vma_lock_alloc(new)) { - kmem_cache_free(vm_area_cachep, new); - return NULL; - } + vma_lock_init(new); INIT_LIST_HEAD(&new->anon_vma_chain); vma_numab_state_init(new); dup_anon_vma_name(orig, new); @@ -511,7 +475,6 @@ void __vm_area_free(struct vm_area_struct *vma) { vma_numab_state_free(vma); free_anon_vma_name(vma); - vma_lock_free(vma); kmem_cache_free(vm_area_cachep, vma); } @@ -522,7 +485,7 @@ static void vm_area_free_rcu_cb(struct rcu_head *head) vm_rcu); /* The vma should not be locked while being destroyed. */ - VM_BUG_ON_VMA(rwsem_is_locked(&vma->vm_lock->lock), vma); + VM_BUG_ON_VMA(rwsem_is_locked(&vma->vm_lock.lock), vma); __vm_area_free(vma); } #endif @@ -3168,11 +3131,9 @@ void __init proc_caches_init(void) sizeof(struct fs_struct), 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_ACCOUNT, NULL); - - vm_area_cachep = KMEM_CACHE(vm_area_struct, SLAB_PANIC|SLAB_ACCOUNT); -#ifdef CONFIG_PER_VMA_LOCK - vma_lock_cachep = KMEM_CACHE(vma_lock, SLAB_PANIC|SLAB_ACCOUNT); -#endif + vm_area_cachep = KMEM_CACHE(vm_area_struct, + SLAB_HWCACHE_ALIGN|SLAB_NO_MERGE|SLAB_PANIC| + SLAB_ACCOUNT); mmap_init(); nsproxy_cache_init(); } diff --git a/tools/testing/vma/vma_internal.h b/tools/testing/vma/vma_internal.h index 1d9fc97b8e80..11c2c38ca4e8 100644 --- a/tools/testing/vma/vma_internal.h +++ b/tools/testing/vma/vma_internal.h @@ -230,10 +230,10 @@ struct vm_area_struct { /* * Can only be written (using WRITE_ONCE()) while holding both: * - mmap_lock (in write mode) - * - vm_lock->lock (in write mode) + * - vm_lock.lock (in write mode) * Can be read reliably while holding one of: * - mmap_lock (in read or write mode) - * - vm_lock->lock (in read or write mode) + * - vm_lock.lock (in read or write mode) * Can be read unreliably (using READ_ONCE()) for pessimistic bailout * while holding nothing (except RCU to keep the VMA struct allocated). * @@ -242,7 +242,7 @@ struct vm_area_struct { * slowpath. 
*/ unsigned int vm_lock_seq; - struct vma_lock *vm_lock; + struct vma_lock vm_lock; #endif /* @@ -408,17 +408,10 @@ static inline struct vm_area_struct *vma_next(struct vma_iterator *vmi) return mas_find(&vmi->mas, ULONG_MAX); } -static inline bool vma_lock_alloc(struct vm_area_struct *vma) +static inline void vma_lock_init(struct vm_area_struct *vma) { - vma->vm_lock = calloc(1, sizeof(struct vma_lock)); - - if (!vma->vm_lock) - return false; - - init_rwsem(&vma->vm_lock->lock); + init_rwsem(&vma->vm_lock.lock); vma->vm_lock_seq = UINT_MAX; - - return true; } static inline void vma_assert_write_locked(struct vm_area_struct *); @@ -439,6 +432,7 @@ static inline void vma_init(struct vm_area_struct *vma, struct mm_struct *mm) vma->vm_ops = &vma_dummy_vm_ops; INIT_LIST_HEAD(&vma->anon_vma_chain); vma_mark_detached(vma, false); + vma_lock_init(vma); } static inline struct vm_area_struct *vm_area_alloc(struct mm_struct *mm) @@ -449,10 +443,6 @@ static inline struct vm_area_struct *vm_area_alloc(struct mm_struct *mm) return NULL; vma_init(vma, mm); - if (!vma_lock_alloc(vma)) { - free(vma); - return NULL; - } return vma; } @@ -465,10 +455,7 @@ static inline struct vm_area_struct *vm_area_dup(struct vm_area_struct *orig) return NULL; memcpy(new, orig, sizeof(*new)); - if (!vma_lock_alloc(new)) { - free(new); - return NULL; - } + vma_lock_init(new); INIT_LIST_HEAD(&new->anon_vma_chain); return new; @@ -638,14 +625,8 @@ static inline void mpol_put(struct mempolicy *) { } -static inline void vma_lock_free(struct vm_area_struct *vma) -{ - free(vma->vm_lock); -} - static inline void __vm_area_free(struct vm_area_struct *vma) { - vma_lock_free(vma); free(vma); }