
[v4,2/5] mm: move per-vma lock into vm_area_struct

Message ID 20241120000826.335387-3-surenb@google.com (mailing list archive)
State New
Series move per-vma lock into vm_area_struct

Commit Message

Suren Baghdasaryan Nov. 20, 2024, 12:08 a.m. UTC
Back when per-vma locks were introduced, vm_lock was moved out of
vm_area_struct in [1] because of the performance regression caused by
false cacheline sharing. Recent investigation [2] revealed that the
regression is limited to a rather old Broadwell microarchitecture and
even there it can be mitigated by disabling adjacent cacheline
prefetching, see [3].
Splitting a single logical structure into multiple ones leads to more
complicated management, extra pointer dereferences and overall less
maintainable code. When that split-away part is a lock, it complicates
things even further. With no performance benefit, there is no reason
for this split. Merging the vm_lock back into vm_area_struct also allows
vm_area_struct to use SLAB_TYPESAFE_BY_RCU later in this patchset.
Move vm_lock back into vm_area_struct, aligning it at the cacheline
boundary and changing the slab cache to be cacheline-aligned as well.
With the kernel compiled using defconfig, this causes VMA memory
consumption to grow from 160 (vm_area_struct) + 40 (vm_lock) bytes to
256 bytes:

    slabinfo before:
     <name>           ... <objsize> <objperslab> <pagesperslab> : ...
     vma_lock         ...     40  102    1 : ...
     vm_area_struct   ...    160   51    2 : ...

    slabinfo after moving vm_lock:
     <name>           ... <objsize> <objperslab> <pagesperslab> : ...
     vm_area_struct   ...    256   32    2 : ...

Aggregate VMA memory consumption per 1000 VMAs grows from 50 to 64 pages,
which is 5.5MB per 100000 VMAs. Note that the size of this structure
depends on the kernel configuration and typically the original size is
higher than 160 bytes, so these calculations are close to the
worst-case scenario. A more realistic vm_area_struct usage before this
change is:

     <name>           ... <objsize> <objperslab> <pagesperslab> : ...
     vma_lock         ...     40  102    1 : ...
     vm_area_struct   ...    176   46    2 : ...

Aggregate VMA memory consumption per 1000 VMAs grows from 54 to 64 pages,
which is 3.9MB per 100000 VMAs.
This memory consumption growth can be addressed later by optimizing the
vm_lock.
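
As a rough cross-check of the page counts above, assuming 4KB pages and
the slab geometry from the slabinfo output (exact figures depend on the
configuration):

    before: 1000 VMAs  / 51 objs per slab  ~= 20 slabs * 2 pages = 40 pages
            1000 locks / 102 objs per slab ~= 10 slabs * 1 page  = 10 pages
                                                                   50 pages
    after:  1000 VMAs  / 32 objs per slab  ~= 32 slabs * 2 pages = 64 pages
    delta:  14 pages per 1000 VMAs * 100 * 4KB ~= 5.5MB per 100000 VMAs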

[1] https://lore.kernel.org/all/20230227173632.3292573-34-surenb@google.com/
[2] https://lore.kernel.org/all/ZsQyI%2F087V34JoIt@xsang-OptiPlex-9020/
[3] https://lore.kernel.org/all/CAJuCfpEisU8Lfe96AYJDZ+OM4NoPmnw9bP53cT_kbfP_pR+-2g@mail.gmail.com/

Signed-off-by: Suren Baghdasaryan <surenb@google.com>
Reviewed-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
---
 include/linux/mm.h               | 28 ++++++++++--------
 include/linux/mm_types.h         |  6 ++--
 kernel/fork.c                    | 49 ++++----------------------------
 tools/testing/vma/vma_internal.h | 33 +++++----------------
 4 files changed, 32 insertions(+), 84 deletions(-)

Comments

Shakeel Butt Nov. 20, 2024, 11:32 p.m. UTC | #1
On Tue, Nov 19, 2024 at 04:08:23PM -0800, Suren Baghdasaryan wrote:
> [...]

Reviewed-by: Shakeel Butt <shakeel.butt@linux.dev>

One question below.

> --- a/include/linux/mm_types.h
> +++ b/include/linux/mm_types.h
> @@ -716,8 +716,6 @@ struct vm_area_struct {
>  	 * slowpath.
>  	 */
>  	unsigned int vm_lock_seq;
> -	/* Unstable RCU readers are allowed to read this. */
> -	struct vma_lock *vm_lock;
>  #endif
>  
>  	/*
> @@ -770,6 +768,10 @@ struct vm_area_struct {
>  	struct vma_numab_state *numab_state;	/* NUMA Balancing state */
>  #endif
>  	struct vm_userfaultfd_ctx vm_userfaultfd_ctx;
> +#ifdef CONFIG_PER_VMA_LOCK
> +	/* Unstable RCU readers are allowed to read this. */
> +	struct vma_lock vm_lock ____cacheline_aligned_in_smp;
> +#endif
>  } __randomize_layout;

Do we just want 'struct vm_area_struct' to be cacheline aligned or do we
want 'struct vma_lock vm_lock' to be on a separate cacheline as well?
Suren Baghdasaryan Nov. 20, 2024, 11:44 p.m. UTC | #2
On Wed, Nov 20, 2024 at 3:33 PM Shakeel Butt <shakeel.butt@linux.dev> wrote:
>
> On Tue, Nov 19, 2024 at 04:08:23PM -0800, Suren Baghdasaryan wrote:
> > [...]
>
> Reviewed-by: Shakeel Butt <shakeel.butt@linux.dev>

Thanks!

>
>
> One question below.
>
> > --- a/include/linux/mm_types.h
> > +++ b/include/linux/mm_types.h
> > @@ -716,8 +716,6 @@ struct vm_area_struct {
> >        * slowpath.
> >        */
> >       unsigned int vm_lock_seq;
> > -     /* Unstable RCU readers are allowed to read this. */
> > -     struct vma_lock *vm_lock;
> >  #endif
> >
> >       /*
> > @@ -770,6 +768,10 @@ struct vm_area_struct {
> >       struct vma_numab_state *numab_state;    /* NUMA Balancing state */
> >  #endif
> >       struct vm_userfaultfd_ctx vm_userfaultfd_ctx;
> > +#ifdef CONFIG_PER_VMA_LOCK
> > +     /* Unstable RCU readers are allowed to read this. */
> > +     struct vma_lock vm_lock ____cacheline_aligned_in_smp;
> > +#endif
> >  } __randomize_layout;
>
> Do we just want 'struct vm_area_struct' to be cacheline aligned or do we
> want 'struct vma_lock vm_lock' to be on a separate cacheline as well?

We want both to minimize cacheline sharing.

>
Shakeel Butt Nov. 21, 2024, 12:04 a.m. UTC | #3
On Wed, Nov 20, 2024 at 03:44:29PM -0800, Suren Baghdasaryan wrote:
> On Wed, Nov 20, 2024 at 3:33 PM Shakeel Butt <shakeel.butt@linux.dev> wrote:
> >
> > On Tue, Nov 19, 2024 at 04:08:23PM -0800, Suren Baghdasaryan wrote:
> > > [...]
> >
> > > --- a/include/linux/mm_types.h
> > > +++ b/include/linux/mm_types.h
> > > @@ -716,8 +716,6 @@ struct vm_area_struct {
> > >        * slowpath.
> > >        */
> > >       unsigned int vm_lock_seq;
> > > -     /* Unstable RCU readers are allowed to read this. */
> > > -     struct vma_lock *vm_lock;
> > >  #endif
> > >
> > >       /*
> > > @@ -770,6 +768,10 @@ struct vm_area_struct {
> > >       struct vma_numab_state *numab_state;    /* NUMA Balancing state */
> > >  #endif
> > >       struct vm_userfaultfd_ctx vm_userfaultfd_ctx;
> > > +#ifdef CONFIG_PER_VMA_LOCK
> > > +     /* Unstable RCU readers are allowed to read this. */
> > > +     struct vma_lock vm_lock ____cacheline_aligned_in_smp;
> > > +#endif
> > >  } __randomize_layout;
> >
> > Do we just want 'struct vm_area_struct' to be cacheline aligned or do we
> > want 'struct vma_lock vm_lock' to be on a separate cacheline as well?
> 
> We want both to minimize cacheline sharing.
> 

For later, you will need to add a pad after vm_lock as well, so any
future addition will not share the cacheline with vm_lock. I would do
something like below. This is a nit and can be done later.

diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 7654c766cbe2..5cc4fff163a0 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -751,10 +751,12 @@ struct vm_area_struct {
 #endif
 	struct vm_userfaultfd_ctx vm_userfaultfd_ctx;
 #ifdef CONFIG_PER_VMA_LOCK
+	CACHELINE_PADDING(__pad1__);
 	/* Unstable RCU readers are allowed to read this. */
-	struct vma_lock vm_lock ____cacheline_aligned_in_smp;
+	struct vma_lock vm_lock;
+	CACHELINE_PADDING(__pad2__);
 #endif
-} __randomize_layout;
+} __randomize_layout ____cacheline_aligned_in_smp;
 
 #ifdef CONFIG_NUMA
 #define vma_policy(vma) ((vma)->vm_policy)
Suren Baghdasaryan Nov. 21, 2024, 12:33 a.m. UTC | #4
On Wed, Nov 20, 2024 at 4:05 PM Shakeel Butt <shakeel.butt@linux.dev> wrote:
>
> On Wed, Nov 20, 2024 at 03:44:29PM -0800, Suren Baghdasaryan wrote:
> > On Wed, Nov 20, 2024 at 3:33 PM Shakeel Butt <shakeel.butt@linux.dev> wrote:
> > >
> > > On Tue, Nov 19, 2024 at 04:08:23PM -0800, Suren Baghdasaryan wrote:
> > > > [...]
> > >
> > > Do we just want 'struct vm_area_struct' to be cacheline aligned or do we
> > > want 'struct vma_lock vm_lock' to be on a separate cacheline as well?
> >
> > We want both to minimize cacheline sharing.
> >
>
> For later, you will need to add a pad after vm_lock as well, so any
> future addition will not share the cacheline with vm_lock. I would do
> something like below. This is a nit and can be done later.
>
> diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
> index 7654c766cbe2..5cc4fff163a0 100644
> --- a/include/linux/mm_types.h
> +++ b/include/linux/mm_types.h
> @@ -751,10 +751,12 @@ struct vm_area_struct {
>  #endif
>         struct vm_userfaultfd_ctx vm_userfaultfd_ctx;
>  #ifdef CONFIG_PER_VMA_LOCK
> +       CACHELINE_PADDING(__pad1__);
>         /* Unstable RCU readers are allowed to read this. */
> -       struct vma_lock vm_lock ____cacheline_aligned_in_smp;
> +       struct vma_lock vm_lock;
> +       CACHELINE_PADDING(__pad2__);
>  #endif
> -} __randomize_layout;
> +} __randomize_layout ____cacheline_aligned_in_smp;

I thought SLAB_HWCACHE_ALIGN for vm_area_cachep added in this patch
would have the same result, no?

>
>  #ifdef CONFIG_NUMA
>  #define vma_policy(vma) ((vma)->vm_policy)
Shakeel Butt Nov. 21, 2024, 7:01 a.m. UTC | #5
On Wed, Nov 20, 2024 at 04:33:37PM -0800, Suren Baghdasaryan wrote:
[...]
> > > >
> > > > Do we just want 'struct vm_area_struct' to be cacheline aligned or do we
> > > > want 'struct vma_lock vm_lock' to be on a separate cacheline as well?
> > >
> > > We want both to minimize cacheline sharing.
> > >
> >
> > For later, you will need to add a pad after vm_lock as well, so any
> > future addition will not share the cacheline with vm_lock. I would do
> > something like below. This is a nit and can be done later.
> >
> > diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
> > index 7654c766cbe2..5cc4fff163a0 100644
> > --- a/include/linux/mm_types.h
> > +++ b/include/linux/mm_types.h
> > @@ -751,10 +751,12 @@ struct vm_area_struct {
> >  #endif
> >         struct vm_userfaultfd_ctx vm_userfaultfd_ctx;
> >  #ifdef CONFIG_PER_VMA_LOCK
> > +       CACHELINE_PADDING(__pad1__);
> >         /* Unstable RCU readers are allowed to read this. */
> > -       struct vma_lock vm_lock ____cacheline_aligned_in_smp;
> > +       struct vma_lock vm_lock;
> > +       CACHELINE_PADDING(__pad2__);
> >  #endif
> > -} __randomize_layout;
> > +} __randomize_layout ____cacheline_aligned_in_smp;
> 
> I thought SLAB_HWCACHE_ALIGN for vm_area_cachep added in this patch
> would have the same result, no?
> 

SLAB_HWCACHE_ALIGN is more about the slub allocator handing out
cacheline-aligned memory; it does not say anything about the internals
of the struct for which the kmem_cache is being created. The
____cacheline_aligned_in_smp tag in your patch makes sure that the
field vm_lock is placed on a new cacheline, and there can be a hole
between vm_lock and the previous field if the previous field does not
end at the cacheline boundary. Note that if you add a new field after
vm_lock (without a cacheline alignment tag), it will land on the same
cacheline as vm_lock. So your code achieves the goal of putting
vm_lock on its own cacheline, but not the goal of vm_lock being the
only field on that cacheline.
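
To illustrate the distinction with a minimal userspace sketch (not
kernel code; the struct and field names here are made up and a 64-byte
cacheline is assumed): an aligned member starts on its own cacheline,
but a field added right after it still lands on that cacheline unless
explicit padding follows, while allocation-side alignment such as
SLAB_HWCACHE_ALIGN only affects where the object itself starts.

    #include <stdio.h>
    #include <stddef.h>

    #define CACHELINE 64                  /* assumed cacheline size */

    struct lock_like { long word; };      /* stand-in for struct vma_lock */

    struct vma_like {
            long a, b, c;                 /* earlier, hot fields */
            /* new cacheline, like ____cacheline_aligned_in_smp */
            struct lock_like lock __attribute__((aligned(CACHELINE)));
            int added_later;              /* still shares the lock's line */
    };

    int main(void)
    {
            printf("lock offset:  %zu\n", offsetof(struct vma_like, lock));
            printf("later offset: %zu\n", offsetof(struct vma_like, added_later));
            printf("struct size:  %zu\n", sizeof(struct vma_like));
            return 0;
    }

On a typical x86-64 build this prints 64, 72 and 128: the lock starts
on its own cacheline, yet added_later shares it, which is why the
CACHELINE_PADDING(__pad2__) suggestion above would be needed to keep
the lock's cacheline to itself.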
Suren Baghdasaryan Nov. 21, 2024, 5:05 p.m. UTC | #6
On Wed, Nov 20, 2024 at 11:02 PM Shakeel Butt <shakeel.butt@linux.dev> wrote:
>
> On Wed, Nov 20, 2024 at 04:33:37PM -0800, Suren Baghdasaryan wrote:
> [...]
> >
> > I thought SLAB_HWCACHE_ALIGN for vm_area_cachep added in this patch
> > would have the same result, no?
> >
>
> SLAB_HWCACHE_ALIGN is more about the slub allocator handing out
> cacheline-aligned memory; it does not say anything about the internals
> of the struct for which the kmem_cache is being created. The
> ____cacheline_aligned_in_smp tag in your patch makes sure that the
> field vm_lock is placed on a new cacheline, and there can be a hole
> between vm_lock and the previous field if the previous field does not
> end at the cacheline boundary. Note that if you add a new field after
> vm_lock (without a cacheline alignment tag), it will land on the same
> cacheline as vm_lock. So your code achieves the goal of putting
> vm_lock on its own cacheline, but not the goal of vm_lock being the
> only field on that cacheline.

Sorry, I should have been more clear. It's ok if some fields which are
rarely accessed in the pagefault path are placed in the same cacheline
with vm_lock. In fact I've done that to pack them better in the
previous version of this patchset here:
https://lore.kernel.org/all/20241111205506.3404479-5-surenb@google.com/
(removed for now based on the feedback). So, vm_lock being the only
field on the cacheline is not my goal. After this patchset I'm
planning to try packing the members better and save some memory.

>
Shakeel Butt Nov. 21, 2024, 6:25 p.m. UTC | #7
On Thu, Nov 21, 2024 at 09:05:21AM -0800, Suren Baghdasaryan wrote:
[...]
> 
> Sorry, I should have been more clear. It's ok if some fields which are
> rarely accessed in the pagefault path are placed in the same cacheline
> with vm_lock. In fact I've done that to pack them better in the
> previous version of this patchset here:
> https://lore.kernel.org/all/20241111205506.3404479-5-surenb@google.com/
> (removed for now based on the feedback). So, vm_lock being the only
> field on the cacheline is not my goal. After this patchset I'm
> planning to try packing the members better and save some memory.
> 

Nah, my bad, somehow I thought you wanted vm_lock to be on a cacheline
alone.

Patch

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 1ba2e480ae63..737c003b0a1e 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -684,6 +684,12 @@  static inline void vma_numab_state_free(struct vm_area_struct *vma) {}
 #endif /* CONFIG_NUMA_BALANCING */
 
 #ifdef CONFIG_PER_VMA_LOCK
+static inline void vma_lock_init(struct vm_area_struct *vma)
+{
+	init_rwsem(&vma->vm_lock.lock);
+	vma->vm_lock_seq = UINT_MAX;
+}
+
 /*
  * Try to read-lock a vma. The function is allowed to occasionally yield false
  * locked result to avoid performance overhead, in which case we fall back to
@@ -701,7 +707,7 @@  static inline bool vma_start_read(struct vm_area_struct *vma)
 	if (READ_ONCE(vma->vm_lock_seq) == READ_ONCE(vma->vm_mm->mm_lock_seq.sequence))
 		return false;
 
-	if (unlikely(down_read_trylock(&vma->vm_lock->lock) == 0))
+	if (unlikely(down_read_trylock(&vma->vm_lock.lock) == 0))
 		return false;
 
 	/*
@@ -716,7 +722,7 @@  static inline bool vma_start_read(struct vm_area_struct *vma)
 	 * This pairs with RELEASE semantics in vma_end_write_all().
 	 */
 	if (unlikely(vma->vm_lock_seq == raw_read_seqcount(&vma->vm_mm->mm_lock_seq))) {
-		up_read(&vma->vm_lock->lock);
+		up_read(&vma->vm_lock.lock);
 		return false;
 	}
 	return true;
@@ -731,7 +737,7 @@  static inline bool vma_start_read(struct vm_area_struct *vma)
 static inline void vma_start_read_locked_nested(struct vm_area_struct *vma, int subclass)
 {
 	mmap_assert_locked(vma->vm_mm);
-	down_read_nested(&vma->vm_lock->lock, subclass);
+	down_read_nested(&vma->vm_lock.lock, subclass);
 }
 
 /*
@@ -743,13 +749,13 @@  static inline void vma_start_read_locked_nested(struct vm_area_struct *vma, int
 static inline void vma_start_read_locked(struct vm_area_struct *vma)
 {
 	mmap_assert_locked(vma->vm_mm);
-	down_read(&vma->vm_lock->lock);
+	down_read(&vma->vm_lock.lock);
 }
 
 static inline void vma_end_read(struct vm_area_struct *vma)
 {
 	rcu_read_lock(); /* keeps vma alive till the end of up_read */
-	up_read(&vma->vm_lock->lock);
+	up_read(&vma->vm_lock.lock);
 	rcu_read_unlock();
 }
 
@@ -778,7 +784,7 @@  static inline void vma_start_write(struct vm_area_struct *vma)
 	if (__is_vma_write_locked(vma, &mm_lock_seq))
 		return;
 
-	down_write(&vma->vm_lock->lock);
+	down_write(&vma->vm_lock.lock);
 	/*
 	 * We should use WRITE_ONCE() here because we can have concurrent reads
 	 * from the early lockless pessimistic check in vma_start_read().
@@ -786,7 +792,7 @@  static inline void vma_start_write(struct vm_area_struct *vma)
 	 * we should use WRITE_ONCE() for cleanliness and to keep KCSAN happy.
 	 */
 	WRITE_ONCE(vma->vm_lock_seq, mm_lock_seq);
-	up_write(&vma->vm_lock->lock);
+	up_write(&vma->vm_lock.lock);
 }
 
 static inline void vma_assert_write_locked(struct vm_area_struct *vma)
@@ -798,7 +804,7 @@  static inline void vma_assert_write_locked(struct vm_area_struct *vma)
 
 static inline void vma_assert_locked(struct vm_area_struct *vma)
 {
-	if (!rwsem_is_locked(&vma->vm_lock->lock))
+	if (!rwsem_is_locked(&vma->vm_lock.lock))
 		vma_assert_write_locked(vma);
 }
 
@@ -831,6 +837,7 @@  struct vm_area_struct *lock_vma_under_rcu(struct mm_struct *mm,
 
 #else /* CONFIG_PER_VMA_LOCK */
 
+static inline void vma_lock_init(struct vm_area_struct *vma) {}
 static inline bool vma_start_read(struct vm_area_struct *vma)
 		{ return false; }
 static inline void vma_end_read(struct vm_area_struct *vma) {}
@@ -865,10 +872,6 @@  static inline void assert_fault_locked(struct vm_fault *vmf)
 
 extern const struct vm_operations_struct vma_dummy_vm_ops;
 
-/*
- * WARNING: vma_init does not initialize vma->vm_lock.
- * Use vm_area_alloc()/vm_area_free() if vma needs locking.
- */
 static inline void vma_init(struct vm_area_struct *vma, struct mm_struct *mm)
 {
 	memset(vma, 0, sizeof(*vma));
@@ -877,6 +880,7 @@  static inline void vma_init(struct vm_area_struct *vma, struct mm_struct *mm)
 	INIT_LIST_HEAD(&vma->anon_vma_chain);
 	vma_mark_detached(vma, false);
 	vma_numab_state_init(vma);
+	vma_lock_init(vma);
 }
 
 /* Use when VMA is not part of the VMA tree and needs no locking */
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 80fef38d9d64..5c4bfdcfac72 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -716,8 +716,6 @@  struct vm_area_struct {
 	 * slowpath.
 	 */
 	unsigned int vm_lock_seq;
-	/* Unstable RCU readers are allowed to read this. */
-	struct vma_lock *vm_lock;
 #endif
 
 	/*
@@ -770,6 +768,10 @@  struct vm_area_struct {
 	struct vma_numab_state *numab_state;	/* NUMA Balancing state */
 #endif
 	struct vm_userfaultfd_ctx vm_userfaultfd_ctx;
+#ifdef CONFIG_PER_VMA_LOCK
+	/* Unstable RCU readers are allowed to read this. */
+	struct vma_lock vm_lock ____cacheline_aligned_in_smp;
+#endif
 } __randomize_layout;
 
 #ifdef CONFIG_NUMA
diff --git a/kernel/fork.c b/kernel/fork.c
index 0061cf2450ef..7823797e31d2 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -436,35 +436,6 @@  static struct kmem_cache *vm_area_cachep;
 /* SLAB cache for mm_struct structures (tsk->mm) */
 static struct kmem_cache *mm_cachep;
 
-#ifdef CONFIG_PER_VMA_LOCK
-
-/* SLAB cache for vm_area_struct.lock */
-static struct kmem_cache *vma_lock_cachep;
-
-static bool vma_lock_alloc(struct vm_area_struct *vma)
-{
-	vma->vm_lock = kmem_cache_alloc(vma_lock_cachep, GFP_KERNEL);
-	if (!vma->vm_lock)
-		return false;
-
-	init_rwsem(&vma->vm_lock->lock);
-	vma->vm_lock_seq = UINT_MAX;
-
-	return true;
-}
-
-static inline void vma_lock_free(struct vm_area_struct *vma)
-{
-	kmem_cache_free(vma_lock_cachep, vma->vm_lock);
-}
-
-#else /* CONFIG_PER_VMA_LOCK */
-
-static inline bool vma_lock_alloc(struct vm_area_struct *vma) { return true; }
-static inline void vma_lock_free(struct vm_area_struct *vma) {}
-
-#endif /* CONFIG_PER_VMA_LOCK */
-
 struct vm_area_struct *vm_area_alloc(struct mm_struct *mm)
 {
 	struct vm_area_struct *vma;
@@ -474,10 +445,6 @@  struct vm_area_struct *vm_area_alloc(struct mm_struct *mm)
 		return NULL;
 
 	vma_init(vma, mm);
-	if (!vma_lock_alloc(vma)) {
-		kmem_cache_free(vm_area_cachep, vma);
-		return NULL;
-	}
 
 	return vma;
 }
@@ -496,10 +463,7 @@  struct vm_area_struct *vm_area_dup(struct vm_area_struct *orig)
 	 * will be reinitialized.
 	 */
 	data_race(memcpy(new, orig, sizeof(*new)));
-	if (!vma_lock_alloc(new)) {
-		kmem_cache_free(vm_area_cachep, new);
-		return NULL;
-	}
+	vma_lock_init(new);
 	INIT_LIST_HEAD(&new->anon_vma_chain);
 	vma_numab_state_init(new);
 	dup_anon_vma_name(orig, new);
@@ -511,7 +475,6 @@  void __vm_area_free(struct vm_area_struct *vma)
 {
 	vma_numab_state_free(vma);
 	free_anon_vma_name(vma);
-	vma_lock_free(vma);
 	kmem_cache_free(vm_area_cachep, vma);
 }
 
@@ -522,7 +485,7 @@  static void vm_area_free_rcu_cb(struct rcu_head *head)
 						  vm_rcu);
 
 	/* The vma should not be locked while being destroyed. */
-	VM_BUG_ON_VMA(rwsem_is_locked(&vma->vm_lock->lock), vma);
+	VM_BUG_ON_VMA(rwsem_is_locked(&vma->vm_lock.lock), vma);
 	__vm_area_free(vma);
 }
 #endif
@@ -3168,11 +3131,9 @@  void __init proc_caches_init(void)
 			sizeof(struct fs_struct), 0,
 			SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_ACCOUNT,
 			NULL);
-
-	vm_area_cachep = KMEM_CACHE(vm_area_struct, SLAB_PANIC|SLAB_ACCOUNT);
-#ifdef CONFIG_PER_VMA_LOCK
-	vma_lock_cachep = KMEM_CACHE(vma_lock, SLAB_PANIC|SLAB_ACCOUNT);
-#endif
+	vm_area_cachep = KMEM_CACHE(vm_area_struct,
+			SLAB_HWCACHE_ALIGN|SLAB_NO_MERGE|SLAB_PANIC|
+			SLAB_ACCOUNT);
 	mmap_init();
 	nsproxy_cache_init();
 }
diff --git a/tools/testing/vma/vma_internal.h b/tools/testing/vma/vma_internal.h
index 1d9fc97b8e80..11c2c38ca4e8 100644
--- a/tools/testing/vma/vma_internal.h
+++ b/tools/testing/vma/vma_internal.h
@@ -230,10 +230,10 @@  struct vm_area_struct {
 	/*
 	 * Can only be written (using WRITE_ONCE()) while holding both:
 	 *  - mmap_lock (in write mode)
-	 *  - vm_lock->lock (in write mode)
+	 *  - vm_lock.lock (in write mode)
 	 * Can be read reliably while holding one of:
 	 *  - mmap_lock (in read or write mode)
-	 *  - vm_lock->lock (in read or write mode)
+	 *  - vm_lock.lock (in read or write mode)
 	 * Can be read unreliably (using READ_ONCE()) for pessimistic bailout
 	 * while holding nothing (except RCU to keep the VMA struct allocated).
 	 *
@@ -242,7 +242,7 @@  struct vm_area_struct {
 	 * slowpath.
 	 */
 	unsigned int vm_lock_seq;
-	struct vma_lock *vm_lock;
+	struct vma_lock vm_lock;
 #endif
 
 	/*
@@ -408,17 +408,10 @@  static inline struct vm_area_struct *vma_next(struct vma_iterator *vmi)
 	return mas_find(&vmi->mas, ULONG_MAX);
 }
 
-static inline bool vma_lock_alloc(struct vm_area_struct *vma)
+static inline void vma_lock_init(struct vm_area_struct *vma)
 {
-	vma->vm_lock = calloc(1, sizeof(struct vma_lock));
-
-	if (!vma->vm_lock)
-		return false;
-
-	init_rwsem(&vma->vm_lock->lock);
+	init_rwsem(&vma->vm_lock.lock);
 	vma->vm_lock_seq = UINT_MAX;
-
-	return true;
 }
 
 static inline void vma_assert_write_locked(struct vm_area_struct *);
@@ -439,6 +432,7 @@  static inline void vma_init(struct vm_area_struct *vma, struct mm_struct *mm)
 	vma->vm_ops = &vma_dummy_vm_ops;
 	INIT_LIST_HEAD(&vma->anon_vma_chain);
 	vma_mark_detached(vma, false);
+	vma_lock_init(vma);
 }
 
 static inline struct vm_area_struct *vm_area_alloc(struct mm_struct *mm)
@@ -449,10 +443,6 @@  static inline struct vm_area_struct *vm_area_alloc(struct mm_struct *mm)
 		return NULL;
 
 	vma_init(vma, mm);
-	if (!vma_lock_alloc(vma)) {
-		free(vma);
-		return NULL;
-	}
 
 	return vma;
 }
@@ -465,10 +455,7 @@  static inline struct vm_area_struct *vm_area_dup(struct vm_area_struct *orig)
 		return NULL;
 
 	memcpy(new, orig, sizeof(*new));
-	if (!vma_lock_alloc(new)) {
-		free(new);
-		return NULL;
-	}
+	vma_lock_init(new);
 	INIT_LIST_HEAD(&new->anon_vma_chain);
 
 	return new;
@@ -638,14 +625,8 @@  static inline void mpol_put(struct mempolicy *)
 {
 }
 
-static inline void vma_lock_free(struct vm_area_struct *vma)
-{
-	free(vma->vm_lock);
-}
-
 static inline void __vm_area_free(struct vm_area_struct *vma)
 {
-	vma_lock_free(vma);
 	free(vma);
 }