diff mbox series

[v2,2/2] treewide: Add the __GFP_PACKED flag to several non-DMA kmalloc() allocations

Message ID 20221025205247.3264568-3-catalin.marinas@arm.com (mailing list archive)
State New, archived
Headers show
Series mm: Allow kmalloc() allocations below ARCH_KMALLOC_MINALIGN | expand

Commit Message

Catalin Marinas Oct. 25, 2022, 8:52 p.m. UTC
Using the ftrace kmalloc histogram shows several hot spots for small
memory allocations that would benefit from the smaller
KMALLOC_PACKED_ALIGN alignment.

To set up the ftrace events for small kmalloc() allocations (only
showing those not already having the __GFP_PACKED flag):

  echo 'hist:key=call_site.sym-offset:vals=bytes_req,bytes_alloc:sort=bytes_alloc.descending if bytes_req<=96 && !(gfp_flags&0x8000000)' \
	> /sys/kernel/debug/tracing/events/kmem/kmalloc/trigger

To enable tracing of boot-time allocations, use the following
bootconfig:

  ftrace.event.kmem.kmalloc.hist {
  	keys = call_site.sym-offset
  	values = bytes_req, bytes_alloc
  	sort = bytes_alloc.descending
  	filter = "bytes_req <= 96 && !(gfp_flags & 0x8000000)"
  }

To view the allocation hot spots:

  head /sys/kernel/debug/tracing/events/kmem/kmalloc/hist

Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
---
 drivers/usb/core/message.c           | 3 ++-
 fs/binfmt_elf.c                      | 6 ++++--
 fs/dcache.c                          | 3 ++-
 fs/ext4/dir.c                        | 4 ++--
 fs/ext4/extents.c                    | 4 ++--
 fs/file.c                            | 2 +-
 fs/kernfs/file.c                     | 8 ++++----
 fs/nfs/dir.c                         | 7 ++++---
 fs/nfs/inode.c                       | 2 +-
 fs/nfs/nfs4state.c                   | 2 +-
 fs/nfs/write.c                       | 3 ++-
 fs/notify/inotify/inotify_fsnotify.c | 3 ++-
 fs/proc/self.c                       | 2 +-
 fs/seq_file.c                        | 5 +++--
 include/acpi/platform/aclinuxex.h    | 6 ++++--
 kernel/trace/ftrace.c                | 2 +-
 kernel/trace/tracing_map.c           | 2 +-
 lib/kasprintf.c                      | 2 +-
 lib/kobject.c                        | 4 ++--
 lib/mpi/mpiutil.c                    | 2 +-
 mm/list_lru.c                        | 6 ++++--
 mm/memcontrol.c                      | 4 ++--
 mm/util.c                            | 6 +++---
 mm/vmalloc.c                         | 6 ++++--
 net/sunrpc/auth_unix.c               | 2 +-
 net/sunrpc/xdr.c                     | 2 +-
 security/apparmor/lsm.c              | 2 +-
 security/security.c                  | 4 ++--
 security/tomoyo/realpath.c           | 2 +-
 29 files changed, 60 insertions(+), 46 deletions(-)

Comments

Greg KH Oct. 26, 2022, 6:50 a.m. UTC | #1
On Tue, Oct 25, 2022 at 09:52:47PM +0100, Catalin Marinas wrote:
> Using the ftrace kmalloc histogram shows several hot spots for small
> memory allocations that would benefit from the smaller
> KMALLOC_PACKED_ALIGN alignment.
> 
> To set up the ftrace events for small kmalloc() allocations (only
> showing those not already having the __GFP_PACKED flag):
> 
>   echo 'hist:key=call_site.sym-offset:vals=bytes_req,bytes_alloc:sort=bytes_alloc.descending if bytes_req<=96 && !(gfp_flags&0x8000000)' \
> 	> /sys/kernel/debug/tracing/events/kmem/kmalloc/trigger
> 
> To enable tracing of boot-time allocations, use the following
> bootconfig:
> 
>   ftrace.event.kmem.kmalloc.hist {
>   	keys = call_site.sym-offset
>   	values = bytes_req, bytes_alloc
>   	sort = bytes_alloc.descending
>   	filter = "bytes_req <= 96 && !(gfp_flags & 0x8000000)"
>   }
> 
> To view the allocation hot spots:
> 
>   head /sys/kernel/debug/tracing/events/kmem/kmalloc/hist
> 
> Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
> ---
>  drivers/usb/core/message.c           | 3 ++-
>  fs/binfmt_elf.c                      | 6 ++++--
>  fs/dcache.c                          | 3 ++-
>  fs/ext4/dir.c                        | 4 ++--
>  fs/ext4/extents.c                    | 4 ++--
>  fs/file.c                            | 2 +-
>  fs/kernfs/file.c                     | 8 ++++----
>  fs/nfs/dir.c                         | 7 ++++---
>  fs/nfs/inode.c                       | 2 +-
>  fs/nfs/nfs4state.c                   | 2 +-
>  fs/nfs/write.c                       | 3 ++-
>  fs/notify/inotify/inotify_fsnotify.c | 3 ++-
>  fs/proc/self.c                       | 2 +-
>  fs/seq_file.c                        | 5 +++--
>  include/acpi/platform/aclinuxex.h    | 6 ++++--
>  kernel/trace/ftrace.c                | 2 +-
>  kernel/trace/tracing_map.c           | 2 +-
>  lib/kasprintf.c                      | 2 +-
>  lib/kobject.c                        | 4 ++--
>  lib/mpi/mpiutil.c                    | 2 +-
>  mm/list_lru.c                        | 6 ++++--
>  mm/memcontrol.c                      | 4 ++--
>  mm/util.c                            | 6 +++---
>  mm/vmalloc.c                         | 6 ++++--
>  net/sunrpc/auth_unix.c               | 2 +-
>  net/sunrpc/xdr.c                     | 2 +-
>  security/apparmor/lsm.c              | 2 +-
>  security/security.c                  | 4 ++--
>  security/tomoyo/realpath.c           | 2 +-
>  29 files changed, 60 insertions(+), 46 deletions(-)
> 
> diff --git a/drivers/usb/core/message.c b/drivers/usb/core/message.c
> index 4d59d927ae3e..bff8901dc426 100644
> --- a/drivers/usb/core/message.c
> +++ b/drivers/usb/core/message.c
> @@ -525,7 +525,8 @@ int usb_sg_init(struct usb_sg_request *io, struct usb_device *dev,
>  	}
>  
>  	/* initialize all the urbs we'll use */
> -	io->urbs = kmalloc_array(io->entries, sizeof(*io->urbs), mem_flags);
> +	io->urbs = kmalloc_array(io->entries, sizeof(*io->urbs),
> +				 mem_flags | __GFP_PACKED);

Hey look, you just did what I was worried about :(

Oh wait, no, this is just the urb structure, not the actual data pointer
sent on the urb.

Ick, this is going to get really hard to review over time.  I feel for
the need to want to start to pack things in smaller, but this is going
to be really really hard for maintainers to review submissions.
Especially if the only way problems show up is in random platforms where
the alignment causes a failure.

Anyway we can make all arches fail if we submit pointers that are not
aligned properly to the DMA engines?

>  	if (!io->urbs)
>  		goto nomem;
>  
> diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
> index 63c7ebb0da89..e5ad1a3244fb 100644
> --- a/fs/binfmt_elf.c
> +++ b/fs/binfmt_elf.c
> @@ -883,7 +883,8 @@ static int load_elf_binary(struct linux_binprm *bprm)
>  			goto out_free_ph;
>  
>  		retval = -ENOMEM;
> -		elf_interpreter = kmalloc(elf_ppnt->p_filesz, GFP_KERNEL);
> +		elf_interpreter = kmalloc(elf_ppnt->p_filesz,
> +					  GFP_KERNEL | __GFP_PACKED);

If this is going to now be sprinkled all over the tree, why not make it
a "real" flag, "GFP_PACKED"?  Or better yet, have it describe the
allocation better, "GFP_TINY" or
"GFP_I_KNOW_THIS_IS_TINY_WITH_NO_DMA_ISSUES"

> --- a/lib/kasprintf.c
> +++ b/lib/kasprintf.c
> @@ -22,7 +22,7 @@ char *kvasprintf(gfp_t gfp, const char *fmt, va_list ap)
>  	first = vsnprintf(NULL, 0, fmt, aq);
>  	va_end(aq);
>  
> -	p = kmalloc_track_caller(first+1, gfp);
> +	p = kmalloc_track_caller(first+1, gfp | __GFP_PACKED);

How do we know this is going to be small?

>  	if (!p)
>  		return NULL;
>  
> diff --git a/lib/kobject.c b/lib/kobject.c
> index a0b2dbfcfa23..2c4acb36925d 100644
> --- a/lib/kobject.c
> +++ b/lib/kobject.c
> @@ -144,7 +144,7 @@ char *kobject_get_path(struct kobject *kobj, gfp_t gfp_mask)
>  	len = get_kobj_path_length(kobj);
>  	if (len == 0)
>  		return NULL;
> -	path = kzalloc(len, gfp_mask);
> +	path = kzalloc(len, gfp_mask | __GFP_PACKED);

This might not be small, and it's going to be very very short-lived
(within a single function call), why does it need to be allocated this
way?

>  	if (!path)
>  		return NULL;
>  	fill_kobj_path(kobj, path, len);
> @@ -749,7 +749,7 @@ static struct kobject *kobject_create(void)
>  {
>  	struct kobject *kobj;
>  
> -	kobj = kzalloc(sizeof(*kobj), GFP_KERNEL);
> +	kobj = kzalloc(sizeof(*kobj), GFP_KERNEL | __GFP_PACKED);

This might make sense, but what type of size are you wanting to see
these packed allocations require?  This is not all that tiny, but I
guess it falls under the 96 bytes?  Where did that number come from?

thanks,

greg k-h
Catalin Marinas Oct. 26, 2022, 9:48 a.m. UTC | #2
On Wed, Oct 26, 2022 at 08:50:07AM +0200, Greg Kroah-Hartman wrote:
> On Tue, Oct 25, 2022 at 09:52:47PM +0100, Catalin Marinas wrote:
> > diff --git a/drivers/usb/core/message.c b/drivers/usb/core/message.c
> > index 4d59d927ae3e..bff8901dc426 100644
> > --- a/drivers/usb/core/message.c
> > +++ b/drivers/usb/core/message.c
> > @@ -525,7 +525,8 @@ int usb_sg_init(struct usb_sg_request *io, struct usb_device *dev,
> >  	}
> >  
> >  	/* initialize all the urbs we'll use */
> > -	io->urbs = kmalloc_array(io->entries, sizeof(*io->urbs), mem_flags);
> > +	io->urbs = kmalloc_array(io->entries, sizeof(*io->urbs),
> > +				 mem_flags | __GFP_PACKED);
> 
> Hey look, you just did what I was worried about :(
> 
> Oh wait, no, this is just the urb structure, not the actual data pointer
> sent on the urb.

Yeah, I agree it gets tricky to review such patches. My hope is that
driver writers won't start adding such flags everywhere, only where
there's a significant number of allocations. Better flag naming would
make this more obvious.

> Ick, this is going to get really hard to review over time.  I feel for
> the need to want to start to pack things in smaller, but this is going
> to be really really hard for maintainers to review submissions.
> Especially if the only way problems show up is in random platforms where
> the alignment causes a failure.
> 
> Anyway we can make all arches fail if we submit pointers that are not
> aligned properly to the DMA engines?

As I mentioned in a prior reply, we could add a warning comparing the
slab size with ARCH_DMA_MINALIGN (or cache_line_size(), though the
latter could trigger on fully-coherent architectures).

> > diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
> > index 63c7ebb0da89..e5ad1a3244fb 100644
> > --- a/fs/binfmt_elf.c
> > +++ b/fs/binfmt_elf.c
> > @@ -883,7 +883,8 @@ static int load_elf_binary(struct linux_binprm *bprm)
> >  			goto out_free_ph;
> >  
> >  		retval = -ENOMEM;
> > -		elf_interpreter = kmalloc(elf_ppnt->p_filesz, GFP_KERNEL);
> > +		elf_interpreter = kmalloc(elf_ppnt->p_filesz,
> > +					  GFP_KERNEL | __GFP_PACKED);
> 
> If this is going to now be sprinkled all over the tree, why not make it
> a "real" flag, "GFP_PACKED"?  Or better yet, have it describe the
> allocation better, "GFP_TINY" or
> "GFP_I_KNOW_THIS_IS_TINY_WITH_NO_DMA_ISSUES"

Linus originally suggested GFP_NODMA. We could go with that (and a
corresponding KMALLOC_NODMA_MINALIGN).

> > --- a/lib/kasprintf.c
> > +++ b/lib/kasprintf.c
> > @@ -22,7 +22,7 @@ char *kvasprintf(gfp_t gfp, const char *fmt, va_list ap)
> >  	first = vsnprintf(NULL, 0, fmt, aq);
> >  	va_end(aq);
> >  
> > -	p = kmalloc_track_caller(first+1, gfp);
> > +	p = kmalloc_track_caller(first+1, gfp | __GFP_PACKED);
> 
> How do we know this is going to be small?

We don't need to know it's small. If it's over 96 bytes on arm64, it
goes in the kmalloc-128 cache or higher. It can even use the kmalloc-192
cache that's not aligned to ARCH_DMA_MINALIGN (128). That's why I'd
avoid GFP_TINY as this flag is not about size but rather alignment (e.g.
192 may not be DMA safe but it's larger than 128).

That said, I should try to identify sizes > 128 and <= 192 and pass such
flag.

> > diff --git a/lib/kobject.c b/lib/kobject.c
> > index a0b2dbfcfa23..2c4acb36925d 100644
> > --- a/lib/kobject.c
> > +++ b/lib/kobject.c
> > @@ -144,7 +144,7 @@ char *kobject_get_path(struct kobject *kobj, gfp_t gfp_mask)
> >  	len = get_kobj_path_length(kobj);
> >  	if (len == 0)
> >  		return NULL;
> > -	path = kzalloc(len, gfp_mask);
> > +	path = kzalloc(len, gfp_mask | __GFP_PACKED);
> 
> This might not be small, and it's going to be very very short-lived
> (within a single function call), why does it need to be allocated this
> way?

Regarding short-lived objects, you are right, they won't affect
slabinfo. My ftrace-fu is not great, I only looked at the allocation
hits and they keep adding up without counting how many are
freed. So maybe we need to trace free() as well, but it's not always
easy to match frees against the allocation point and infer how many
live objects there are.

> > @@ -749,7 +749,7 @@ static struct kobject *kobject_create(void)
> >  {
> >  	struct kobject *kobj;
> >  
> > -	kobj = kzalloc(sizeof(*kobj), GFP_KERNEL);
> > +	kobj = kzalloc(sizeof(*kobj), GFP_KERNEL | __GFP_PACKED);
> 
> This might make sense, but what type of size are you wanting to see
> these packed allocations require?  This is not all that tiny, but I
> guess it falls under the 96 bytes?  Where did that number come from?

With ARCH_DMA_MINALIGN of 128, normally we'd only have kmalloc-192,
kmalloc-256 and higher caches. With this patch I'd like to enable
kmalloc-{8,16,32,64,96,192}. So 96 is the size below 128 that would go
into a smaller kmalloc cache (and I missed 192 in my ftrace histogram).
Greg KH Oct. 26, 2022, 12:59 p.m. UTC | #3
On Wed, Oct 26, 2022 at 10:48:58AM +0100, Catalin Marinas wrote:
> On Wed, Oct 26, 2022 at 08:50:07AM +0200, Greg Kroah-Hartman wrote:
> > On Tue, Oct 25, 2022 at 09:52:47PM +0100, Catalin Marinas wrote:
> > > diff --git a/drivers/usb/core/message.c b/drivers/usb/core/message.c
> > > index 4d59d927ae3e..bff8901dc426 100644
> > > --- a/drivers/usb/core/message.c
> > > +++ b/drivers/usb/core/message.c
> > > @@ -525,7 +525,8 @@ int usb_sg_init(struct usb_sg_request *io, struct usb_device *dev,
> > >  	}
> > >  
> > >  	/* initialize all the urbs we'll use */
> > > -	io->urbs = kmalloc_array(io->entries, sizeof(*io->urbs), mem_flags);
> > > +	io->urbs = kmalloc_array(io->entries, sizeof(*io->urbs),
> > > +				 mem_flags | __GFP_PACKED);
> > 
> > Hey look, you just did what I was worried about :(
> > 
> > Oh wait, no, this is just the urb structure, not the actual data pointer
> > sent on the urb.
> 
> Yeah, I agree it gets tricky to review such patches. My hope is that
> driver writers won't start adding such flags everywhere, only where
> there's a significant number of allocations. Better flag naming would
> make this more obvious.

You need to give both driver writers, and reviewers, hints as to what
you are expecting to see happen here.  Where should, and should we not,
use this flag?  I can't really tell here as I don't understand the goal
by just looking at the flag itself.

If I see "packed", of course I want to use packed, why would a driver
writer NOT want it?  But that name does not give any hint on when it
would NOT be good to use, which is a big flaw in the naming.

> > Ick, this is going to get really hard to review over time.  I feel for
> > the need to want to start to pack things in smaller, but this is going
> > to be really really hard for maintainers to review submissions.
> > Especially if the only way problems show up is in random platforms where
> > the alignment causes a failure.
> > 
> > Anyway we can make all arches fail if we submit pointers that are not
> > aligned properly to the DMA engines?
> 
> As I mentioned in a prior reply, we could add a warning comparing the
> slab size with ARCH_DMA_MINALIGN (or cache_line_size(), though the
> latter could trigger on fully-coherent architectures).

It should trigger on all arches in order to get proper coverage.  That's
the only way we have fixed these types of bugs up over time.

> > > diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
> > > index 63c7ebb0da89..e5ad1a3244fb 100644
> > > --- a/fs/binfmt_elf.c
> > > +++ b/fs/binfmt_elf.c
> > > @@ -883,7 +883,8 @@ static int load_elf_binary(struct linux_binprm *bprm)
> > >  			goto out_free_ph;
> > >  
> > >  		retval = -ENOMEM;
> > > -		elf_interpreter = kmalloc(elf_ppnt->p_filesz, GFP_KERNEL);
> > > +		elf_interpreter = kmalloc(elf_ppnt->p_filesz,
> > > +					  GFP_KERNEL | __GFP_PACKED);
> > 
> > If this is going to now be sprinkled all over the tree, why not make it
> > a "real" flag, "GFP_PACKED"?  Or better yet, have it describe the
> > allocation better, "GFP_TINY" or
> > "GFP_I_KNOW_THIS_IS_TINY_WITH_NO_DMA_ISSUES"
> 
> Linus originally suggested GFP_NODMA. We could go with that (and a
> corresponding KMALLOC_NODMA_MINALIGN).

NODMA is good, it gives me a better idea of when it would be able to be
used (i.e. not in a driver.)

> > > --- a/lib/kasprintf.c
> > > +++ b/lib/kasprintf.c
> > > @@ -22,7 +22,7 @@ char *kvasprintf(gfp_t gfp, const char *fmt, va_list ap)
> > >  	first = vsnprintf(NULL, 0, fmt, aq);
> > >  	va_end(aq);
> > >  
> > > -	p = kmalloc_track_caller(first+1, gfp);
> > > +	p = kmalloc_track_caller(first+1, gfp | __GFP_PACKED);
> > 
> > How do we know this is going to be small?
> 
> We don't need to know it's small. If it's over 96 bytes on arm64, it
> goes in the kmalloc-128 cache or higher. It can even use the kmalloc-192
> cache that's not aligned to ARCH_DMA_MINALIGN (128). That's why I'd
> avoid GFP_TINY as this flag is not about size but rather alignment (e.g.
> 192 may not be DMA safe but it's larger than 128).
> 
> That said, I should try to identify sizes > 128 and <= 192 and pass such
> flag.

What if the flag is used for large sizes, what will happen?  In other
words, why would you ever NOT want to use this?  DMA is a big issue, but
then we should flip that around and explicitly mark the times we want
DMA, not not-want DMA as "not want" is by far the most common, right?

In other words, I'm leaning hard on the "mark allocations that need DMA
memory as needing DMA memory" option.  Yes it will be more work
initially, but I bet a lot if not most, of them can be caught with
coccinelle scripts.  And then enforced with sparse to ensure they don't
break in the future.

That would provide a huge hint to the allocator as to what is needed,
while this "packed" really doesn't express the intent here at all.  Then
if your allocator/arch has explicit requirements for DMA memory, it can
enforce them and not just hope and guess that people mark things as "not
dma".

> > > diff --git a/lib/kobject.c b/lib/kobject.c
> > > index a0b2dbfcfa23..2c4acb36925d 100644
> > > --- a/lib/kobject.c
> > > +++ b/lib/kobject.c
> > > @@ -144,7 +144,7 @@ char *kobject_get_path(struct kobject *kobj, gfp_t gfp_mask)
> > >  	len = get_kobj_path_length(kobj);
> > >  	if (len == 0)
> > >  		return NULL;
> > > -	path = kzalloc(len, gfp_mask);
> > > +	path = kzalloc(len, gfp_mask | __GFP_PACKED);
> > 
> > This might not be small, and it's going to be very very short-lived
> > (within a single function call), why does it need to be allocated this
> > way?
> 
> Regarding short-lived objects, you are right, they won't affect
> slabinfo. My ftrace-fu is not great, I only looked at the allocation
> hits and they keep adding up without counting how many are
> freed. So maybe we need tracing free() as well but not always easy to
> match against the allocation point and infer how many live objects there
> are.

This code path at boot time is yes, called a lot, but the path is
created/destroyed instantly.  There should not ever be any allocation
here that lasts more than one function call.  So you might want to look
at structures that remain in the slabs after booting, not just initial
boot allocation calls.  If you were to want to try to instrument this in
this way.  But I think that's a losing game, let's do it right and mark
memory for DMA as such and keep track of it.

Bonus is that this will help tracking memory like this as potentially
"untrusted" if it comes from hardware, but that's a totally different
issue, and one that we don't need to worry about today, perhaps in a
year or so...

thanks,

greg k-h
Catalin Marinas Oct. 26, 2022, 5:09 p.m. UTC | #4
On Wed, Oct 26, 2022 at 02:59:04PM +0200, Greg Kroah-Hartman wrote:
> On Wed, Oct 26, 2022 at 10:48:58AM +0100, Catalin Marinas wrote:
> > On Wed, Oct 26, 2022 at 08:50:07AM +0200, Greg Kroah-Hartman wrote:
> > > On Tue, Oct 25, 2022 at 09:52:47PM +0100, Catalin Marinas wrote:
> > > > --- a/lib/kasprintf.c
> > > > +++ b/lib/kasprintf.c
> > > > @@ -22,7 +22,7 @@ char *kvasprintf(gfp_t gfp, const char *fmt, va_list ap)
> > > >  	first = vsnprintf(NULL, 0, fmt, aq);
> > > >  	va_end(aq);
> > > >  
> > > > -	p = kmalloc_track_caller(first+1, gfp);
> > > > +	p = kmalloc_track_caller(first+1, gfp | __GFP_PACKED);
> > > 
> > > How do we know this is going to be small?
> > 
> > We don't need to know it's small. If it's over 96 bytes on arm64, it
> > goes in the kmalloc-128 cache or higher. It can even use the kmalloc-192
> > cache that's not aligned to ARCH_DMA_MINALIGN (128). That's why I'd
> > avoid GFP_TINY as this flag is not about size but rather alignment (e.g.
> > 192 may not be DMA safe but it's larger than 128).
> > 
> > That said, I should try to identify sizes > 128 and <= 192 and pass such
> > flag.
> 
> What if the flag is used for large sizes, what will happen?  In other
> words, why would you ever NOT want to use this?  DMA is a big issue, but
> then we should flip that around and explicitly mark the times we want
> DMA, not not-want DMA as "not want" is by far the most common, right?

Indeed, flipping these flags is the ideal solution. It's just tracking
them down and I'm not sure coccinelle on its own can handle it (maybe it
does). As an example of what needs changing:

----------------------8<-------------------------
diff --git a/drivers/infiniband/hw/irdma/osdep.h b/drivers/infiniband/hw/irdma/osdep.h
index fc1ba2a3e6fb..8ba94d563db3 100644
--- a/drivers/infiniband/hw/irdma/osdep.h
+++ b/drivers/infiniband/hw/irdma/osdep.h
@@ -16,7 +16,7 @@ struct irdma_dma_info {
 };

 struct irdma_dma_mem {
-	void *va;
+	void __dma *va;
 	dma_addr_t pa;
 	u32 size;
 } __packed;
diff --git a/drivers/infiniband/hw/irdma/puda.c b/drivers/infiniband/hw/irdma/puda.c
index 4ec9639f1bdb..ab15c5e812d0 100644
--- a/drivers/infiniband/hw/irdma/puda.c
+++ b/drivers/infiniband/hw/irdma/puda.c
@@ -143,13 +143,13 @@ static struct irdma_puda_buf *irdma_puda_alloc_buf(struct irdma_sc_dev *dev,
 	struct irdma_virt_mem buf_mem;

 	buf_mem.size = sizeof(struct irdma_puda_buf);
-	buf_mem.va = kzalloc(buf_mem.size, GFP_KERNEL);
+	buf_mem.va = dma_kzalloc(buf_mem.size, GFP_KERNEL);
 	if (!buf_mem.va)
 		return NULL;

 	buf = buf_mem.va;
 	buf->mem.size = len;
-	buf->mem.va = kzalloc(buf->mem.size, GFP_KERNEL);
+	buf->mem.va = dma_kzalloc(buf->mem.size, GFP_KERNEL);
 	if (!buf->mem.va)
 		goto free_virt;
 	buf->mem.pa = dma_map_single(dev->hw->device, buf->mem.va,
diff --git a/include/linux/dma-mapping.h b/include/linux/dma-mapping.h
index 0ee20b764000..8476e6609f35 100644
--- a/include/linux/dma-mapping.h
+++ b/include/linux/dma-mapping.h
@@ -324,7 +324,7 @@ static inline void dma_free_noncoherent(struct device *dev, size_t size,
 	dma_free_pages(dev, size, virt_to_page(vaddr), dma_handle, dir);
 }

-static inline dma_addr_t dma_map_single_attrs(struct device *dev, void *ptr,
+static inline dma_addr_t dma_map_single_attrs(struct device *dev, void __dma *ptr,
 		size_t size, enum dma_data_direction dir, unsigned long attrs)
 {
 	/* DMA must never operate on areas that might be remapped. */
----------------------8<-------------------------

Basically any pointer type passed to dma_map_single() would need the
__dma attribute. Once that's done, the next step is changing the
allocator from kmalloc() to a new dma_kmalloc(). There are other places
where the pointer gets assigned the value of another pointer (e.g.
skb->data), so the origin pointer need to inherit the __dma attribute
(and its original allocator changed).

The scatterlist API may need changing slightly as it works on pages +
offsets.

> In other words, I'm leaning hard on the "mark allocations that need DMA
> memory as needing DMA memory" option.  Yes it will be more work
> initially, but I bet a lot if not most, of them can be caught with
> coccinelle scripts.  And then enforced with sparse to ensure they don't
> break in the future.

I'll read a bit more on this.

> That would provide a huge hint to the allocator as to what is needed,
> while this "packed" really doesn't express the intent here at all.  Then
> if your allocator/arch has explicit requirements for DMA memory, it can
> enforce them and not just hope and guess that people mark things as "not
> dma".

"Packed" is more about saying "I don't need the ARCH_KMALLOC_MINALIGN
alignment". That's why I went for it instead of GFP_NODMA (which usually
leads to your question of why not doing it the other way around).
Greg KH Oct. 26, 2022, 5:21 p.m. UTC | #5
On Wed, Oct 26, 2022 at 06:09:19PM +0100, Catalin Marinas wrote:
> On Wed, Oct 26, 2022 at 02:59:04PM +0200, Greg Kroah-Hartman wrote:
> > On Wed, Oct 26, 2022 at 10:48:58AM +0100, Catalin Marinas wrote:
> > > On Wed, Oct 26, 2022 at 08:50:07AM +0200, Greg Kroah-Hartman wrote:
> > > > On Tue, Oct 25, 2022 at 09:52:47PM +0100, Catalin Marinas wrote:
> > > > > --- a/lib/kasprintf.c
> > > > > +++ b/lib/kasprintf.c
> > > > > @@ -22,7 +22,7 @@ char *kvasprintf(gfp_t gfp, const char *fmt, va_list ap)
> > > > >  	first = vsnprintf(NULL, 0, fmt, aq);
> > > > >  	va_end(aq);
> > > > >  
> > > > > -	p = kmalloc_track_caller(first+1, gfp);
> > > > > +	p = kmalloc_track_caller(first+1, gfp | __GFP_PACKED);
> > > > 
> > > > How do we know this is going to be small?
> > > 
> > > We don't need to know it's small. If it's over 96 bytes on arm64, it
> > > goes in the kmalloc-128 cache or higher. It can even use the kmalloc-192
> > > cache that's not aligned to ARCH_DMA_MINALIGN (128). That's why I'd
> > > avoid GFP_TINY as this flag is not about size but rather alignment (e.g.
> > > 192 may not be DMA safe but it's larger than 128).
> > > 
> > > That said, I should try to identify sizes > 128 and <= 192 and pass such
> > > flag.
> > 
> > What if the flag is used for large sizes, what will happen?  In other
> > words, why would you ever NOT want to use this?  DMA is a big issue, but
> > then we should flip that around and explicitly mark the times we want
> > DMA, not not-want DMA as "not want" is by far the most common, right?
> 
> Indeed, flipping these flags is the ideal solution. It's just tracking
> them down and I'm not sure coccinelle on its own can handle it (maybe it
> does). As an example of what needs changing:
> 
> ----------------------8<-------------------------
> diff --git a/drivers/infiniband/hw/irdma/osdep.h b/drivers/infiniband/hw/irdma/osdep.h
> index fc1ba2a3e6fb..8ba94d563db3 100644
> --- a/drivers/infiniband/hw/irdma/osdep.h
> +++ b/drivers/infiniband/hw/irdma/osdep.h
> @@ -16,7 +16,7 @@ struct irdma_dma_info {
>  };
> 
>  struct irdma_dma_mem {
> -	void *va;
> +	void __dma *va;
>  	dma_addr_t pa;
>  	u32 size;
>  } __packed;
> diff --git a/drivers/infiniband/hw/irdma/puda.c b/drivers/infiniband/hw/irdma/puda.c
> index 4ec9639f1bdb..ab15c5e812d0 100644
> --- a/drivers/infiniband/hw/irdma/puda.c
> +++ b/drivers/infiniband/hw/irdma/puda.c
> @@ -143,13 +143,13 @@ static struct irdma_puda_buf *irdma_puda_alloc_buf(struct irdma_sc_dev *dev,
>  	struct irdma_virt_mem buf_mem;
> 
>  	buf_mem.size = sizeof(struct irdma_puda_buf);
> -	buf_mem.va = kzalloc(buf_mem.size, GFP_KERNEL);
> +	buf_mem.va = dma_kzalloc(buf_mem.size, GFP_KERNEL);
>  	if (!buf_mem.va)
>  		return NULL;
> 
>  	buf = buf_mem.va;
>  	buf->mem.size = len;
> -	buf->mem.va = kzalloc(buf->mem.size, GFP_KERNEL);
> +	buf->mem.va = dma_kzalloc(buf->mem.size, GFP_KERNEL);
>  	if (!buf->mem.va)
>  		goto free_virt;
>  	buf->mem.pa = dma_map_single(dev->hw->device, buf->mem.va,
> diff --git a/include/linux/dma-mapping.h b/include/linux/dma-mapping.h
> index 0ee20b764000..8476e6609f35 100644
> --- a/include/linux/dma-mapping.h
> +++ b/include/linux/dma-mapping.h
> @@ -324,7 +324,7 @@ static inline void dma_free_noncoherent(struct device *dev, size_t size,
>  	dma_free_pages(dev, size, virt_to_page(vaddr), dma_handle, dir);
>  }
> 
> -static inline dma_addr_t dma_map_single_attrs(struct device *dev, void *ptr,
> +static inline dma_addr_t dma_map_single_attrs(struct device *dev, void __dma *ptr,
>  		size_t size, enum dma_data_direction dir, unsigned long attrs)
>  {
>  	/* DMA must never operate on areas that might be remapped. */
> ----------------------8<-------------------------
> 
> Basically any pointer type passed to dma_map_single() would need the
> __dma attribute. Once that's done, the next step is changing the
> allocator from kmalloc() to a new dma_kmalloc(). There are other places
> where the pointer gets assigned the value of another pointer (e.g.
> skb->data), so the origin pointer need to inherit the __dma attribute
> (and its original allocator changed).
> 
> The scatterlist API may need changing slightly as it works on pages +
> offsets.

Those pages + offsets better be dma memory pointers too :)

But yes, this looks good, I'd prefer this.  If you want help doing all
of the USB drivers, I'll be glad to do so as that's a huge chunk of
this.

thanks,

greg k-h
Linus Torvalds Oct. 26, 2022, 5:46 p.m. UTC | #6
On Wed, Oct 26, 2022 at 2:49 AM Catalin Marinas <catalin.marinas@arm.com> wrote:
>
> Linus originally suggested GFP_NODMA. We could go with that (and a
> corresponding KMALLOC_NODMA_MINALIGN).

If we do a negative "mark this for smaller allocations because it
cannot do DMA" flag, then yes, I would still prefer GFP_NODMA to make
it explicit that it's about DMA.

But if we're marking allocations, I'd still prefer a "positive"
marking over a negative one, so it would still be much preferable to
just have GFP_DMA and various explicit DMA allocators.

One argument for doing it that way - aside from just the "let's use
positive markers, not negative ones" - is that then the pain of
discovery ends up being solidly on the broken cases, not on sane
architectures with sane platforms.

Seriously, non-cache coherent DMA in 2022 is a sign of an incompetent
platform architect or hardware designer, and at some point I think
that should just be called out for the incredible garbage it is.

It was always garbage, but at least it was excusable garbage back in
the days. And we have a long history of marking DMA allocations
specially, thanks to the original 16MB DMA limit of the ISA PC/AT
platform. That was garbage too, but hey, people acknowledged that, and
it got fixed.

I think we should just stop bending over backwards over this, and say
"if your DMA isn't coherent, it's on your driver to mark its
allocations".

Yes, yes, I realize that there are a billion devices. But not only is
the whole "a billion flies can't be wrong" still wrong, but most of
those billion devices often look remarkably similar, so it's not a
billion drivers. It's a few drivers, and a few driver subsystems, and
it sounds like Greg would prefer the "explicit dma allocations" at
least for USB.

And yes, there are also a few oddball people who love and stick to
their old broken hardware, in some kind of strange Stockholm syndrome
thing from fond memories of when they grew up with broken hardware and
they love the "quirks" of it.

That hardware may then be one of the one-off strange cases, but those
people with their masochistic tendencies can take the pain of "oh, now
I need to mark my broken driver with dma_alloc()". They'll probably
even enjoy it, the poor sods.

                     Linus
Catalin Marinas Oct. 27, 2022, 10:29 p.m. UTC | #7
On Wed, Oct 26, 2022 at 10:46:46AM -0700, Linus Torvalds wrote:
> I think we should just stop bending over backwards over this, and say
> "if your DMA isn't coherent, it's on your driver to mark its
> allocations".
[...]
> That hardware may then be one of the one-off strange cases, but those
> people with their masochistic tendencies can take the pain of "oh, now
> I need to mark my broken driver with dma_alloc()".

The driver is not necessarily broken. The same small kmalloc() in a USB
driver can work fine on a fully coherent platform but if that chip ends
up on a SoC that doesn't support coherent DMA, it needs bigger kmalloc()
alignment. The driver could check if it's coherent but that's more of an
arch detail that the driver shouldn't care about. If we define a new API
like dma_alloc() and drivers don't use it, that's when we can claim they
are broken.

A further optimisation would be for dma_alloc() to take a struct device
pointer and check dev_is_dma_coherent() before deciding to align the
size, though this doesn't work when the allocation place cannot tell the
destination device (e.g. alloc_skb(), though these buffers are
cacheline-aligned already).

Reading up on coccinelle to see if I can make this transition easier. If
not, I'll probably go back to bouncing.
Greg KH Oct. 28, 2022, 9:37 a.m. UTC | #8
On Thu, Oct 27, 2022 at 11:29:48PM +0100, Catalin Marinas wrote:
> On Wed, Oct 26, 2022 at 10:46:46AM -0700, Linus Torvalds wrote:
> > I think we should just stop bending over backwards over this, and say
> > "if your DMA isn't coherent, it's on your driver to mark its
> > allocations".
> [...]
> > That hardware may then be one of the one-off strange cases, but those
> > people with their masochistic tendencies can take the pain of "oh, now
> > I need to mark my broken driver with dma_alloc()".
> 
> The driver is not necessarily broken. The same small kmalloc() in a USB
> driver can work fine on a fully coherent platform but if that chip ends
> up on a SoC that doesn't support coherent DMA, it needs bigger kmalloc()
> alignment. The driver could check if it's coherent but that's more of an
> arch detail that the driver shouldn't care about. If we define a new API
> like dma_alloc() and drivers don't use it, that's when we can claim they
> are broken.
> 
> A further optimisation would be for dma_alloc() to take a struct device
> pointer and check dev_is_dma_coherent() before deciding to align the
> size, though this doesn't work when the allocation place cannot tell the
> destination device (e.g. alloc_skb(), though these buffers are
> cacheline-aligned already).
> 
> Reading up on coccinelle to see if I can make this transition easier. If
> not, I'll probably go back to bouncing.

bouncing?

sparse is your friend here, here's a tiny patch that if you apply and
then build the kernel with sparse will show up all the USB driver
changes that are needed.  (note, sample code only, does not fully work
yet as there are no .c changes made).

I suggest we add something like this now, work on fixing up all of the
drivers for 6.2-rc1, and then you can add the backend allocator changes
after that.  A few rounds of 'make allmodconfig' will show us the places
needing to be fixed up and 0-day will help out with that as well.

Yes it's a lot, but it gives us a fighting chance to do the right thing
going forward with regards to knowing what "type" of memory needs to be
allocated.

thanks,

greg k-h
Greg KH Oct. 28, 2022, 9:37 a.m. UTC | #9
On Fri, Oct 28, 2022 at 11:37:18AM +0200, Greg Kroah-Hartman wrote:
> On Thu, Oct 27, 2022 at 11:29:48PM +0100, Catalin Marinas wrote:
> > On Wed, Oct 26, 2022 at 10:46:46AM -0700, Linus Torvalds wrote:
> > > I think we should just stop bending over backwards over this, and say
> > > "if your DMA isn't coherent, it's on your driver to mark its
> > > allocations".
> > [...]
> > > That hardware may then be one of the one-off strange cases, but those
> > > people with their masochistic tendencies can take the pain of "oh, now
> > > I need to mark my broken driver with dma_alloc()".
> > 
> > The driver is not necessarily broken. The same small kmalloc() in a USB
> > driver can work fine on a fully coherent platform but if that chip ends
> > up on a SoC that doesn't support coherent DMA, it needs bigger kmalloc()
> > alignment. The driver could check if it's coherent but that's more of an
> > arch detail that the driver shouldn't care about. If we define a new API
> > like dma_alloc() and drivers don't use it, that's when we can claim they
> > are broken.
> > 
> > A further optimisation would be for dma_alloc() to take a struct device
> > pointer and check dev_is_dma_coherent() before deciding to align the
> > size, though this doesn't work when the allocation place cannot tell the
> > destination device (e.g. alloc_skb(), though these buffers are
> > cacheline-aligned already).
> > 
> > Reading up on coccinelle to see if I can make this transition easier. If
> > not, I'll probably go back to bouncing.
> 
> bouncing?
> 
> sparse is your friend here, here's a tiny patch that if you apply and
> then build the kernel with sparse will show up all the USB driver
> changes that are needed.  (note, sample code only, does not fully work
> yet as there are no .c changes made).
> 
> I suggest we add something like this now, work on fixing up all of the
> drivers for 6.2-rc1, and then you can add the backend allocator changes
> after that.  A few rounds of 'make allmodconfig' will show us the places
> needing to be fixed up and 0-day will help out with that as well.
> 
> Yes it's a lot, but it gives us a fighting chance to do the right thing
> going forward with regards to knowing what "type" of memory needs to be
> allocated.

And here's actually the patch...


diff --git a/include/linux/compiler_types.h b/include/linux/compiler_types.h
index eb0466236661..dbc8e013cdaf 100644
--- a/include/linux/compiler_types.h
+++ b/include/linux/compiler_types.h
@@ -23,6 +23,7 @@
 # define __iomem	__attribute__((noderef, address_space(__iomem)))
 # define __percpu	__attribute__((noderef, address_space(__percpu)))
 # define __rcu		__attribute__((noderef, address_space(__rcu)))
+# define __dma		__attribute__((noderef, address_space(__dma)))
 static inline void __chk_user_ptr(const volatile void __user *ptr) { }
 static inline void __chk_io_ptr(const volatile void __iomem *ptr) { }
 /* context/locking */
@@ -50,6 +51,7 @@ static inline void __chk_io_ptr(const volatile void __iomem *ptr) { }
 # define __iomem
 # define __percpu	BTF_TYPE_TAG(percpu)
 # define __rcu
+# define __dma
 # define __chk_user_ptr(x)	(void)0
 # define __chk_io_ptr(x)	(void)0
 /* context/locking */
diff --git a/include/linux/usb.h b/include/linux/usb.h
index 9ff1ad4dfad1..5f847c921802 100644
--- a/include/linux/usb.h
+++ b/include/linux/usb.h
@@ -1576,7 +1576,7 @@ struct urb {
 	unsigned int stream_id;		/* (in) stream ID */
 	int status;			/* (return) non-ISO status */
 	unsigned int transfer_flags;	/* (in) URB_SHORT_NOT_OK | ...*/
-	void *transfer_buffer;		/* (in) associated data buffer */
+	void __dma *transfer_buffer;	/* (in) associated data buffer */
 	dma_addr_t transfer_dma;	/* (in) dma addr for transfer_buffer */
 	struct scatterlist *sg;		/* (in) scatter gather buffer list */
 	int num_mapped_sgs;		/* (internal) mapped sg entries */
@@ -1616,7 +1616,7 @@ static inline void usb_fill_control_urb(struct urb *urb,
 					struct usb_device *dev,
 					unsigned int pipe,
 					unsigned char *setup_packet,
-					void *transfer_buffer,
+					void __dma *transfer_buffer,
 					int buffer_length,
 					usb_complete_t complete_fn,
 					void *context)
@@ -1646,7 +1646,7 @@ static inline void usb_fill_control_urb(struct urb *urb,
 static inline void usb_fill_bulk_urb(struct urb *urb,
 				     struct usb_device *dev,
 				     unsigned int pipe,
-				     void *transfer_buffer,
+				     void __dma *transfer_buffer,
 				     int buffer_length,
 				     usb_complete_t complete_fn,
 				     void *context)
@@ -1687,7 +1687,7 @@ static inline void usb_fill_bulk_urb(struct urb *urb,
 static inline void usb_fill_int_urb(struct urb *urb,
 				    struct usb_device *dev,
 				    unsigned int pipe,
-				    void *transfer_buffer,
+				    void __dma *transfer_buffer,
 				    int buffer_length,
 				    usb_complete_t complete_fn,
 				    void *context,
@@ -1766,10 +1766,10 @@ static inline int usb_urb_dir_out(struct urb *urb)
 int usb_pipe_type_check(struct usb_device *dev, unsigned int pipe);
 int usb_urb_ep_type_check(const struct urb *urb);
 
-void *usb_alloc_coherent(struct usb_device *dev, size_t size,
+void __dma *usb_alloc_coherent(struct usb_device *dev, size_t size,
 	gfp_t mem_flags, dma_addr_t *dma);
 void usb_free_coherent(struct usb_device *dev, size_t size,
-	void *addr, dma_addr_t dma);
+	void __dma *addr, dma_addr_t dma);
 
 #if 0
 struct urb *usb_buffer_map(struct urb *urb);
Christoph Hellwig Oct. 30, 2022, 8:44 a.m. UTC | #10
On Wed, Oct 26, 2022 at 10:46:46AM -0700, Linus Torvalds wrote:
> Seriously, non-cache coherent DMA in 2022 is a sign of an incompetent
> platform architect or hardware designer, and at some point I think
> that should just be called out for the incredible garbage it is.

It is garbage, but still incredibly common.  And there is a simple
reason for that:  it's cheap.

> I think we should just stop bending over backwards over this, and say
> "if your DMA isn't coherent, it's on your driver to mark its
> allocations".

Many of the allocations do not come from the driver.  They can be
page cache, anonymous user memory, 17 layers of kernel "subsystems"
above the actual driver.  And while the first two usually won't have
size / alignment problems, the latter is where the real mess is.
Christoph Hellwig Oct. 30, 2022, 8:46 a.m. UTC | #11
On Thu, Oct 27, 2022 at 11:29:48PM +0100, Catalin Marinas wrote:
> The driver is not necessarily broken. The same small kmalloc() in a USB
> driver can work fine on a fully coherent platform but if that chip ends
> up on a SoC that doesn't support coherent DMA, it needs bigger kmalloc()
> alignment. The driver could check if it's coherent but that's more of an
> arch detail that the driver shouldn't care about. If we define a new API
> like dma_alloc() and drivers don't use it, that's when we can claim they
> are broken.
> 
> A further optimisation would be for dma_alloc() to take a struct device
> pointer and check dev_is_dma_coherent() before deciding to align the
> size, though this doesn't work when the allocation place cannot tell the
> destination device (e.g. alloc_skb(), though these buffers are
> cacheline-aligned already).

The other thing would be to check the dma mask so that we don't end
up bounce buffering again.
Christoph Hellwig Oct. 30, 2022, 8:47 a.m. UTC | #12
On Fri, Oct 28, 2022 at 11:37:52AM +0200, Greg Kroah-Hartman wrote:
> And here's actually the patch...

How is this supposed to work?  noderef means any dereference of it will
now make sparse complain.  And the whole point of the DMA is to get
data into and out of the system, so except for corner cases like direct
DMA to userspace (which won't use kmalloc) we absolutely do want to
dereference it to generate or consume the information.
Greg KH Oct. 30, 2022, 9:02 a.m. UTC | #13
On Sun, Oct 30, 2022 at 09:47:18AM +0100, Christoph Hellwig wrote:
> On Fri, Oct 28, 2022 at 11:37:52AM +0200, Greg Kroah-Hartman wrote:
> > And here's actually the patch...
> 
> How is this supposed to work?  noderef means any dereference of it will
> now make sparse complain.  And the whole point of the DMA is to get
> data into and out of the system, so except for corner cases like direct
> DMA to userspace (which won't use kmalloc) we absolutely do want to
> dereference it to generate or consume the information.

Ah, my fault, sorry, you are right.  Is there a sparse tag that just
means "enforce this void * type?"  I guess we could do that with something
like:
	typedef void *dmaptr;

but that feels icky.
Christoph Hellwig Oct. 30, 2022, 9:13 a.m. UTC | #14
On Sun, Oct 30, 2022 at 10:02:53AM +0100, Greg Kroah-Hartman wrote:
> Ah, my fault, sorry, you are right.  Is there a sparse tag that just
> means "enforce this void * type?"  I guess we could do that with something
> like:
> 	typedef void *dmaptr;
> 
> but that feels icky.

That's because it is :)  I find the concept of the DMA pointers pretty
strange to be honest.  It only affects a subset of dma (when devices
are not attached in a DMA coherent way).  So count me in as someone
who would be much more happy about figuring out a way to simplify
bounce buffer for non-coherent DMA if the start and length are not
aligned to the cache line size over any kind of special annotation
all over the kernel.
Catalin Marinas Oct. 30, 2022, 4:43 p.m. UTC | #15
On Sun, Oct 30, 2022 at 10:13:49AM +0100, Christoph Hellwig wrote:
> On Sun, Oct 30, 2022 at 10:02:53AM +0100, Greg Kroah-Hartman wrote:
> > Ah, my fault, sorry, you are right.  Is there a sparse tag that just
> > means "enforce this void * type?"  I guess we could do that with something
> > like:
> > 	typedef void *dmaptr;
> > 
> > but that feels icky.
> 
> That's because it is :)  I find the concept of the DMA pointes pretty
> strange to be honest.

I've been trying to come up with what the semantics of __dma should be
but couldn't get to any clear conclusion. We can avoid the noderef part
as it doesn't make sense but it still implies a different address space.
Once I made the ptr in dma_map_single() a 'void __dma *', sparse
complained not just about the callers of this function but the callees
like is_vmalloc_addr(). So I have a suspicion we'd end up with __dma
annotations in lots of places unrelated to DMA or drivers. Then we have
structures like 'catc' with a 'ctrl_dr' that ends up in the DMA API (so
far as TO_DEVICE, so not that problematic).

The alternative is for dma_map_*() to (dynamically) verify at the point
of DMA setup rather than at the point of allocation (and bounce).

> It only affects a subset of dma (when devices are not attached in a
> DMA coherent way).  So count me in as someone who would be much more
> happy about figuring out a way to simplify bounce buffer for
> non-coherent DMA if the start and length are not aligned to the cache
> line size over any kind of special annotation all over the kernel.

That's definitely less daunting if we find the right solution. The
problem with checking the alignment of both start and length is that
there are valid cases where the start is aligned but the length isn't.
They don't need bouncing since they come from a sufficiently aligned
slab. We have a few options here:

a) Require that the dma_map_*() functions (including sg) get a size
   multiple of cache_line_size() or the static ARCH_DMA_MINALIGN.

b) Always bounce small sizes as they may have come from a small kmalloc
   cache. The alignment of both ptr and length is ignored. This assumes
   that in-struct alignments (crypto) use ARCH_DMA_MINALIGN and we can
   reduce ARCH_KMALLOC_MINALIGN independently.

c) Similar to (b) but in addition check whether the object comes from a
   small kmalloc cache, e.g. using ksize().

(a) has the advantage that it works by default even if drivers don't use
the correct alignment. We can optimise them afterwards. However, it
feels wrong to get the driver to align the length to a cache_line_size()
when it doesn't control the coherency of the bus (and always worked fine
on x86).

(b) is what I attempted on one of my branches (until I got stuck on
bouncing for iommu). A slight overhead on dma_map_*() to check the
length and we may keep a check on the start alignment (not length
though).

With (c) we can avoid some bouncing if the driver uses the right kmalloc
cache (maybe via a newly introduced dma_kmalloc()) but such a check would
add a bit of overhead (and I'm not sure it works with slob). The
advantage is that we can avoid a bounce buffer if the drivers in
question are adapted to use dma_kmalloc().
Christoph Hellwig Nov. 1, 2022, 10:59 a.m. UTC | #16
On Sun, Oct 30, 2022 at 04:43:21PM +0000, Catalin Marinas wrote:
> I've been trying to come up with what the semantics of __dma should be
> but couldn't get to any clear conclusion. We can avoid the noderef part
> as it doesn't make sense but it still implies a different address space.
> Once I made the ptr in dma_map_single() a 'void __dma *', sparse
> complained not just about the callers of this function but the callees
> like is_vmalloc_addr(). So I have a suspicion we'd end up with __dma
> annotations in lots of places unrelated to DMA or drivers. Then we have
> structures like 'catc' with a 'ctrl_dr' that ends up in the DMA API (so
> far as TO_DEVICE, so not that problematic).

Yes, it's all a bit of a mess.

> The alternative is for dma_map_*() to (dynamically) verify at the point
> of DMA setup rather than at the point of allocation (and bounce).

One issue with dma_map_* is that it can't report errors very well.
So I'd really prefer to avoid adding that kind of checking there.

> That's definitely less daunting if we find the right solution. The
> problem with checking the alignment of both start and length is that
> there are valid cases where the start is aligned but the length isn't.

Yes.
> They don't need bouncing since they come from a sufficiently aligned
> slab. We have a few options here:
> 
> a) Require that the dma_map_*() functions (including sg) get a size
>    multiple of cache_line_size() or the static ARCH_DMA_MINALIGN.

I don't think that is going to work, there are just too many instances
all over the tree that pass smaller sizes.

> b) Always bounce small sizes as they may have come from a small kmalloc
>    cache. The alignment of both ptr and length is ignored. This assumes
>    that in-struct alignments (crypto) use ARCH_DMA_MINALIGN and we can
>    reduce ARCH_KMALLOC_MINALIGN independently.

I think the start must be very much aligned.  So what is the problem
with just checking the size?

Otherwise I think this is the way to go.  These tiny maps tend to be
various aux setup path thing and not performance critical (and caring
about performance for not DMA coherent devices isn't something we do
all that much anyway).

> (b) is what I attempted on one of my branches (until I got stuck on
> bouncing for iommu). A slight overhead on dma_map_*() to check the
> length and we may keep a check on the start alignment (not length
> though).

When was that?  These days dma-iommu already has bouncing code for
the untrusted device case, so handling the bouncing there for unaligned
request on non-coherent devices shouldn't be all that horrible.
That leaves the non-dma_iommu non-coherent cases.  Arm should really
switch to dma-iommu, and Robin has been doing work on that which
has been going on for a while.  MIPS/jazz are ISA based boards from
the early 90s and have like 3 drivers that work on them and should
not be affected.  sparc/sun4u only does the to_cpu syncs and IIRC
those are just an optimization and not required for correctness.
Catalin Marinas Nov. 1, 2022, 5:19 p.m. UTC | #17
On Tue, Nov 01, 2022 at 11:59:19AM +0100, Christoph Hellwig wrote:
> On Sun, Oct 30, 2022 at 04:43:21PM +0000, Catalin Marinas wrote:
> > The alternative is for dma_map_*() to (dynamically) verify at the point
> > of DMA setup rather than at the point of allocation (and bounce).
> 
> One issue with dma_map_* is that it can't report errors very well.
> So I'd really prefer to avoid adding that kind of checking there.

By checking I meant bouncing on condition fail rather than reporting
errors.

> > We have a few options here:
> > 
> > a) Require that the dma_map_*() functions (including sg) get a size
> >    multiple of cache_line_size() or the static ARCH_DMA_MINALIGN.
> 
> I don't think that is going to work, there are just too many instances
> all over the tree that pass smaller sizes.
> 
> > b) Always bounce small sizes as they may have come from a small kmalloc
> >    cache. The alignment of both ptr and length is ignored. This assumes
> >    that in-struct alignments (crypto) use ARCH_DMA_MINALIGN and we can
> >    reduce ARCH_KMALLOC_MINALIGN independently.
> 
> I think the start must be very much aligned.  So what is the problem
> with just checking the size?
> 
> Otherwise I think this is the way to go.  These tiny maps tend to be
> various aux setup path thing and not performance critical (and caring
> about performance for not DMA coherent devices isn't something we do
> all that much anyway).

The check is a bit weird as it needs some awareness of the kmalloc
caches to avoid unnecessary bouncing. We can't just check for smaller
than 128 since 192 is another kmalloc cache that is not aligned to an
ARCH_DMA_MINALIGN of 128:

https://git.kernel.org/arm64/c/3508d0d4d5e458e43916bf4f61b29d2a6f15c2bb

I can probably make it a bit nicer by checking the alignment of
kmalloc_size_roundup(size) (this function doesn't seem to have any
callers).

The main downside of bouncing is mobile phones where those vendors are
in the habit of passing noswiotlb on the kernel command line or they
want a very small bounce buffer (hard to pick a universally small size).
I guess we can go ahead with this and, depending on how bad the problem
is, we can look at optimising swiotlb to use a kmem_cache (or aligned
kmalloc) as fall-back for bouncing.

> > (b) is what I attempted on one of my branches (until I got stuck on
> > bouncing for iommu). A slight overhead on dma_map_*() to check the
> > length and we may keep a check on the start alignment (not length
> > though).
> 
> When was that?  These days dma-iommu already has bouncing code for
> the untrusted device case, so handling the bouncing there for unaligned
> request on non-coherent devices shouldn't be all that horrible.

The bouncing currently is all or nothing with iommu_dma_map_sg(), unlike
dma_direct_map_sg() which ends up calling dma_direct_map_page() and we
can do the bouncing per element. So I was looking to untangle
iommu_dma_map_sg() in a similar way but postponed it as too complicated.

As a less than optimal solution, we can force bouncing for the whole
list if any of the sg elements is below the alignment size. Hopefully we
won't have many such mixed size cases.
Christoph Hellwig Nov. 1, 2022, 5:24 p.m. UTC | #18
On Tue, Nov 01, 2022 at 05:19:46PM +0000, Catalin Marinas wrote:
> 
> The main downside of bouncing is mobile phones where those vendors are
> in the habit of passing noswiotlb on the kernel command line or they
> want a very small bounce buffer (hard to pick a universally small size).
> I guess we can go ahead with this and, depending on how bad the problem
> is, we can look at optimising swiotlb to use a kmem_cache (or aligned
> kmalloc) as fall-back for bouncing.

These phone setups are completely broken already.  There is a reason
why it needs a debug option to disable swiotlb, and that reason is
that Linux guarantees that the 32-bit DMA always works.  If they force
swiotlb off they can keep the pieces as this is not a supported
configuration.

> As a less than optimal solution, we can force bouncing for the whole
> list if any of the sg elements is below the alignment size. Hopefully we
> won't have many such mixed size cases.

I suspect we won't see any such cases.  The scatterlist is usually used
for large data transfers, and many devices won't like unaligned buffers
for SG operations to start with.  So I think it is a perfectly fine
tradeoff.
Catalin Marinas Nov. 1, 2022, 5:32 p.m. UTC | #19
On Tue, Nov 01, 2022 at 06:24:16PM +0100, Christoph Hellwig wrote:
> On Tue, Nov 01, 2022 at 05:19:46PM +0000, Catalin Marinas wrote:
> > 
> > The main downside of bouncing is mobile phones where those vendors are
> > in the habit of passing noswiotlb on the kernel command line or they
> > want a very small bounce buffer (hard to pick a universally small size).
> > I guess we can go ahead with this and, depending on how bad the problem
> > is, we can look at optimising swiotlb to use a kmem_cache (or aligned
> > kmalloc) as fall-back for bouncing.
> 
> These phone setups are completely broken already.  There is a reason
> why it needs a debug option to disable swiotlb, and that reason is
> that Linux guarantees that the 32-bit DMA always works.  If they force
> swiotlb off they can keep the pieces as this is not a supported
> configuration.

There's also the case of low-end phones with all RAM below 4GB and arm64
doesn't allocate the swiotlb. Not sure those vendors would go with a
recent kernel anyway.

So the need for swiotlb now changes from 32-bit DMA to any DMA
(non-coherent but we can't tell upfront when booting, devices may be
initialised pretty late).

> > As a less than optimal solution, we can force bouncing for the whole
> > list if any of the sg elements is below the alignment size. Hopefully we
> > won't have many such mixed size cases.
> 
> I suspect we won't see any such cases.  The scatterlist is usually used
> for large data transfers, and many devices won't like unaligned buffers
> for SG operations to start with.  So I think it is a perfectly fine
> tradeoff.

I'll give it a try this week, hopefully post patches soon(ish).
Christoph Hellwig Nov. 1, 2022, 5:39 p.m. UTC | #20
On Tue, Nov 01, 2022 at 05:32:14PM +0000, Catalin Marinas wrote:
> There's also the case of low-end phones with all RAM below 4GB and arm64
> doesn't allocate the swiotlb. Not sure those vendors would go with a
> recent kernel anyway.
> 
> So the need for swiotlb now changes from 32-bit DMA to any DMA
> (non-coherent but we can't tell upfront when booting, devices may be
> initialised pretty late).

Yes.  The other option would be to use the dma coherent pool for the
bouncing, which must be present on non-coherent systems anyway.  But
it would require us to write a new set of bounce buffering routines.
Robin Murphy Nov. 1, 2022, 6:14 p.m. UTC | #21
On 2022-11-01 17:19, Catalin Marinas wrote:
> On Tue, Nov 01, 2022 at 11:59:19AM +0100, Christoph Hellwig wrote:
>> On Sun, Oct 30, 2022 at 04:43:21PM +0000, Catalin Marinas wrote:
>>> The alternative is for dma_map_*() to (dynamically) verify at the point
>>> of DMA setup rather than at the point of allocation (and bounce).
>>
>> One issue with dma_map_* is that is can't report errors very well.
>> So I'd really prefer to avoid adding that kind of checking there.
> 
> By checking I meant bouncing on condition fail rather than reporting
> errors.
> 
>>> We have a few options here:
>>>
>>> a) Require that the dma_map_*() functions (including sg) get a size
>>>     multiple of cache_line_size() or the static ARCH_DMA_MINALIGN.
>>
>> I don't think that is going to work, there are just too many instances
>> all over the tree that pass smaller sizes.
>>
>>> b) Always bounce small sizes as they may have come from a small kmalloc
>>>     cache. The alignment of both ptr and length is ignored. This assumes
>>>     that in-struct alignments (crypto) use ARCH_DMA_MINALIGN and we can
>>>     reduce ARCH_KMALLOC_MINALIGN independently.
>>
>> I think the start must be very much aligned.  So what is the problem
>> with just checking the size?
>>
>> Otherwise I think this is the way to go.  These tiny maps tend to be
>> various aux setup path thing and not performance critical (and caring
>> about performance for not DMA coherent devices isn't something we do
>> all that much anyway).
> 
> The check is a bit weird as it needs some awareness of the kmalloc
> caches to avoid unnecessary bouncing. We can't just check for smaller
> than 128 since 192 is another kmalloc cache that is not aligned to an
> ARCH_DMA_MINALIGN of 128:
> 
> https://git.kernel.org/arm64/c/3508d0d4d5e458e43916bf4f61b29d2a6f15c2bb
> 
> I can probably make it a bit nicer by checking the alignment of
> kmalloc_size_roundup(size) (this function doesn't seem to have any
> callers).
> 
> The main downside of bouncing is mobile phones where those vendors are
> in the habit of passing noswiotlb on the kernel command line or they
> want a very small bounce buffer (hard to pick a universally small size).
> I guess we can go ahead with this and, depending on how bad the problem
> is, we can look at optimising swiotlb to use a kmem_cache (or aligned
> kmalloc) as fall-back for bouncing.
> 
>>> (b) is what I attempted on one of my branches (until I got stuck on
>>> bouncing for iommu). A slight overhead on dma_map_*() to check the
>>> length and we may keep a check on the start alignment (not length
>>> though).
>>
>> When was that?  These days dma-iommu already has bouncing code for
>> the untrusted device case, so handling the bouncing there for unaligned
>> request on non-coherent devices shouldn't be all that horrible.
> 
> The bouncing currently is all or nothing with iommu_dma_map_sg(), unlike
> dma_direct_map_sg() which ends up calling dma_direct_map_page() and we
> can do the bouncing per element. So I was looking to untangle
> iommu_dma_map_sg() in a similar way but postponed it as too complicated.
> 
> As a less than optimal solution, we can force bouncing for the whole
> list if any of the sg elements is below the alignment size. Hopefully we
> won't have many such mixed size cases.

Sounds like you may have got the wrong impression - the main difference 
with iommu_dma_map_sg_swiotlb() is that it avoids trying to do any of 
the clever concatenation stuff, and simply maps each segment 
individually with iommu_dma_map_page(), exactly like dma-direct; only 
segments which need bouncing actually get bounced.

The reason for tying it to the up-front dev_use_swiotlb() check is 
because sync and unmap have to do the right thing depending on how the 
list was initially mapped, and that's the easiest way to be consistent. 
However, since the P2P stuff landed what I'd like to do now is use the 
new scatterlist->dma_flags to indicate bounced segments in a similar 
fashion to bus-address segments, and so streamline the SWIOTLB bits into 
the main flow in the equivalent manner.

What sadly wouldn't work is just adding extra conditions to 
dev_use_swiotlb() to go down the existing bounce-if-necessary path for 
all non-coherent devices, since there are non-coherent users of dma-buf 
and v4l2 which (for better or worse) depend on the clever concatenation 
stuff happening.

Thanks,
Robin.
Isaac Manjarres Nov. 1, 2022, 7:10 p.m. UTC | #22
On Tue, Nov 01, 2022 at 06:39:40PM +0100, Christoph Hellwig wrote:
> On Tue, Nov 01, 2022 at 05:32:14PM +0000, Catalin Marinas wrote:
> > There's also the case of low-end phones with all RAM below 4GB and arm64
> > doesn't allocate the swiotlb. Not sure those vendors would go with a
> > recent kernel anyway.
> > 
> > So the need for swiotlb now changes from 32-bit DMA to any DMA
> > (non-coherent but we can't tell upfront when booting, devices may be
> > initialised pretty late).
Not only low-end phones, but there are other form-factors that can fall
into this category and are also memory constrained (e.g. wearable
devices), so the memory headroom impact from enabling SWIOTLB might be
non-negligible for all of these devices. I also think it's feasible for
those devices to use recent kernels.
> 
> Yes.  The other option would be to use the dma coherent pool for the
> bouncing, which must be present on non-coherent systems anyway.  But
> it would require us to write a new set of bounce buffering routines.
I think in addition to having to write new bounce buffering routines,
this approach still suffers the same problem as SWIOTLB, which is that
the memory for SWIOTLB and/or the dma coherent pool is not reclaimable,
even when it is not used.

There's not enough context in the DMA mapping routines to know if we need
an atomic allocation, so if we used kmalloc(), instead of SWIOTLB, to
dynamically allocate memory, it would always have to use GFP_ATOMIC. But
what about having a pool that has a small amount of memory and is
composed of several objects that can be used for small DMA transfers? If
the amount of memory in the pool starts falling below a certain
threshold, there can be a worker thread--so that we don't have to use
GFP_ATOMIC--that can add more memory to the pool?

--Isaac
Catalin Marinas Nov. 2, 2022, 11:05 a.m. UTC | #23
On Tue, Nov 01, 2022 at 12:10:51PM -0700, Isaac Manjarres wrote:
> On Tue, Nov 01, 2022 at 06:39:40PM +0100, Christoph Hellwig wrote:
> > On Tue, Nov 01, 2022 at 05:32:14PM +0000, Catalin Marinas wrote:
> > > There's also the case of low-end phones with all RAM below 4GB and arm64
> > > doesn't allocate the swiotlb. Not sure those vendors would go with a
> > > recent kernel anyway.
> > > 
> > > So the need for swiotlb now changes from 32-bit DMA to any DMA
> > > (non-coherent but we can't tell upfront when booting, devices may be
> > > initialised pretty late).
> 
> Not only low-end phones, but there are other form-factors that can fall
> into this category and are also memory constrained (e.g. wearable
> devices), so the memory headroom impact from enabling SWIOTLB might be
> non-negligible for all of these devices. I also think it's feasible for
> those devices to use recent kernels.

Another option I had in mind is to disable this bouncing if there's no
swiotlb buffer, so kmalloc() will return ARCH_DMA_MINALIGN (or the
typically lower cache_line_size()) aligned objects. That's at least
until we find a lighter way to do bouncing. Those devices would work as
before.

> > Yes.  The other option would be to use the dma coherent pool for the
> > bouncing, which must be present on non-coherent systems anyway.  But
> > it would require us to write a new set of bounce buffering routines.
> 
> I think in addition to having to write new bounce buffering routines,
> this approach still suffers the same problem as SWIOTLB, which is that
> the memory for SWIOTLB and/or the dma coherent pool is not reclaimable,
> even when it is not used.

The dma coherent pool at least has the advantage that its size can be
increased at run-time and we can start with a small one. It cannot be
decreased though, but if really needed I guess that can be added.

We'd also skip some cache maintenance here since the coherent pool is
mapped as non-cacheable already. But to Christoph's point, it does
require some reworking of the current bouncing code.

> There's not enough context in the DMA mapping routines to know if we need
> an atomic allocation, so if we used kmalloc(), instead of SWIOTLB, to
> dynamically allocate memory, it would always have to use GFP_ATOMIC.

I've seen the expression below in a couple of places in the kernel,
though IIUC in_atomic() doesn't always detect atomic contexts:

	gfpflags = (in_atomic() || irqs_disabled()) ? GFP_ATOMIC : GFP_KERNEL;

> But what about having a pool that has a small amount of memory and is
> composed of several objects that can be used for small DMA transfers?
> If the amount of memory in the pool starts falling below a certain
> threshold, there can be a worker thread--so that we don't have to use
> GFP_ATOMIC--that can add more memory to the pool?

If the rate of allocation is high, it may end up calling a slab
allocator directly with GFP_ATOMIC.

The main downside of any memory pool is identifying the original pool in
dma_unmap_*(). We have a simple is_swiotlb_buffer() check looking just
at the bounce buffer boundaries. For the coherent pool we have the more
complex dma_free_from_pool().

With a kmem_cache-based allocator (whether it's behind a mempool or
not), we'd need something like virt_to_cache() and checking whether it
is from our DMA cache. I'm not a big fan of digging into the slab
internals for this. An alternative could be some xarray to remember the
bounced dma_addr.

Anyway, I propose that we try the swiotlb first and look at optimising
it from there, initially using the dma coherent pool.
Catalin Marinas Nov. 2, 2022, 1:10 p.m. UTC | #24
On Tue, Nov 01, 2022 at 06:14:58PM +0000, Robin Murphy wrote:
> On 2022-11-01 17:19, Catalin Marinas wrote:
> > The bouncing currently is all or nothing with iommu_dma_map_sg(), unlike
> > dma_direct_map_sg() which ends up calling dma_direct_map_page() and we
> > can do the bouncing per element. So I was looking to untangle
> > iommu_dma_map_sg() in a similar way but postponed it as too complicated.
> > 
> > As a less than optimal solution, we can force bouncing for the whole
> > list if any of the sg elements is below the alignment size. Hopefully we
> > won't have many such mixed size cases.
> 
> Sounds like you may have got the wrong impression - the main difference with
> iommu_dma_map_sg_swiotlb() is that it avoids trying to do any of the clever
> concatenation stuff, and simply maps each segment individually with
> iommu_dma_map_page(), exactly like dma-direct; only segments which need
> bouncing actually get bounced.

You are right, the iommu_dma_map_page() is called for each element if
bouncing is needed. But without scanning the sg separately,
dev_use_swiotlb() would have to be true for all non-coherent devices to
force it through that path. As you said below, this would break some
use-cases.

> What sadly wouldn't work is just adding extra conditions to
> dev_use_swiotlb() to go down the existing bounce-if-necessary path for all
> non-coherent devices, since there are non-coherent users of dma-buf and v4l2
> which (for better or worse) depend on the clever concatenation stuff
> happening.

Would such cases have a length < ARCH_DMA_MINALIGN for any of the
scatterlist elements? If not, maybe scanning the list first would work,
though we probably do need a dma_flag to avoid scanning it again for
sync and unmap.
Isaac Manjarres Nov. 2, 2022, 8:50 p.m. UTC | #25
On Wed, Nov 02, 2022 at 11:05:54AM +0000, Catalin Marinas wrote:
> On Tue, Nov 01, 2022 at 12:10:51PM -0700, Isaac Manjarres wrote:
> > On Tue, Nov 01, 2022 at 06:39:40PM +0100, Christoph Hellwig wrote:
> > > On Tue, Nov 01, 2022 at 05:32:14PM +0000, Catalin Marinas wrote:
> > > > There's also the case of low-end phones with all RAM below 4GB and arm64
> > > > doesn't allocate the swiotlb. Not sure those vendors would go with a
> > > > recent kernel anyway.
> > > > 
> > > > So the need for swiotlb now changes from 32-bit DMA to any DMA
> > > > (non-coherent but we can't tell upfront when booting, devices may be
> > > > initialised pretty late).
> > 
> > Not only low-end phones, but there are other form-factors that can fall
> > into this category and are also memory constrained (e.g. wearable
> > devices), so the memory headroom impact from enabling SWIOTLB might be
> > non-negligible for all of these devices. I also think it's feasible for
> > those devices to use recent kernels.
> 
> Another option I had in mind is to disable this bouncing if there's no
> swiotlb buffer, so kmalloc() will return ARCH_DMA_MINALIGN (or the
> typically lower cache_line_size()) aligned objects. That's at least
> until we find a lighter way to do bouncing. Those devices would work as
> before.

The SWIOTLB buffer will not be allocated in cases with devices that have
low amounts of RAM sitting entirely below 4 GB. Those devices though
would still benefit greatly from kmalloc() using a smaller size for objects, so
it would be unfortunate to not allow this based on the existence of the
SWIOTLB buffer.
> > > Yes.  The other option would be to use the dma coherent pool for the
> > > bouncing, which must be present on non-coherent systems anyway.  But
> > > it would require us to write a new set of bounce buffering routines.
> > 
> > I think in addition to having to write new bounce buffering routines,
> > this approach still suffers the same problem as SWIOTLB, which is that
> > the memory for SWIOTLB and/or the dma coherent pool is not reclaimable,
> > even when it is not used.
> 
> The dma coherent pool at least it has the advantage that its size can be
> increased at run-time and we can start with a small one. Not decreased
> though, but if really needed I guess it can be added.
> 
> We'd also skip some cache maintenance here since the coherent pool is
> mapped as non-cacheable already. But to Christoph's point, it does
> require some reworking of the current bouncing code.
Right, I do think it's a good thing that the dma coherent pool starts small
and can grow. I don't think it would be too difficult to add logic to
free the memory back. Perhaps using a shrinker might be sufficient to
free back memory when the system is experiencing memory pressure,
instead of relying on some threshold?

> 
> I've seen the expression below in a couple of places in the kernel,
> though IIUC in_atomic() doesn't always detect atomic contexts:
> 
> 	gfpflags = (in_atomic() || irqs_disabled()) ? GFP_ATOMIC : GFP_KERNEL;
> 
I'm not too sure about this; I was going more off of how the mapping
callbacks in iommu/dma-iommu.c use the atomic variants of iommu_map.
> > But what about having a pool that has a small amount of memory and is
> > composed of several objects that can be used for small DMA transfers?
> > If the amount of memory in the pool starts falling below a certain
> > threshold, there can be a worker thread--so that we don't have to use
> > GFP_ATOMIC--that can add more memory to the pool?
> 
> If the rate of allocation is high, it may end up calling a slab
> allocator directly with GFP_ATOMIC.
> 
> The main downside of any memory pool is identifying the original pool in
> dma_unmap_*(). We have a simple is_swiotlb_buffer() check looking just
> at the bounce buffer boundaries. For the coherent pool we have the more
> complex dma_free_from_pool().
> 
> With a kmem_cache-based allocator (whether it's behind a mempool or
> not), we'd need something like virt_to_cache() and checking whether it
> is from our DMA cache. I'm not a big fan of digging into the slab
> internals for this. An alternative could be some xarray to remember the
> bounced dma_addr.
Right. I had actually thought of using something like what is in
mm/dma-pool.c and the dma coherent pool, where the pool is backed
by the page allocator, and the objects are of a fixed size (for ARM64
for example, it would be align(192, ARCH_DMA_MINALIGN) == 256, though
it would be good to have a more generic way of calculating this). Then
determining whether an object resides in the pool boils down to scanning
the backing pages for the pool, which dma coherent pool does.

> Anyway, I propose that we try the swiotlb first and look at optimising
> it from there, initially using the dma coherent pool.
Except for the freeing logic, which can be added if needed as you
pointed out, and Christoph's point about reworking the bouncing code,
dma coherent pool doesn't sound like a bad idea.

--Isaac
Vlastimil Babka Nov. 3, 2022, 4:15 p.m. UTC | #26
On 10/26/22 11:48, Catalin Marinas wrote:
>> > diff --git a/lib/kobject.c b/lib/kobject.c
>> > index a0b2dbfcfa23..2c4acb36925d 100644
>> > --- a/lib/kobject.c
>> > +++ b/lib/kobject.c
>> > @@ -144,7 +144,7 @@ char *kobject_get_path(struct kobject *kobj, gfp_t gfp_mask)
>> >  	len = get_kobj_path_length(kobj);
>> >  	if (len == 0)
>> >  		return NULL;
>> > -	path = kzalloc(len, gfp_mask);
>> > +	path = kzalloc(len, gfp_mask | __GFP_PACKED);
>> 
>> This might not be small, and it's going to be very very short-lived
>> (within a single function call), why does it need to be allocated this
>> way?
> 
> Regarding short-lived objects, you are right, they won't affect
> slabinfo. My ftrace-fu is not great, I only looked at the allocation
> hits and they keep adding up without counting how many are
freed. So maybe we need to trace free() as well, but it's not always easy
to match frees against the allocation point and infer how many live
objects there are.

BTW, since 6.1-rc1 we have a new way with slub_debug to determine how much
memory is wasted, thanks to commit 6edf2576a6cc ("mm/slub: enable debugging
memory wasting of kmalloc") by Feng Tang.

You need to boot the kernel with parameter such as:
slub_debug=U,kmalloc-64,kmalloc-128,kmalloc-192,kmalloc-256
(or just slub_debug=U,kmalloc-* for all sizes, but I guess you are
interested mainly in those that are affected by DMA alignment)
Note it does have some alloc/free CPU overhead and memory overhead, so not
intended for normal production.

Then you can check e.g.
cat /sys/kernel/debug/slab/kmalloc-128/alloc_traces | head -n 50
     77 set_kthread_struct+0x60/0x100 waste=1232/16 age=19492/31067/32465 pid=2 cpus=0-3
        __kmem_cache_alloc_node+0x102/0x340
        kmalloc_trace+0x26/0xa0
        set_kthread_struct+0x60/0x100
        copy_process+0x1903/0x2ee0
        kernel_clone+0xf4/0x4f0
        kernel_thread+0xae/0xe0
        kthreadd+0x491/0x500
        ret_from_fork+0x22/0x30

which tells you there are currently 77 live allocations with this exact
stack trace. The new information in 6.1 is the "waste=1232/16" which
means these allocations waste 16 bytes each due to rounding up to the
kmalloc cache size, or 1232 bytes in total (16*77). This should help
finding the prominent sources of waste.
Catalin Marinas Nov. 3, 2022, 6:03 p.m. UTC | #27
On Thu, Nov 03, 2022 at 05:15:51PM +0100, Vlastimil Babka wrote:
> On 10/26/22 11:48, Catalin Marinas wrote:
> >> > diff --git a/lib/kobject.c b/lib/kobject.c
> >> > index a0b2dbfcfa23..2c4acb36925d 100644
> >> > --- a/lib/kobject.c
> >> > +++ b/lib/kobject.c
> >> > @@ -144,7 +144,7 @@ char *kobject_get_path(struct kobject *kobj, gfp_t gfp_mask)
> >> >  	len = get_kobj_path_length(kobj);
> >> >  	if (len == 0)
> >> >  		return NULL;
> >> > -	path = kzalloc(len, gfp_mask);
> >> > +	path = kzalloc(len, gfp_mask | __GFP_PACKED);
> >> 
> >> This might not be small, and it's going to be very very short-lived
> >> (within a single function call), why does it need to be allocated this
> >> way?
> > 
> > Regarding short-lived objects, you are right, they won't affect
> > slabinfo. My ftrace-fu is not great, I only looked at the allocation
> > hits and they keep adding up without counting how many are
> > freed. So maybe we need tracing free() as well but not always easy to
> > match against the allocation point and infer how many live objects there
> > are.
> 
> BTW, since 6.1-rc1 we have a new way with slub_debug to determine how much
> memory is wasted, thanks to commit 6edf2576a6cc ("mm/slub: enable debugging
> memory wasting of kmalloc") by Feng Tang.
> 
> You need to boot the kernel with parameter such as:
> slub_debug=U,kmalloc-64,kmalloc-128,kmalloc-192,kmalloc-256
> (or just slub_debug=U,kmalloc-* for all sizes, but I guess you are
> interested mainly in those that are affected by DMA alignment)
> Note it does have some alloc/free CPU overhead and memory overhead, so not
> intended for normal production.
> 
> Then you can check e.g.
> cat /sys/kernel/debug/slab/kmalloc-128/alloc_traces | head -n 50
>      77 set_kthread_struct+0x60/0x100 waste=1232/16 age=19492/31067/32465 pid=2 cpus=0-3
>         __kmem_cache_alloc_node+0x102/0x340
>         kmalloc_trace+0x26/0xa0
>         set_kthread_struct+0x60/0x100
>         copy_process+0x1903/0x2ee0
>         kernel_clone+0xf4/0x4f0
>         kernel_thread+0xae/0xe0
>         kthreadd+0x491/0x500
>         ret_from_fork+0x22/0x30
> 
> which tells you there are currently 77 live allocations with this exact
> stack trace. The new information in 6.1 is the "waste=1232/16" which
> means these allocations waste 16 bytes each due to rounding up to the
> kmalloc cache size, or 1232 bytes in total (16*77). This should help
> finding the prominent sources of waste.

Thanks. That's a lot more useful than ftrace for this scenario.

At a quick test in a VM, the above reports about 1200 cases but there
are only around 100 unique allocation places (e.g. kstrdup called from
several places with different sizes). So not too bad if we are to go
with a GFP_ flag.
diff mbox series

Patch

diff --git a/drivers/usb/core/message.c b/drivers/usb/core/message.c
index 4d59d927ae3e..bff8901dc426 100644
--- a/drivers/usb/core/message.c
+++ b/drivers/usb/core/message.c
@@ -525,7 +525,8 @@  int usb_sg_init(struct usb_sg_request *io, struct usb_device *dev,
 	}
 
 	/* initialize all the urbs we'll use */
-	io->urbs = kmalloc_array(io->entries, sizeof(*io->urbs), mem_flags);
+	io->urbs = kmalloc_array(io->entries, sizeof(*io->urbs),
+				 mem_flags | __GFP_PACKED);
 	if (!io->urbs)
 		goto nomem;
 
diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index 63c7ebb0da89..e5ad1a3244fb 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -883,7 +883,8 @@  static int load_elf_binary(struct linux_binprm *bprm)
 			goto out_free_ph;
 
 		retval = -ENOMEM;
-		elf_interpreter = kmalloc(elf_ppnt->p_filesz, GFP_KERNEL);
+		elf_interpreter = kmalloc(elf_ppnt->p_filesz,
+					  GFP_KERNEL | __GFP_PACKED);
 		if (!elf_interpreter)
 			goto out_free_ph;
 
@@ -908,7 +909,8 @@  static int load_elf_binary(struct linux_binprm *bprm)
 		 */
 		would_dump(bprm, interpreter);
 
-		interp_elf_ex = kmalloc(sizeof(*interp_elf_ex), GFP_KERNEL);
+		interp_elf_ex = kmalloc(sizeof(*interp_elf_ex),
+					GFP_KERNEL | __GFP_PACKED);
 		if (!interp_elf_ex) {
 			retval = -ENOMEM;
 			goto out_free_ph;
diff --git a/fs/dcache.c b/fs/dcache.c
index 52e6d5fdab6b..9c66679365ca 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -1785,7 +1785,8 @@  static struct dentry *__d_alloc(struct super_block *sb, const struct qstr *name)
 		size_t size = offsetof(struct external_name, name[1]);
 		struct external_name *p = kmalloc(size + name->len,
 						  GFP_KERNEL_ACCOUNT |
-						  __GFP_RECLAIMABLE);
+						  __GFP_RECLAIMABLE |
+						  __GFP_PACKED);
 		if (!p) {
 			kmem_cache_free(dentry_cache, dentry); 
 			return NULL;
diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c
index 3985f8c33f95..fc8415ff8880 100644
--- a/fs/ext4/dir.c
+++ b/fs/ext4/dir.c
@@ -435,7 +435,7 @@  static struct dir_private_info *ext4_htree_create_dir_info(struct file *filp,
 {
 	struct dir_private_info *p;
 
-	p = kzalloc(sizeof(*p), GFP_KERNEL);
+	p = kzalloc(sizeof(*p), GFP_KERNEL | __GFP_PACKED);
 	if (!p)
 		return NULL;
 	p->curr_hash = pos2maj_hash(filp, pos);
@@ -471,7 +471,7 @@  int ext4_htree_store_dirent(struct file *dir_file, __u32 hash,
 
 	/* Create and allocate the fname structure */
 	len = sizeof(struct fname) + ent_name->len + 1;
-	new_fn = kzalloc(len, GFP_KERNEL);
+	new_fn = kzalloc(len, GFP_KERNEL | __GFP_PACKED);
 	if (!new_fn)
 		return -ENOMEM;
 	new_fn->hash = hash;
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index f1956288307f..d8ffb65ac318 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -912,7 +912,7 @@  ext4_find_extent(struct inode *inode, ext4_lblk_t block,
 	if (!path) {
 		/* account possible depth increase */
 		path = kcalloc(depth + 2, sizeof(struct ext4_ext_path),
-				gfp_flags);
+				gfp_flags | __GFP_PACKED);
 		if (unlikely(!path))
 			return ERR_PTR(-ENOMEM);
 		path[0].p_maxdepth = depth + 1;
@@ -2937,7 +2937,7 @@  int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start,
 				le16_to_cpu(path[k].p_hdr->eh_entries)+1;
 	} else {
 		path = kcalloc(depth + 1, sizeof(struct ext4_ext_path),
-			       GFP_NOFS | __GFP_NOFAIL);
+			       GFP_NOFS | __GFP_NOFAIL | __GFP_PACKED);
 		if (path == NULL) {
 			ext4_journal_stop(handle);
 			return -ENOMEM;
diff --git a/fs/file.c b/fs/file.c
index 5f9c802a5d8d..3f732736eaf3 100644
--- a/fs/file.c
+++ b/fs/file.c
@@ -129,7 +129,7 @@  static struct fdtable * alloc_fdtable(unsigned int nr)
 	if (unlikely(nr > sysctl_nr_open))
 		nr = ((sysctl_nr_open - 1) | (BITS_PER_LONG - 1)) + 1;
 
-	fdt = kmalloc(sizeof(struct fdtable), GFP_KERNEL_ACCOUNT);
+	fdt = kmalloc(sizeof(struct fdtable), GFP_KERNEL_ACCOUNT | __GFP_PACKED);
 	if (!fdt)
 		goto out;
 	fdt->max_fds = nr;
diff --git a/fs/kernfs/file.c b/fs/kernfs/file.c
index 9ab6c92e02da..882980965c55 100644
--- a/fs/kernfs/file.c
+++ b/fs/kernfs/file.c
@@ -304,7 +304,7 @@  static ssize_t kernfs_fop_write_iter(struct kiocb *iocb, struct iov_iter *iter)
 	if (buf)
 		mutex_lock(&of->prealloc_mutex);
 	else
-		buf = kmalloc(len + 1, GFP_KERNEL);
+		buf = kmalloc(len + 1, GFP_KERNEL | __GFP_PACKED);
 	if (!buf)
 		return -ENOMEM;
 
@@ -565,7 +565,7 @@  static int kernfs_get_open_node(struct kernfs_node *kn,
 
 	if (!on) {
 		/* not there, initialize a new one */
-		on = kzalloc(sizeof(*on), GFP_KERNEL);
+		on = kzalloc(sizeof(*on), GFP_KERNEL | __GFP_PACKED);
 		if (!on) {
 			mutex_unlock(mutex);
 			return -ENOMEM;
@@ -663,7 +663,7 @@  static int kernfs_fop_open(struct inode *inode, struct file *file)
 
 	/* allocate a kernfs_open_file for the file */
 	error = -ENOMEM;
-	of = kzalloc(sizeof(struct kernfs_open_file), GFP_KERNEL);
+	of = kzalloc(sizeof(struct kernfs_open_file), GFP_KERNEL | __GFP_PACKED);
 	if (!of)
 		goto err_out;
 
@@ -706,7 +706,7 @@  static int kernfs_fop_open(struct inode *inode, struct file *file)
 		goto err_free;
 	if (ops->prealloc) {
 		int len = of->atomic_write_len ?: PAGE_SIZE;
-		of->prealloc_buf = kmalloc(len + 1, GFP_KERNEL);
+		of->prealloc_buf = kmalloc(len + 1, GFP_KERNEL | __GFP_PACKED);
 		error = -ENOMEM;
 		if (!of->prealloc_buf)
 			goto err_free;
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index 58036f657126..592ed327b1df 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -78,7 +78,7 @@  alloc_nfs_open_dir_context(struct inode *dir)
 	struct nfs_inode *nfsi = NFS_I(dir);
 	struct nfs_open_dir_context *ctx;
 
-	ctx = kzalloc(sizeof(*ctx), GFP_KERNEL_ACCOUNT);
+	ctx = kzalloc(sizeof(*ctx), GFP_KERNEL_ACCOUNT | __GFP_PACKED);
 	if (ctx != NULL) {
 		ctx->attr_gencount = nfsi->attr_gencount;
 		ctx->dtsize = NFS_INIT_DTSIZE;
@@ -1230,7 +1230,7 @@  static int nfs_readdir(struct file *file, struct dir_context *ctx)
 	nfs_revalidate_mapping(inode, file->f_mapping);
 
 	res = -ENOMEM;
-	desc = kzalloc(sizeof(*desc), GFP_KERNEL);
+	desc = kzalloc(sizeof(*desc), GFP_KERNEL | __GFP_PACKED);
 	if (!desc)
 		goto out;
 	desc->file = file;
@@ -3074,7 +3074,8 @@  static void nfs_access_add_rbtree(struct inode *inode,
 void nfs_access_add_cache(struct inode *inode, struct nfs_access_entry *set,
 			  const struct cred *cred)
 {
-	struct nfs_access_entry *cache = kmalloc(sizeof(*cache), GFP_KERNEL);
+	struct nfs_access_entry *cache = kmalloc(sizeof(*cache),
+						 GFP_KERNEL | __GFP_PACKED);
 	if (cache == NULL)
 		return;
 	RB_CLEAR_NODE(&cache->rb_node);
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index 6b2cfa59a1a2..993e2e56dccb 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -948,7 +948,7 @@  struct nfs_lock_context *nfs_get_lock_context(struct nfs_open_context *ctx)
 	res = __nfs_find_lock_context(ctx);
 	rcu_read_unlock();
 	if (res == NULL) {
-		new = kmalloc(sizeof(*new), GFP_KERNEL_ACCOUNT);
+		new = kmalloc(sizeof(*new), GFP_KERNEL_ACCOUNT | __GFP_PACKED);
 		if (new == NULL)
 			return ERR_PTR(-ENOMEM);
 		nfs_init_lock_context(new);
diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c
index c3503fb26fa2..4bba9b8f40e8 100644
--- a/fs/nfs/nfs4state.c
+++ b/fs/nfs/nfs4state.c
@@ -1074,7 +1074,7 @@  struct nfs_seqid *nfs_alloc_seqid(struct nfs_seqid_counter *counter, gfp_t gfp_m
 {
 	struct nfs_seqid *new;
 
-	new = kmalloc(sizeof(*new), gfp_mask);
+	new = kmalloc(sizeof(*new), gfp_mask | __GFP_PACKED);
 	if (new == NULL)
 		return ERR_PTR(-ENOMEM);
 	new->sequence = counter;
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index f41d24b54fd1..d6711c600a9d 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -114,7 +114,8 @@  static void nfs_writehdr_free(struct nfs_pgio_header *hdr)
 
 static struct nfs_io_completion *nfs_io_completion_alloc(gfp_t gfp_flags)
 {
-	return kmalloc(sizeof(struct nfs_io_completion), gfp_flags);
+	return kmalloc(sizeof(struct nfs_io_completion), gfp_flags |
+		       __GFP_PACKED);
 }
 
 static void nfs_io_completion_init(struct nfs_io_completion *ioc,
diff --git a/fs/notify/inotify/inotify_fsnotify.c b/fs/notify/inotify/inotify_fsnotify.c
index 49cfe2ae6d23..78b5e41a247e 100644
--- a/fs/notify/inotify/inotify_fsnotify.c
+++ b/fs/notify/inotify/inotify_fsnotify.c
@@ -86,7 +86,8 @@  int inotify_handle_inode_event(struct fsnotify_mark *inode_mark, u32 mask,
 	 * security repercussion.
 	 */
 	old_memcg = set_active_memcg(group->memcg);
-	event = kmalloc(alloc_len, GFP_KERNEL_ACCOUNT | __GFP_RETRY_MAYFAIL);
+	event = kmalloc(alloc_len, GFP_KERNEL_ACCOUNT | __GFP_RETRY_MAYFAIL |
+			__GFP_PACKED);
 	set_active_memcg(old_memcg);
 
 	if (unlikely(!event)) {
diff --git a/fs/proc/self.c b/fs/proc/self.c
index 72cd69bcaf4a..e97ff8416856 100644
--- a/fs/proc/self.c
+++ b/fs/proc/self.c
@@ -19,7 +19,7 @@  static const char *proc_self_get_link(struct dentry *dentry,
 	if (!tgid)
 		return ERR_PTR(-ENOENT);
 	/* max length of unsigned int in decimal + NULL term */
-	name = kmalloc(10 + 1, dentry ? GFP_KERNEL : GFP_ATOMIC);
+	name = kmalloc(10 + 1, (dentry ? GFP_KERNEL : GFP_ATOMIC) | __GFP_PACKED);
 	if (unlikely(!name))
 		return dentry ? ERR_PTR(-ENOMEM) : ERR_PTR(-ECHILD);
 	sprintf(name, "%u", tgid);
diff --git a/fs/seq_file.c b/fs/seq_file.c
index 9456a2032224..326afd30169d 100644
--- a/fs/seq_file.c
+++ b/fs/seq_file.c
@@ -572,7 +572,8 @@  static void single_stop(struct seq_file *p, void *v)
 int single_open(struct file *file, int (*show)(struct seq_file *, void *),
 		void *data)
 {
-	struct seq_operations *op = kmalloc(sizeof(*op), GFP_KERNEL_ACCOUNT);
+	struct seq_operations *op = kmalloc(sizeof(*op), GFP_KERNEL_ACCOUNT |
+					    __GFP_PACKED);
 	int res = -ENOMEM;
 
 	if (op) {
@@ -634,7 +635,7 @@  void *__seq_open_private(struct file *f, const struct seq_operations *ops,
 	void *private;
 	struct seq_file *seq;
 
-	private = kzalloc(psize, GFP_KERNEL_ACCOUNT);
+	private = kzalloc(psize, GFP_KERNEL_ACCOUNT | __GFP_PACKED);
 	if (private == NULL)
 		goto out;
 
diff --git a/include/acpi/platform/aclinuxex.h b/include/acpi/platform/aclinuxex.h
index 28c72744decf..298ccbc0b949 100644
--- a/include/acpi/platform/aclinuxex.h
+++ b/include/acpi/platform/aclinuxex.h
@@ -49,12 +49,14 @@  acpi_status acpi_os_terminate(void);
  */
 static inline void *acpi_os_allocate(acpi_size size)
 {
-	return kmalloc(size, irqs_disabled()? GFP_ATOMIC : GFP_KERNEL);
+	return kmalloc(size, (irqs_disabled()? GFP_ATOMIC : GFP_KERNEL) |
+		       __GFP_PACKED);
 }
 
 static inline void *acpi_os_allocate_zeroed(acpi_size size)
 {
-	return kzalloc(size, irqs_disabled()? GFP_ATOMIC : GFP_KERNEL);
+	return kzalloc(size, (irqs_disabled()? GFP_ATOMIC : GFP_KERNEL) |
+		       __GFP_PACKED);
 }
 
 static inline void acpi_os_free(void *memory)
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index fbf2543111c0..f7fa75587ed6 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -7284,7 +7284,7 @@  static void add_to_clear_hash_list(struct list_head *clear_list,
 {
 	struct ftrace_init_func *func;
 
-	func = kmalloc(sizeof(*func), GFP_KERNEL);
+	func = kmalloc(sizeof(*func), GFP_KERNEL | __GFP_PACKED);
 	if (!func) {
 		MEM_FAIL(1, "alloc failure, ftrace filter could be stale\n");
 		return;
diff --git a/kernel/trace/tracing_map.c b/kernel/trace/tracing_map.c
index c774e560f2f9..ab1dc9352122 100644
--- a/kernel/trace/tracing_map.c
+++ b/kernel/trace/tracing_map.c
@@ -948,7 +948,7 @@  create_sort_entry(void *key, struct tracing_map_elt *elt)
 {
 	struct tracing_map_sort_entry *sort_entry;
 
-	sort_entry = kzalloc(sizeof(*sort_entry), GFP_KERNEL);
+	sort_entry = kzalloc(sizeof(*sort_entry), GFP_KERNEL | __GFP_PACKED);
 	if (!sort_entry)
 		return NULL;
 
diff --git a/lib/kasprintf.c b/lib/kasprintf.c
index cd2f5974ed98..9409012c9a21 100644
--- a/lib/kasprintf.c
+++ b/lib/kasprintf.c
@@ -22,7 +22,7 @@  char *kvasprintf(gfp_t gfp, const char *fmt, va_list ap)
 	first = vsnprintf(NULL, 0, fmt, aq);
 	va_end(aq);
 
-	p = kmalloc_track_caller(first+1, gfp);
+	p = kmalloc_track_caller(first+1, gfp | __GFP_PACKED);
 	if (!p)
 		return NULL;
 
diff --git a/lib/kobject.c b/lib/kobject.c
index a0b2dbfcfa23..2c4acb36925d 100644
--- a/lib/kobject.c
+++ b/lib/kobject.c
@@ -144,7 +144,7 @@  char *kobject_get_path(struct kobject *kobj, gfp_t gfp_mask)
 	len = get_kobj_path_length(kobj);
 	if (len == 0)
 		return NULL;
-	path = kzalloc(len, gfp_mask);
+	path = kzalloc(len, gfp_mask | __GFP_PACKED);
 	if (!path)
 		return NULL;
 	fill_kobj_path(kobj, path, len);
@@ -749,7 +749,7 @@  static struct kobject *kobject_create(void)
 {
 	struct kobject *kobj;
 
-	kobj = kzalloc(sizeof(*kobj), GFP_KERNEL);
+	kobj = kzalloc(sizeof(*kobj), GFP_KERNEL | __GFP_PACKED);
 	if (!kobj)
 		return NULL;
 
diff --git a/lib/mpi/mpiutil.c b/lib/mpi/mpiutil.c
index aa8c46544af8..441bb69b588e 100644
--- a/lib/mpi/mpiutil.c
+++ b/lib/mpi/mpiutil.c
@@ -88,7 +88,7 @@  MPI mpi_alloc(unsigned nlimbs)
 {
 	MPI a;
 
-	a = kmalloc(sizeof *a, GFP_KERNEL);
+	a = kmalloc(sizeof *a, GFP_KERNEL | __GFP_PACKED);
 	if (!a)
 		return a;
 
diff --git a/mm/list_lru.c b/mm/list_lru.c
index a05e5bef3b40..754a7598206b 100644
--- a/mm/list_lru.c
+++ b/mm/list_lru.c
@@ -340,7 +340,8 @@  static struct list_lru_memcg *memcg_init_list_lru_one(gfp_t gfp)
 	int nid;
 	struct list_lru_memcg *mlru;
 
-	mlru = kmalloc(struct_size(mlru, node, nr_node_ids), gfp);
+	mlru = kmalloc(struct_size(mlru, node, nr_node_ids),
+		       gfp | __GFP_PACKED);
 	if (!mlru)
 		return NULL;
 
@@ -484,7 +485,8 @@  int memcg_list_lru_alloc(struct mem_cgroup *memcg, struct list_lru *lru,
 		return 0;
 
 	gfp &= GFP_RECLAIM_MASK;
-	table = kmalloc_array(memcg->css.cgroup->level, sizeof(*table), gfp);
+	table = kmalloc_array(memcg->css.cgroup->level, sizeof(*table),
+			      gfp | __GFP_PACKED);
 	if (!table)
 		return -ENOMEM;
 
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 2d8549ae1b30..626001f081da 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -2882,8 +2882,8 @@  int memcg_alloc_slab_cgroups(struct slab *slab, struct kmem_cache *s,
 	void *vec;
 
 	gfp &= ~OBJCGS_CLEAR_MASK;
-	vec = kcalloc_node(objects, sizeof(struct obj_cgroup *), gfp,
-			   slab_nid(slab));
+	vec = kcalloc_node(objects, sizeof(struct obj_cgroup *),
+			   gfp | __GFP_PACKED, slab_nid(slab));
 	if (!vec)
 		return -ENOMEM;
 
diff --git a/mm/util.c b/mm/util.c
index 12984e76767e..11c4bbf3849b 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -58,7 +58,7 @@  char *kstrdup(const char *s, gfp_t gfp)
 		return NULL;
 
 	len = strlen(s) + 1;
-	buf = kmalloc_track_caller(len, gfp);
+	buf = kmalloc_track_caller(len, gfp | __GFP_PACKED);
 	if (buf)
 		memcpy(buf, s, len);
 	return buf;
@@ -149,7 +149,7 @@  char *kmemdup_nul(const char *s, size_t len, gfp_t gfp)
 	if (!s)
 		return NULL;
 
-	buf = kmalloc_track_caller(len + 1, gfp);
+	buf = kmalloc_track_caller(len + 1, gfp | __GFP_PACKED);
 	if (buf) {
 		memcpy(buf, s, len);
 		buf[len] = '\0';
@@ -558,7 +558,7 @@  EXPORT_SYMBOL(vm_mmap);
  */
 void *kvmalloc_node(size_t size, gfp_t flags, int node)
 {
-	gfp_t kmalloc_flags = flags;
+	gfp_t kmalloc_flags = flags | __GFP_PACKED;
 	void *ret;
 
 	/*
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index ccaa461998f3..46d9fa305ec8 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -2491,7 +2491,8 @@  static struct vm_struct *__get_vm_area_node(unsigned long size,
 		align = 1ul << clamp_t(int, get_count_order_long(size),
 				       PAGE_SHIFT, IOREMAP_MAX_ORDER);
 
-	area = kzalloc_node(sizeof(*area), gfp_mask & GFP_RECLAIM_MASK, node);
+	area = kzalloc_node(sizeof(*area), (gfp_mask & GFP_RECLAIM_MASK) |
+			    __GFP_PACKED, node);
 	if (unlikely(!area))
 		return NULL;
 
@@ -3026,7 +3027,8 @@  static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
 		area->pages = __vmalloc_node(array_size, 1, nested_gfp, node,
 					area->caller);
 	} else {
-		area->pages = kmalloc_node(array_size, nested_gfp, node);
+		area->pages = kmalloc_node(array_size, nested_gfp |
+					   __GFP_PACKED, node);
 	}
 
 	if (!area->pages) {
diff --git a/net/sunrpc/auth_unix.c b/net/sunrpc/auth_unix.c
index 1e091d3fa607..4129180dc4c8 100644
--- a/net/sunrpc/auth_unix.c
+++ b/net/sunrpc/auth_unix.c
@@ -45,7 +45,7 @@  static struct rpc_cred *unx_lookup_cred(struct rpc_auth *auth,
 {
 	struct rpc_cred *ret;
 
-	ret = kmalloc(sizeof(*ret), rpc_task_gfp_mask());
+	ret = kmalloc(sizeof(*ret), rpc_task_gfp_mask() | __GFP_PACKED);
 	if (!ret) {
 		if (!(flags & RPCAUTH_LOOKUP_ASYNC))
 			return ERR_PTR(-ENOMEM);
diff --git a/net/sunrpc/xdr.c b/net/sunrpc/xdr.c
index 336a7c7833e4..e3019e0c8109 100644
--- a/net/sunrpc/xdr.c
+++ b/net/sunrpc/xdr.c
@@ -146,7 +146,7 @@  xdr_alloc_bvec(struct xdr_buf *buf, gfp_t gfp)
 	size_t i, n = xdr_buf_pagecount(buf);
 
 	if (n != 0 && buf->bvec == NULL) {
-		buf->bvec = kmalloc_array(n, sizeof(buf->bvec[0]), gfp);
+		buf->bvec = kmalloc_array(n, sizeof(buf->bvec[0]), gfp | __GFP_PACKED);
 		if (!buf->bvec)
 			return -ENOMEM;
 		for (i = 0; i < n; i++) {
diff --git a/security/apparmor/lsm.c b/security/apparmor/lsm.c
index f56070270c69..e23d510d57ed 100644
--- a/security/apparmor/lsm.c
+++ b/security/apparmor/lsm.c
@@ -809,7 +809,7 @@  static int apparmor_sk_alloc_security(struct sock *sk, int family, gfp_t flags)
 {
 	struct aa_sk_ctx *ctx;
 
-	ctx = kzalloc(sizeof(*ctx), flags);
+	ctx = kzalloc(sizeof(*ctx), flags | __GFP_PACKED);
 	if (!ctx)
 		return -ENOMEM;
 
diff --git a/security/security.c b/security/security.c
index 79d82cb6e469..b410c284e872 100644
--- a/security/security.c
+++ b/security/security.c
@@ -537,7 +537,7 @@  static int lsm_cred_alloc(struct cred *cred, gfp_t gfp)
 		return 0;
 	}
 
-	cred->security = kzalloc(blob_sizes.lbs_cred, gfp);
+	cred->security = kzalloc(blob_sizes.lbs_cred, gfp | __GFP_PACKED);
 	if (cred->security == NULL)
 		return -ENOMEM;
 	return 0;
@@ -614,7 +614,7 @@  static int lsm_task_alloc(struct task_struct *task)
 		return 0;
 	}
 
-	task->security = kzalloc(blob_sizes.lbs_task, GFP_KERNEL);
+	task->security = kzalloc(blob_sizes.lbs_task, GFP_KERNEL | __GFP_PACKED);
 	if (task->security == NULL)
 		return -ENOMEM;
 	return 0;
diff --git a/security/tomoyo/realpath.c b/security/tomoyo/realpath.c
index 1c483ee7f93d..8004d8e9ad66 100644
--- a/security/tomoyo/realpath.c
+++ b/security/tomoyo/realpath.c
@@ -42,7 +42,7 @@  char *tomoyo_encode2(const char *str, int str_len)
 	}
 	len++;
 	/* Reserve space for appending "/". */
-	cp = kzalloc(len + 10, GFP_NOFS);
+	cp = kzalloc(len + 10, GFP_NOFS | __GFP_PACKED);
 	if (!cp)
 		return NULL;
 	cp0 = cp;