diff mbox series

[2/2] fuse: Adjust readdir() buffer to requesting buffer size.

Message ID 20250401142831.25699-3-jaco@uls.co.za (mailing list archive)
State New
Headers show
Series [1/2] fs: Supply dir_context.count as readdir buffer size hint | expand

Commit Message

Jaco Kroon April 1, 2025, 2:18 p.m. UTC
Clamp to min 1 page (4KB) and max 128 pages (512KB).

Glusterfs trial using strace ls -l.

Before:

getdents64(3, 0x7f2d7d7a7040 /* 25 entries */, 131072) = 600
getdents64(3, 0x7f2d7d7a7040 /* 25 entries */, 131072) = 616
getdents64(3, 0x7f2d7d7a7040 /* 25 entries */, 131072) = 624
getdents64(3, 0x7f2d7d7a7040 /* 25 entries */, 131072) = 600
getdents64(3, 0x7f2d7d7a7040 /* 25 entries */, 131072) = 600
getdents64(3, 0x7f2d7d7a7040 /* 25 entries */, 131072) = 624
getdents64(3, 0x7f2d7d7a7040 /* 25 entries */, 131072) = 600
getdents64(3, 0x7f2d7d7a7040 /* 25 entries */, 131072) = 600
getdents64(3, 0x7f2d7d7a7040 /* 25 entries */, 131072) = 600
getdents64(3, 0x7f2d7d7a7040 /* 25 entries */, 131072) = 600
getdents64(3, 0x7f2d7d7a7040 /* 25 entries */, 131072) = 608
getdents64(3, 0x7f2d7d7a7040 /* 1 entries */, 131072) = 24
getdents64(3, 0x7f2d7d7a7040 /* 0 entries */, 131072) = 0

After:

getdents64(3, 0x7ffae8eed040 /* 276 entries */, 131072) = 6696
getdents64(3, 0x7ffae8eed040 /* 0 entries */, 131072) = 0

Signed-off-by: Jaco Kroon <jaco@uls.co.za>
---
 fs/fuse/readdir.c | 19 +++++++++++++++----
 1 file changed, 15 insertions(+), 4 deletions(-)

Comments

Miklos Szeredi April 1, 2025, 2:40 p.m. UTC | #1
On Tue, 1 Apr 2025 at 16:29, Jaco Kroon <jaco@uls.co.za> wrote:
> After:
>
> getdents64(3, 0x7ffae8eed040 /* 276 entries */, 131072) = 6696
> getdents64(3, 0x7ffae8eed040 /* 0 entries */, 131072) = 0

This looks great.  But see below.

>
> Signed-off-by: Jaco Kroon <jaco@uls.co.za>
> ---
>  fs/fuse/readdir.c | 19 +++++++++++++++----
>  1 file changed, 15 insertions(+), 4 deletions(-)
>
> diff --git a/fs/fuse/readdir.c b/fs/fuse/readdir.c
> index 17ce9636a2b1..a13534f411b4 100644
> --- a/fs/fuse/readdir.c
> +++ b/fs/fuse/readdir.c
> @@ -12,6 +12,7 @@
>  #include <linux/posix_acl.h>
>  #include <linux/pagemap.h>
>  #include <linux/highmem.h>
> +#include <linux/minmax.h>
>
>  static bool fuse_use_readdirplus(struct inode *dir, struct dir_context *ctx)
>  {
> @@ -337,11 +338,21 @@ static int fuse_readdir_uncached(struct file *file, struct dir_context *ctx)
>         struct fuse_mount *fm = get_fuse_mount(inode);
>         struct fuse_io_args ia = {};
>         struct fuse_args_pages *ap = &ia.ap;
> -       struct fuse_folio_desc desc = { .length = PAGE_SIZE };
> +       struct fuse_folio_desc desc = { .length = ctx->count };
>         u64 attr_version = 0, evict_ctr = 0;
>         bool locked;
> +       int order;
>
> -       folio = folio_alloc(GFP_KERNEL, 0);
> +       desc.length = clamp(desc.length, PAGE_SIZE, fm->fc->max_pages << PAGE_SHIFT);
> +       order = get_count_order(desc.length >> CONFIG_PAGE_SHIFT);
> +
> +       do {
> +               folio = folio_alloc(GFP_KERNEL, order);
> +               if (folio)
> +                       break;
> +               --order;
> +               desc.length = PAGE_SIZE << order;
> +       } while (order >= 0);
>         if (!folio)
>                 return -ENOMEM;

Why not use kvmalloc instead?

We could also implement allocation based on size of result in dev.c to
optimize this, as most directories will be small, but that can be done
later.

Thanks,
Miklos
Jaco Kroon April 1, 2025, 3:03 p.m. UTC | #2
Hi,

On 2025/04/01 16:40, Miklos Szeredi wrote:
> On Tue, 1 Apr 2025 at 16:29, Jaco Kroon <jaco@uls.co.za> wrote:
>> After:
>>
>> getdents64(3, 0x7ffae8eed040 /* 276 entries */, 131072) = 6696
>> getdents64(3, 0x7ffae8eed040 /* 0 entries */, 131072) = 0
> This looks great.  But see below.
>
>> Signed-off-by: Jaco Kroon <jaco@uls.co.za>
>> ---
>>   fs/fuse/readdir.c | 19 +++++++++++++++----
>>   1 file changed, 15 insertions(+), 4 deletions(-)
>>
>> diff --git a/fs/fuse/readdir.c b/fs/fuse/readdir.c
>> index 17ce9636a2b1..a13534f411b4 100644
>> --- a/fs/fuse/readdir.c
>> +++ b/fs/fuse/readdir.c
>> @@ -12,6 +12,7 @@
>>   #include <linux/posix_acl.h>
>>   #include <linux/pagemap.h>
>>   #include <linux/highmem.h>
>> +#include <linux/minmax.h>
>>
>>   static bool fuse_use_readdirplus(struct inode *dir, struct dir_context *ctx)
>>   {
>> @@ -337,11 +338,21 @@ static int fuse_readdir_uncached(struct file *file, struct dir_context *ctx)
>>          struct fuse_mount *fm = get_fuse_mount(inode);
>>          struct fuse_io_args ia = {};
>>          struct fuse_args_pages *ap = &ia.ap;
>> -       struct fuse_folio_desc desc = { .length = PAGE_SIZE };
>> +       struct fuse_folio_desc desc = { .length = ctx->count };
>>          u64 attr_version = 0, evict_ctr = 0;
>>          bool locked;
>> +       int order;
>>
>> -       folio = folio_alloc(GFP_KERNEL, 0);
>> +       desc.length = clamp(desc.length, PAGE_SIZE, fm->fc->max_pages << PAGE_SHIFT);
>> +       order = get_count_order(desc.length >> CONFIG_PAGE_SHIFT);
>> +
>> +       do {
>> +               folio = folio_alloc(GFP_KERNEL, order);
>> +               if (folio)
>> +                       break;
>> +               --order;
>> +               desc.length = PAGE_SIZE << order;
>> +       } while (order >= 0);
>>          if (!folio)
>>                  return -ENOMEM;
> Why not use kvmalloc instead?
Because fuse_simple_request via fuse_args_pages (ap) via fuse_io_args 
(ia) expects folios and changing that is more than what I'm capable of, 
and has larger overall impact.
> We could also implement allocation based on size of result in dev.c to
> optimize this, as most directories will be small, but that can be done
> later.

This indeed sounds interesting and would be great, but again, beyond 
what I'm capable of doing at this stage.

Great insights.

Thank you.

Kind regards,
Jaco
Miklos Szeredi April 1, 2025, 3:33 p.m. UTC | #3
On Tue, 1 Apr 2025 at 17:04, Jaco Kroon <jaco@uls.co.za> wrote:

> Because fuse_simple_request via fuse_args_pages (ap) via fuse_io_args
> (ia) expects folios and changing that is more than what I'm capable of,
> and has larger overall impact.

Attaching a minimally tested patch.

Thanks,
Miklos
Jaco Kroon April 2, 2025, 7:54 a.m. UTC | #4
Hi,

I can definitely build on that, thank you.

What's the advantage of kvmalloc over folio's here, why should it be 
preferred?

Kind regards,
Jaco

On 2025/04/01 17:33, Miklos Szeredi wrote:
> On Tue, 1 Apr 2025 at 17:04, Jaco Kroon <jaco@uls.co.za> wrote:
>
>> Because fuse_simple_request via fuse_args_pages (ap) via fuse_io_args
>> (ia) expects folios and changing that is more than what I'm capable of,
>> and has larger overall impact.
> Attaching a minimally tested patch.
>
> Thanks,
> Miklos
Miklos Szeredi April 2, 2025, 8:18 a.m. UTC | #5
On Wed, 2 Apr 2025 at 09:55, Jaco Kroon <jaco@uls.co.za> wrote:
>
> Hi,
>
> I can definitely build on that, thank you.
>
> What's the advantage of kvmalloc over folio's here, why should it be
> preferred?

It offers the best of both worlds: first tries plain malloc (which
just does a folio alloc internally for size > PAGE_SIZE) and if that
fails, falls back to vmalloc, which should always succeed since it
uses order 0 pages.

This saves the trouble of iterating the folio alloc until it succeeds,
which is both nondeterministic and complex, neither of which is
desirable.

Thanks,
Miklos
Jaco Kroon April 2, 2025, 8:52 a.m. UTC | #6
Hi,

On 2025/04/02 10:18, Miklos Szeredi wrote:
> On Wed, 2 Apr 2025 at 09:55, Jaco Kroon <jaco@uls.co.za> wrote:
>> Hi,
>>
>> I can definitely build on that, thank you.
>>
>> What's the advantage of kvmalloc over folio's here, why should it be
>> preferred?
> It offers the best of both worlds: first tries plain malloc (which
> just does a folio alloc internally for size > PAGE_SIZE) and if that
> fails, falls back to vmalloc, which should always succeed since it
> uses order 0 pages.

So basically assigns the space, but doesn't commit physical pages for 
the allocation, meaning first access will cause a page fault, and single 
page allocation at that point in time?  Or is it merely the fact that 
vmalloc may return a virtual contiguous block that's not physically 
contiguous?

Sorry if I'm asking notoriously dumb questions, I've got a VERY BASIC 
grasp of memory management at kernel level, I work much more in 
userspace, and I know there usually first access generates a page fault 
which will then result in memory being physically allocated by the 
kernel.  Generally I ignore these complexities and just assume that the 
"lower down" layers know what they're doing and I've got a "flat, 
contiguous" memory space, and that malloc knows what it's doing and will 
communicate with the kernel regarding which regions of virtual space 
should be mapped.  Love the learning though, so appreciate the feedback 
very much.

>
> This saves the trouble of iterating the folio alloc until it succeeds,
> which is both nondeterministic and complex, neither of which is
> desirable.

Agreed.


>
> Thanks,
> Miklos
Bernd Schubert April 2, 2025, 9:10 a.m. UTC | #7
On 4/2/25 10:52, Jaco Kroon wrote:
> Hi,
> 
> On 2025/04/02 10:18, Miklos Szeredi wrote:
>> On Wed, 2 Apr 2025 at 09:55, Jaco Kroon <jaco@uls.co.za> wrote:
>>> Hi,
>>>
>>> I can definitely build on that, thank you.
>>>
>>> What's the advantage of kvmalloc over folio's here, why should it be
>>> preferred?
>> It offers the best of both worlds: first tries plain malloc (which
>> just does a folio alloc internally for size > PAGE_SIZE) and if that
>> fails, falls back to vmalloc, which should always succeed since it
>> uses order 0 pages.
> 
> So basically assigns the space, but doesn't commit physical pages for
> the allocation, meaning first access will cause a page fault, and single
> page allocation at that point in time?  Or is it merely the fact that
> vmalloc may return a virtual contiguous block that's not physically
> contiguous?


Yes, buffers returned by vmalloc might not be physically contiguous - not
suitable for hardware DMA. And AFAIK it is also a blocking allocation.


Bernd
Jaco Kroon April 2, 2025, 11:13 a.m. UTC | #8
Hi,

On 2025/04/02 11:10, Bernd Schubert wrote:
>
> On 4/2/25 10:52, Jaco Kroon wrote:
>> Hi,
>>
>> On 2025/04/02 10:18, Miklos Szeredi wrote:
>>> On Wed, 2 Apr 2025 at 09:55, Jaco Kroon <jaco@uls.co.za> wrote:
>>>> Hi,
>>>>
>>>> I can definitely build on that, thank you.
>>>>
>>>> What's the advantage of kvmalloc over folio's here, why should it be
>>>> preferred?
>>> It offers the best of both worlds: first tries plain malloc (which
>>> just does a folio alloc internally for size > PAGE_SIZE) and if that
>>> fails, falls back to vmalloc, which should always succeed since it
>>> uses order 0 pages.
>> So basically assigns the space, but doesn't commit physical pages for
>> the allocation, meaning first access will cause a page fault, and single
>> page allocation at that point in time?  Or is it merely the fact that
>> vmalloc may return a virtual contiguous block that's not physically
>> contiguous?
>
> Yes, buffers returned by vmalloc might not be physically contiguous - not
> suitable for hardware DMA. And AFAIK it is also a blocking allocation.
How do I go about confirming?  Can that behaviour be stopped so that in 
the case where it would block we can return an EAGAIN or EWOULDBLOCK 
error code instead?  Is that even desired?

Don't think hardware DMA is an issue here, so that's at least not an 
issue, but the blocking might be?

Kind regards,
Jaco
Miklos Szeredi April 2, 2025, 11:35 a.m. UTC | #9
On Wed, 2 Apr 2025 at 13:13, Jaco Kroon <jaco@uls.co.za> wrote:

> How do I go about confirming?  Can that behaviour be stopped so that in
> the case where it would block we can return an EAGAIN or EWOULDBLOCK
> error code instead?  Is that even desired?

All allocations except GFP_ATOMIC may block (that applies to
folio_alloc() and kmalloc() too).  This shouldn't be a worry in the
readdir path.  Freeing can safely be done in an atomic context.

Thanks,
Miklos
Bernd Schubert April 2, 2025, 11:59 a.m. UTC | #10
On 4/2/25 13:13, Jaco Kroon wrote:
> Hi,
> 
> On 2025/04/02 11:10, Bernd Schubert wrote:
>>
>> On 4/2/25 10:52, Jaco Kroon wrote:
>>> Hi,
>>>
>>> On 2025/04/02 10:18, Miklos Szeredi wrote:
>>>> On Wed, 2 Apr 2025 at 09:55, Jaco Kroon <jaco@uls.co.za> wrote:
>>>>> Hi,
>>>>>
>>>>> I can definitely build on that, thank you.
>>>>>
>>>>> What's the advantage of kvmalloc over folio's here, why should it be
>>>>> preferred?
>>>> It offers the best of both worlds: first tries plain malloc (which
>>>> just does a folio alloc internally for size > PAGE_SIZE) and if that
>>>> fails, falls back to vmalloc, which should always succeed since it
>>>> uses order 0 pages.
>>> So basically assigns the space, but doesn't commit physical pages for
>>> the allocation, meaning first access will cause a page fault, and single
>>> page allocation at that point in time?  Or is it merely the fact that
>>> vmalloc may return a virtual contiguous block that's not physically
>>> contiguous?
>>
>> Yes, buffers returned by vmalloc might not be physically contiguous - not
>> suitable for hardware DMA. And AFAIK it is also a blocking allocation.
> How do I go about confirming?  Can that behaviour be stopped so that in
> the case where it would block we can return an EAGAIN or EWOULDBLOCK
> error code instead?  Is that even desired?
> 
> Don't think hardware DMA is an issue here, so that's at least not an
> issue, but the blocking might be?

I was just writing what vmalloc does - neither of its disadvantages will
be an issue here
diff mbox series

Patch

diff --git a/fs/fuse/readdir.c b/fs/fuse/readdir.c
index 17ce9636a2b1..a13534f411b4 100644
--- a/fs/fuse/readdir.c
+++ b/fs/fuse/readdir.c
@@ -12,6 +12,7 @@ 
 #include <linux/posix_acl.h>
 #include <linux/pagemap.h>
 #include <linux/highmem.h>
+#include <linux/minmax.h>
 
 static bool fuse_use_readdirplus(struct inode *dir, struct dir_context *ctx)
 {
@@ -337,11 +338,21 @@  static int fuse_readdir_uncached(struct file *file, struct dir_context *ctx)
 	struct fuse_mount *fm = get_fuse_mount(inode);
 	struct fuse_io_args ia = {};
 	struct fuse_args_pages *ap = &ia.ap;
-	struct fuse_folio_desc desc = { .length = PAGE_SIZE };
+	struct fuse_folio_desc desc = { .length = ctx->count };
 	u64 attr_version = 0, evict_ctr = 0;
 	bool locked;
+	int order;
 
-	folio = folio_alloc(GFP_KERNEL, 0);
+	desc.length = clamp(desc.length, PAGE_SIZE, fm->fc->max_pages << PAGE_SHIFT);
+	order = get_count_order(desc.length >> CONFIG_PAGE_SHIFT);
+
+	do {
+		folio = folio_alloc(GFP_KERNEL, order);
+		if (folio)
+			break;
+		--order;
+		desc.length = PAGE_SIZE << order;
+	} while (order >= 0);
 	if (!folio)
 		return -ENOMEM;
 
@@ -353,10 +364,10 @@  static int fuse_readdir_uncached(struct file *file, struct dir_context *ctx)
 	if (plus) {
 		attr_version = fuse_get_attr_version(fm->fc);
 		evict_ctr = fuse_get_evict_ctr(fm->fc);
-		fuse_read_args_fill(&ia, file, ctx->pos, PAGE_SIZE,
+		fuse_read_args_fill(&ia, file, ctx->pos, desc.length,
 				    FUSE_READDIRPLUS);
 	} else {
-		fuse_read_args_fill(&ia, file, ctx->pos, PAGE_SIZE,
+		fuse_read_args_fill(&ia, file, ctx->pos, desc.length,
 				    FUSE_READDIR);
 	}
 	locked = fuse_lock_inode(inode);