diff mbox series

fuse: enable larger read buffers for readdir [v2].

Message ID 20230727081237.18217-1-jaco@uls.co.za (mailing list archive)
State New, archived
Headers show
Series fuse: enable larger read buffers for readdir [v2]. | expand

Commit Message

Jaco Kroon July 27, 2023, 8:12 a.m. UTC
This patch does not mess with the caching infrastructure like the
previous one, which we believe caused excessive CPU and broke directory
listings in some cases.

This version only affects the uncached read, which then during parse adds an
entry at a time to the cached structures by way of copying, and as such,
we believe this should be sufficient.

We're still seeing cases where getdents64 takes ~10s (this was the case
in any case without this patch, the difference now that we get ~500
entries for that time rather than the 14-18 previously).  We believe
that that latency is introduced on glusterfs side and is under separate
discussion with the glusterfs developers.

This is still a compile-time option, but a working one compared to
previous patch.  For now this works, but it's not recommended for merge
(as per email discussion).

This still uses alloc_pages rather than kvmalloc/kvfree.

Signed-off-by: Jaco Kroon <jaco@uls.co.za>
---
 fs/fuse/Kconfig   | 16 ++++++++++++++++
 fs/fuse/readdir.c | 18 ++++++++++++------
 2 files changed, 28 insertions(+), 6 deletions(-)

Comments

Miklos Szeredi July 27, 2023, 3:35 p.m. UTC | #1
On Thu, 27 Jul 2023 at 10:13, Jaco Kroon <jaco@uls.co.za> wrote:
>
> This patch does not mess with the caching infrastructure like the
> previous one, which we believe caused excessive CPU and broke directory
> listings in some cases.
>
> This version only affects the uncached read, which then during parse adds an
> entry at a time to the cached structures by way of copying, and as such,
> we believe this should be sufficient.
>
> We're still seeing cases where getdents64 takes ~10s (this was the case
> in any case without this patch, the difference now that we get ~500
> entries for that time rather than the 14-18 previously).  We believe
> that that latency is introduced on glusterfs side and is under separate
> discussion with the glusterfs developers.
>
> This is still a compile-time option, but a working one compared to
> previous patch.  For now this works, but it's not recommended for merge
> (as per email discussion).
>
> This still uses alloc_pages rather than kvmalloc/kvfree.
>
> Signed-off-by: Jaco Kroon <jaco@uls.co.za>
> ---
>  fs/fuse/Kconfig   | 16 ++++++++++++++++
>  fs/fuse/readdir.c | 18 ++++++++++++------
>  2 files changed, 28 insertions(+), 6 deletions(-)
>
> diff --git a/fs/fuse/Kconfig b/fs/fuse/Kconfig
> index 038ed0b9aaa5..0783f9ee5cd3 100644
> --- a/fs/fuse/Kconfig
> +++ b/fs/fuse/Kconfig
> @@ -18,6 +18,22 @@ config FUSE_FS
>           If you want to develop a userspace FS, or if you want to use
>           a filesystem based on FUSE, answer Y or M.
>
> +config FUSE_READDIR_ORDER
> +       int
> +       range 0 5
> +       default 5
> +       help
> +               readdir performance varies greatly depending on the size of the read.
> +               Larger buffers results in larger reads, thus fewer reads and higher
> +               performance in return.
> +
> +               You may want to reduce this value on seriously constrained memory
> +               systems where 128KiB (assuming 4KiB pages) cache pages is not ideal.
> +
> +               This value reprents the order of the number of pages to allocate (ie,
> +               the shift value).  A value of 0 is thus 1 page (4KiB) where 5 is 32
> +               pages (128KiB).
> +
>  config CUSE
>         tristate "Character device in Userspace support"
>         depends on FUSE_FS
> diff --git a/fs/fuse/readdir.c b/fs/fuse/readdir.c
> index dc603479b30e..47cea4d91228 100644
> --- a/fs/fuse/readdir.c
> +++ b/fs/fuse/readdir.c
> @@ -13,6 +13,12 @@
>  #include <linux/pagemap.h>
>  #include <linux/highmem.h>
>
> +#define READDIR_PAGES_ORDER            CONFIG_FUSE_READDIR_ORDER
> +#define READDIR_PAGES                  (1 << READDIR_PAGES_ORDER)
> +#define READDIR_PAGES_SIZE             (PAGE_SIZE << READDIR_PAGES_ORDER)
> +#define READDIR_PAGES_MASK             (READDIR_PAGES_SIZE - 1)
> +#define READDIR_PAGES_SHIFT            (PAGE_SHIFT + READDIR_PAGES_ORDER)
> +
>  static bool fuse_use_readdirplus(struct inode *dir, struct dir_context *ctx)
>  {
>         struct fuse_conn *fc = get_fuse_conn(dir);
> @@ -328,25 +334,25 @@ static int fuse_readdir_uncached(struct file *file, struct dir_context *ctx)
>         struct fuse_mount *fm = get_fuse_mount(inode);
>         struct fuse_io_args ia = {};
>         struct fuse_args_pages *ap = &ia.ap;
> -       struct fuse_page_desc desc = { .length = PAGE_SIZE };
> +       struct fuse_page_desc desc = { .length = READDIR_PAGES_SIZE };

Does this really work?  I would've thought we are relying on single
page lengths somewhere.

>         u64 attr_version = 0;
>         bool locked;
>
> -       page = alloc_page(GFP_KERNEL);
> +       page = alloc_pages(GFP_KERNEL, READDIR_PAGES_ORDER);
>         if (!page)
>                 return -ENOMEM;
>
>         plus = fuse_use_readdirplus(inode, ctx);
>         ap->args.out_pages = true;
> -       ap->num_pages = 1;
> +       ap->num_pages = READDIR_PAGES;

No.  This is the array lenght, which is 1.  This is the hack I guess,
which makes the above trick work.

Better use kvmalloc, which might have a slightly worse performance
than a large page, but definitely not worse than the current single
page.

If we want to optimize the overhead of kvmalloc (and it's a big if)
then the parse_dir*file() functions would need to be converted to
using a page array instead of a plain kernel pointer, which would add
some complexity for sure.

Thanks,
Miklos
Jaco Kroon July 27, 2023, 4:58 p.m. UTC | #2
Hi,

On 2023/07/27 17:35, Miklos Szeredi wrote:
> On Thu, 27 Jul 2023 at 10:13, Jaco Kroon <jaco@uls.co.za> wrote:
>> This patch does not mess with the caching infrastructure like the
>> previous one, which we believe caused excessive CPU and broke directory
>> listings in some cases.
>>
>> This version only affects the uncached read, which then during parse adds an
>> entry at a time to the cached structures by way of copying, and as such,
>> we believe this should be sufficient.
>>
>> We're still seeing cases where getdents64 takes ~10s (this was the case
>> in any case without this patch, the difference now that we get ~500
>> entries for that time rather than the 14-18 previously).  We believe
>> that that latency is introduced on glusterfs side and is under separate
>> discussion with the glusterfs developers.
>>
>> This is still a compile-time option, but a working one compared to
>> previous patch.  For now this works, but it's not recommended for merge
>> (as per email discussion).
>>
>> This still uses alloc_pages rather than kvmalloc/kvfree.
>>
>> Signed-off-by: Jaco Kroon <jaco@uls.co.za>
>> ---
>>   fs/fuse/Kconfig   | 16 ++++++++++++++++
>>   fs/fuse/readdir.c | 18 ++++++++++++------
>>   2 files changed, 28 insertions(+), 6 deletions(-)
>>
>> diff --git a/fs/fuse/Kconfig b/fs/fuse/Kconfig
>> index 038ed0b9aaa5..0783f9ee5cd3 100644
>> --- a/fs/fuse/Kconfig
>> +++ b/fs/fuse/Kconfig
>> @@ -18,6 +18,22 @@ config FUSE_FS
>>            If you want to develop a userspace FS, or if you want to use
>>            a filesystem based on FUSE, answer Y or M.
>>
>> +config FUSE_READDIR_ORDER
>> +       int
>> +       range 0 5
>> +       default 5
>> +       help
>> +               readdir performance varies greatly depending on the size of the read.
>> +               Larger buffers results in larger reads, thus fewer reads and higher
>> +               performance in return.
>> +
>> +               You may want to reduce this value on seriously constrained memory
>> +               systems where 128KiB (assuming 4KiB pages) cache pages is not ideal.
>> +
>> +               This value reprents the order of the number of pages to allocate (ie,
>> +               the shift value).  A value of 0 is thus 1 page (4KiB) where 5 is 32
>> +               pages (128KiB).
>> +
>>   config CUSE
>>          tristate "Character device in Userspace support"
>>          depends on FUSE_FS
>> diff --git a/fs/fuse/readdir.c b/fs/fuse/readdir.c
>> index dc603479b30e..47cea4d91228 100644
>> --- a/fs/fuse/readdir.c
>> +++ b/fs/fuse/readdir.c
>> @@ -13,6 +13,12 @@
>>   #include <linux/pagemap.h>
>>   #include <linux/highmem.h>
>>
>> +#define READDIR_PAGES_ORDER            CONFIG_FUSE_READDIR_ORDER
>> +#define READDIR_PAGES                  (1 << READDIR_PAGES_ORDER)
>> +#define READDIR_PAGES_SIZE             (PAGE_SIZE << READDIR_PAGES_ORDER)
>> +#define READDIR_PAGES_MASK             (READDIR_PAGES_SIZE - 1)
>> +#define READDIR_PAGES_SHIFT            (PAGE_SHIFT + READDIR_PAGES_ORDER)
>> +
>>   static bool fuse_use_readdirplus(struct inode *dir, struct dir_context *ctx)
>>   {
>>          struct fuse_conn *fc = get_fuse_conn(dir);
>> @@ -328,25 +334,25 @@ static int fuse_readdir_uncached(struct file *file, struct dir_context *ctx)
>>          struct fuse_mount *fm = get_fuse_mount(inode);
>>          struct fuse_io_args ia = {};
>>          struct fuse_args_pages *ap = &ia.ap;
>> -       struct fuse_page_desc desc = { .length = PAGE_SIZE };
>> +       struct fuse_page_desc desc = { .length = READDIR_PAGES_SIZE };
> Does this really work?  I would've thought we are relying on single
> page lengths somewhere.
Based on my testing yes.  Getting just under 500 entries per 
getdents64() call from userspace vs 14-18 before I guess the answer is yes.
>
>>          u64 attr_version = 0;
>>          bool locked;
>>
>> -       page = alloc_page(GFP_KERNEL);
>> +       page = alloc_pages(GFP_KERNEL, READDIR_PAGES_ORDER);
>>          if (!page)
>>                  return -ENOMEM;
>>
>>          plus = fuse_use_readdirplus(inode, ctx);
>>          ap->args.out_pages = true;
>> -       ap->num_pages = 1;
>> +       ap->num_pages = READDIR_PAGES;
> No.  This is the array lenght, which is 1.  This is the hack I guess,
> which makes the above trick work.

Oh?  So the page referenced above isn't an array of pages?  It's 
actually a single piece of contiguous memory?

> Better use kvmalloc, which might have a slightly worse performance
> than a large page, but definitely not worse than the current single
> page.

Which returns a void*, not struct page* - guess this can be converted 
using __virt_to_page (iirc)?

> If we want to optimize the overhead of kvmalloc (and it's a big if)
> then the parse_dir*file() functions would need to be converted to
> using a page array instead of a plain kernel pointer, which would add
> some complexity for sure.

Sorry, I read the above as "I'm surprised this works at all and you're 
not kernel panicking all over the show", this is probably the most 
ambitious kernel patch I've attempted to date.

Kind regards,
Jaco
Bernd Schubert July 27, 2023, 7:16 p.m. UTC | #3
On 7/27/23 17:35, Miklos Szeredi wrote:
> On Thu, 27 Jul 2023 at 10:13, Jaco Kroon <jaco@uls.co.za> wrote:
>>
>> This patch does not mess with the caching infrastructure like the
>> previous one, which we believe caused excessive CPU and broke directory
>> listings in some cases.
>>
>> This version only affects the uncached read, which then during parse adds an
>> entry at a time to the cached structures by way of copying, and as such,
>> we believe this should be sufficient.
>>
>> We're still seeing cases where getdents64 takes ~10s (this was the case
>> in any case without this patch, the difference now that we get ~500
>> entries for that time rather than the 14-18 previously).  We believe
>> that that latency is introduced on glusterfs side and is under separate
>> discussion with the glusterfs developers.
>>
>> This is still a compile-time option, but a working one compared to
>> previous patch.  For now this works, but it's not recommended for merge
>> (as per email discussion).
>>
>> This still uses alloc_pages rather than kvmalloc/kvfree.
>>
>> Signed-off-by: Jaco Kroon <jaco@uls.co.za>
>> ---
>>   fs/fuse/Kconfig   | 16 ++++++++++++++++
>>   fs/fuse/readdir.c | 18 ++++++++++++------
>>   2 files changed, 28 insertions(+), 6 deletions(-)
>>
>> diff --git a/fs/fuse/Kconfig b/fs/fuse/Kconfig
>> index 038ed0b9aaa5..0783f9ee5cd3 100644
>> --- a/fs/fuse/Kconfig
>> +++ b/fs/fuse/Kconfig
>> @@ -18,6 +18,22 @@ config FUSE_FS
>>            If you want to develop a userspace FS, or if you want to use
>>            a filesystem based on FUSE, answer Y or M.
>>
>> +config FUSE_READDIR_ORDER
>> +       int
>> +       range 0 5
>> +       default 5
>> +       help
>> +               readdir performance varies greatly depending on the size of the read.
>> +               Larger buffers results in larger reads, thus fewer reads and higher
>> +               performance in return.
>> +
>> +               You may want to reduce this value on seriously constrained memory
>> +               systems where 128KiB (assuming 4KiB pages) cache pages is not ideal.
>> +
>> +               This value reprents the order of the number of pages to allocate (ie,
>> +               the shift value).  A value of 0 is thus 1 page (4KiB) where 5 is 32
>> +               pages (128KiB).
>> +
>>   config CUSE
>>          tristate "Character device in Userspace support"
>>          depends on FUSE_FS
>> diff --git a/fs/fuse/readdir.c b/fs/fuse/readdir.c
>> index dc603479b30e..47cea4d91228 100644
>> --- a/fs/fuse/readdir.c
>> +++ b/fs/fuse/readdir.c
>> @@ -13,6 +13,12 @@
>>   #include <linux/pagemap.h>
>>   #include <linux/highmem.h>
>>
>> +#define READDIR_PAGES_ORDER            CONFIG_FUSE_READDIR_ORDER
>> +#define READDIR_PAGES                  (1 << READDIR_PAGES_ORDER)
>> +#define READDIR_PAGES_SIZE             (PAGE_SIZE << READDIR_PAGES_ORDER)
>> +#define READDIR_PAGES_MASK             (READDIR_PAGES_SIZE - 1)
>> +#define READDIR_PAGES_SHIFT            (PAGE_SHIFT + READDIR_PAGES_ORDER)
>> +
>>   static bool fuse_use_readdirplus(struct inode *dir, struct dir_context *ctx)
>>   {
>>          struct fuse_conn *fc = get_fuse_conn(dir);
>> @@ -328,25 +334,25 @@ static int fuse_readdir_uncached(struct file *file, struct dir_context *ctx)
>>          struct fuse_mount *fm = get_fuse_mount(inode);
>>          struct fuse_io_args ia = {};
>>          struct fuse_args_pages *ap = &ia.ap;
>> -       struct fuse_page_desc desc = { .length = PAGE_SIZE };
>> +       struct fuse_page_desc desc = { .length = READDIR_PAGES_SIZE };
> 
> Does this really work?  I would've thought we are relying on single
> page lengths somewhere.
> 
>>          u64 attr_version = 0;
>>          bool locked;
>>
>> -       page = alloc_page(GFP_KERNEL);
>> +       page = alloc_pages(GFP_KERNEL, READDIR_PAGES_ORDER);
>>          if (!page)
>>                  return -ENOMEM;
>>
>>          plus = fuse_use_readdirplus(inode, ctx);
>>          ap->args.out_pages = true;
>> -       ap->num_pages = 1;
>> +       ap->num_pages = READDIR_PAGES;
> 
> No.  This is the array lenght, which is 1.  This is the hack I guess,
> which makes the above trick work.
> 
> Better use kvmalloc, which might have a slightly worse performance
> than a large page, but definitely not worse than the current single
> page.
> 
> If we want to optimize the overhead of kvmalloc (and it's a big if)
> then the parse_dir*file() functions would need to be converted to
> using a page array instead of a plain kernel pointer, which would add
> some complexity for sure.

One simple possibility might be to do pos=0 with a small buffer size 
single page and only if pos is set  we switch to a larger buffer - that 
way small directories don't get the overhead of the large allocation. 
Although following your idea to to the getdents buffer size - this is 
something libc could already start with.


Cheers,
Bernd
Miklos Szeredi July 27, 2023, 7:17 p.m. UTC | #4
On Thu, 27 Jul 2023 at 18:58, Jaco Kroon <jaco@uls.co.za> wrote:
>
> Hi,
>
> On 2023/07/27 17:35, Miklos Szeredi wrote:
> > On Thu, 27 Jul 2023 at 10:13, Jaco Kroon <jaco@uls.co.za> wrote:
> >> This patch does not mess with the caching infrastructure like the
> >> previous one, which we believe caused excessive CPU and broke directory
> >> listings in some cases.
> >>
> >> This version only affects the uncached read, which then during parse adds an
> >> entry at a time to the cached structures by way of copying, and as such,
> >> we believe this should be sufficient.
> >>
> >> We're still seeing cases where getdents64 takes ~10s (this was the case
> >> in any case without this patch, the difference now that we get ~500
> >> entries for that time rather than the 14-18 previously).  We believe
> >> that that latency is introduced on glusterfs side and is under separate
> >> discussion with the glusterfs developers.
> >>
> >> This is still a compile-time option, but a working one compared to
> >> previous patch.  For now this works, but it's not recommended for merge
> >> (as per email discussion).
> >>
> >> This still uses alloc_pages rather than kvmalloc/kvfree.
> >>
> >> Signed-off-by: Jaco Kroon <jaco@uls.co.za>
> >> ---
> >>   fs/fuse/Kconfig   | 16 ++++++++++++++++
> >>   fs/fuse/readdir.c | 18 ++++++++++++------
> >>   2 files changed, 28 insertions(+), 6 deletions(-)
> >>
> >> diff --git a/fs/fuse/Kconfig b/fs/fuse/Kconfig
> >> index 038ed0b9aaa5..0783f9ee5cd3 100644
> >> --- a/fs/fuse/Kconfig
> >> +++ b/fs/fuse/Kconfig
> >> @@ -18,6 +18,22 @@ config FUSE_FS
> >>            If you want to develop a userspace FS, or if you want to use
> >>            a filesystem based on FUSE, answer Y or M.
> >>
> >> +config FUSE_READDIR_ORDER
> >> +       int
> >> +       range 0 5
> >> +       default 5
> >> +       help
> >> +               readdir performance varies greatly depending on the size of the read.
> >> +               Larger buffers results in larger reads, thus fewer reads and higher
> >> +               performance in return.
> >> +
> >> +               You may want to reduce this value on seriously constrained memory
> >> +               systems where 128KiB (assuming 4KiB pages) cache pages is not ideal.
> >> +
> >> +               This value reprents the order of the number of pages to allocate (ie,
> >> +               the shift value).  A value of 0 is thus 1 page (4KiB) where 5 is 32
> >> +               pages (128KiB).
> >> +
> >>   config CUSE
> >>          tristate "Character device in Userspace support"
> >>          depends on FUSE_FS
> >> diff --git a/fs/fuse/readdir.c b/fs/fuse/readdir.c
> >> index dc603479b30e..47cea4d91228 100644
> >> --- a/fs/fuse/readdir.c
> >> +++ b/fs/fuse/readdir.c
> >> @@ -13,6 +13,12 @@
> >>   #include <linux/pagemap.h>
> >>   #include <linux/highmem.h>
> >>
> >> +#define READDIR_PAGES_ORDER            CONFIG_FUSE_READDIR_ORDER
> >> +#define READDIR_PAGES                  (1 << READDIR_PAGES_ORDER)
> >> +#define READDIR_PAGES_SIZE             (PAGE_SIZE << READDIR_PAGES_ORDER)
> >> +#define READDIR_PAGES_MASK             (READDIR_PAGES_SIZE - 1)
> >> +#define READDIR_PAGES_SHIFT            (PAGE_SHIFT + READDIR_PAGES_ORDER)
> >> +
> >>   static bool fuse_use_readdirplus(struct inode *dir, struct dir_context *ctx)
> >>   {
> >>          struct fuse_conn *fc = get_fuse_conn(dir);
> >> @@ -328,25 +334,25 @@ static int fuse_readdir_uncached(struct file *file, struct dir_context *ctx)
> >>          struct fuse_mount *fm = get_fuse_mount(inode);
> >>          struct fuse_io_args ia = {};
> >>          struct fuse_args_pages *ap = &ia.ap;
> >> -       struct fuse_page_desc desc = { .length = PAGE_SIZE };
> >> +       struct fuse_page_desc desc = { .length = READDIR_PAGES_SIZE };
> > Does this really work?  I would've thought we are relying on single
> > page lengths somewhere.
> Based on my testing yes.  Getting just under 500 entries per
> getdents64() call from userspace vs 14-18 before I guess the answer is yes.
> >
> >>          u64 attr_version = 0;
> >>          bool locked;
> >>
> >> -       page = alloc_page(GFP_KERNEL);
> >> +       page = alloc_pages(GFP_KERNEL, READDIR_PAGES_ORDER);
> >>          if (!page)
> >>                  return -ENOMEM;
> >>
> >>          plus = fuse_use_readdirplus(inode, ctx);
> >>          ap->args.out_pages = true;
> >> -       ap->num_pages = 1;
> >> +       ap->num_pages = READDIR_PAGES;
> > No.  This is the array lenght, which is 1.  This is the hack I guess,
> > which makes the above trick work.
>
> Oh?  So the page referenced above isn't an array of pages?  It's
> actually a single piece of contiguous memory?

Yes.

>
> > Better use kvmalloc, which might have a slightly worse performance
> > than a large page, but definitely not worse than the current single
> > page.
>
> Which returns a void*, not struct page* - guess this can be converted
> using __virt_to_page (iirc)?

No, it cannot be converted to a page or a page array, use it as just a
piece of kernel memory.

Which means:

 - don't set ->args.out_pages and ->num_pages
 - instead set  ->args.out_args[0].value to the allocated pointer

and that should be it (fingers crossed).

>
> > If we want to optimize the overhead of kvmalloc (and it's a big if)
> > then the parse_dir*file() functions would need to be converted to
> > using a page array instead of a plain kernel pointer, which would add
> > some complexity for sure.
>
> Sorry, I read the above as "I'm surprised this works at all and you're
> not kernel panicking all over the show", this is probably the most
> ambitious kernel patch I've attempted to date.

Good start, you'll learn.

Thanks,
Miklos
Bernd Schubert July 27, 2023, 8:35 p.m. UTC | #5
On 7/27/23 17:35, Miklos Szeredi wrote:
> On Thu, 27 Jul 2023 at 10:13, Jaco Kroon <jaco@uls.co.za> wrote:
>>
>> This patch does not mess with the caching infrastructure like the
>> previous one, which we believe caused excessive CPU and broke directory
>> listings in some cases.
>>
>> This version only affects the uncached read, which then during parse adds an
>> entry at a time to the cached structures by way of copying, and as such,
>> we believe this should be sufficient.
>>
>> We're still seeing cases where getdents64 takes ~10s (this was the case
>> in any case without this patch, the difference now that we get ~500
>> entries for that time rather than the 14-18 previously).  We believe
>> that that latency is introduced on glusterfs side and is under separate
>> discussion with the glusterfs developers.
>>
>> This is still a compile-time option, but a working one compared to
>> previous patch.  For now this works, but it's not recommended for merge
>> (as per email discussion).
>>
>> This still uses alloc_pages rather than kvmalloc/kvfree.
>>
>> Signed-off-by: Jaco Kroon <jaco@uls.co.za>
>> ---
>>   fs/fuse/Kconfig   | 16 ++++++++++++++++
>>   fs/fuse/readdir.c | 18 ++++++++++++------
>>   2 files changed, 28 insertions(+), 6 deletions(-)
>>
>> diff --git a/fs/fuse/Kconfig b/fs/fuse/Kconfig
>> index 038ed0b9aaa5..0783f9ee5cd3 100644
>> --- a/fs/fuse/Kconfig
>> +++ b/fs/fuse/Kconfig
>> @@ -18,6 +18,22 @@ config FUSE_FS
>>            If you want to develop a userspace FS, or if you want to use
>>            a filesystem based on FUSE, answer Y or M.
>>
>> +config FUSE_READDIR_ORDER
>> +       int
>> +       range 0 5
>> +       default 5
>> +       help
>> +               readdir performance varies greatly depending on the size of the read.
>> +               Larger buffers results in larger reads, thus fewer reads and higher
>> +               performance in return.
>> +
>> +               You may want to reduce this value on seriously constrained memory
>> +               systems where 128KiB (assuming 4KiB pages) cache pages is not ideal.
>> +
>> +               This value reprents the order of the number of pages to allocate (ie,
>> +               the shift value).  A value of 0 is thus 1 page (4KiB) where 5 is 32
>> +               pages (128KiB).
>> +
>>   config CUSE
>>          tristate "Character device in Userspace support"
>>          depends on FUSE_FS
>> diff --git a/fs/fuse/readdir.c b/fs/fuse/readdir.c
>> index dc603479b30e..47cea4d91228 100644
>> --- a/fs/fuse/readdir.c
>> +++ b/fs/fuse/readdir.c
>> @@ -13,6 +13,12 @@
>>   #include <linux/pagemap.h>
>>   #include <linux/highmem.h>
>>
>> +#define READDIR_PAGES_ORDER            CONFIG_FUSE_READDIR_ORDER
>> +#define READDIR_PAGES                  (1 << READDIR_PAGES_ORDER)
>> +#define READDIR_PAGES_SIZE             (PAGE_SIZE << READDIR_PAGES_ORDER)
>> +#define READDIR_PAGES_MASK             (READDIR_PAGES_SIZE - 1)
>> +#define READDIR_PAGES_SHIFT            (PAGE_SHIFT + READDIR_PAGES_ORDER)
>> +
>>   static bool fuse_use_readdirplus(struct inode *dir, struct dir_context *ctx)
>>   {
>>          struct fuse_conn *fc = get_fuse_conn(dir);
>> @@ -328,25 +334,25 @@ static int fuse_readdir_uncached(struct file *file, struct dir_context *ctx)
>>          struct fuse_mount *fm = get_fuse_mount(inode);
>>          struct fuse_io_args ia = {};
>>          struct fuse_args_pages *ap = &ia.ap;
>> -       struct fuse_page_desc desc = { .length = PAGE_SIZE };
>> +       struct fuse_page_desc desc = { .length = READDIR_PAGES_SIZE };
> 
> Does this really work?  I would've thought we are relying on single
> page lengths somewhere.
> 
>>          u64 attr_version = 0;
>>          bool locked;
>>
>> -       page = alloc_page(GFP_KERNEL);
>> +       page = alloc_pages(GFP_KERNEL, READDIR_PAGES_ORDER);
>>          if (!page)
>>                  return -ENOMEM;
>>
>>          plus = fuse_use_readdirplus(inode, ctx);
>>          ap->args.out_pages = true;
>> -       ap->num_pages = 1;
>> +       ap->num_pages = READDIR_PAGES;
> 
> No.  This is the array lenght, which is 1.  This is the hack I guess,
> which makes the above trick work.

Hmm, ap->num_pages / ap->pages[] is used in fuse_copy_pages, but so is 
ap->descs[] - shouldn't the patch caused an out-of-bound access?
Out of interest, would you mind to explain how the hack worked?


Thanks,
Bernd
Jaco Kroon July 28, 2023, 5:05 a.m. UTC | #6
Hi,
>>>          plus = fuse_use_readdirplus(inode, ctx);
>>>          ap->args.out_pages = true;
>>> -       ap->num_pages = 1;
>>> +       ap->num_pages = READDIR_PAGES;
>>
>> No.  This is the array lenght, which is 1.  This is the hack I guess,
>> which makes the above trick work.
>
> Hmm, ap->num_pages / ap->pages[] is used in fuse_copy_pages, but so is 
> ap->descs[] - shouldn't the patch caused an out-of-bound access?
> Out of interest, would you mind to explain how the hack worked?

Apparently it shouldn't ... my understanding of how pages* worked was 
all wrong.

I'm guessing since all the data fits in the first page (ap->pages[0] in 
other words, of length/size desc.length) that the other pages are never 
accessed.  Looking at fuse_copy_pages this does indeed seem to be the 
case.  So I ended up just being really, really lucky here.

Kind regards,
Jaco
Jaco Kroon March 14, 2025, 10:16 p.m. UTC | #7
This is a follow up to the attempt made a while ago.

Whist the patch worked, newer kernels have moved from pages to folios,
which gave me the motivation to implement the mechanism based on the
userspace buffer size patch that Miklos supplied.

That patch works as is, but I note there are changes to components
(overlayfs and exportfs) that I've got very little experience with, and
have not tested specifically here.  They do look logical.  I've marked
Miklos as the Author: here, and added my own Signed-off-by - I hope this
is correct.

The second patch in the series implements the changes to fuse's readdir
in order to utilize the first to enable reading more than one page of
dent structures at a time from userspace, I've included a strace from
before and after this patch in the commit to illustrate the difference.

To get the relevant performance on glusterfs improved (which was
mentioned elsewhere in the thread) changes to glusterfs to increase the
number of cached dentries is also required (these are pushed to github
but not yet merged, because similar to this patch, got stalled before
getting to the "ready for merge" phase even though it's operational).

Please advise if these two patches looks good (I've only done relatively
basic testing now, and it's not running on production systems yet)

Kind regards,
Jaco
diff mbox series

Patch

diff --git a/fs/fuse/Kconfig b/fs/fuse/Kconfig
index 038ed0b9aaa5..0783f9ee5cd3 100644
--- a/fs/fuse/Kconfig
+++ b/fs/fuse/Kconfig
@@ -18,6 +18,22 @@  config FUSE_FS
 	  If you want to develop a userspace FS, or if you want to use
 	  a filesystem based on FUSE, answer Y or M.
 
+config FUSE_READDIR_ORDER
+	int
+	range 0 5
+	default 5
+	help
+		readdir performance varies greatly depending on the size of the read.
+		Larger buffers results in larger reads, thus fewer reads and higher
+		performance in return.
+
+		You may want to reduce this value on seriously constrained memory
+		systems where 128KiB (assuming 4KiB pages) cache pages is not ideal.
+
+		This value reprents the order of the number of pages to allocate (ie,
+		the shift value).  A value of 0 is thus 1 page (4KiB) where 5 is 32
+		pages (128KiB).
+
 config CUSE
 	tristate "Character device in Userspace support"
 	depends on FUSE_FS
diff --git a/fs/fuse/readdir.c b/fs/fuse/readdir.c
index dc603479b30e..47cea4d91228 100644
--- a/fs/fuse/readdir.c
+++ b/fs/fuse/readdir.c
@@ -13,6 +13,12 @@ 
 #include <linux/pagemap.h>
 #include <linux/highmem.h>
 
+#define READDIR_PAGES_ORDER		CONFIG_FUSE_READDIR_ORDER
+#define READDIR_PAGES			(1 << READDIR_PAGES_ORDER)
+#define READDIR_PAGES_SIZE		(PAGE_SIZE << READDIR_PAGES_ORDER)
+#define READDIR_PAGES_MASK		(READDIR_PAGES_SIZE - 1)
+#define READDIR_PAGES_SHIFT		(PAGE_SHIFT + READDIR_PAGES_ORDER)
+
 static bool fuse_use_readdirplus(struct inode *dir, struct dir_context *ctx)
 {
 	struct fuse_conn *fc = get_fuse_conn(dir);
@@ -328,25 +334,25 @@  static int fuse_readdir_uncached(struct file *file, struct dir_context *ctx)
 	struct fuse_mount *fm = get_fuse_mount(inode);
 	struct fuse_io_args ia = {};
 	struct fuse_args_pages *ap = &ia.ap;
-	struct fuse_page_desc desc = { .length = PAGE_SIZE };
+	struct fuse_page_desc desc = { .length = READDIR_PAGES_SIZE };
 	u64 attr_version = 0;
 	bool locked;
 
-	page = alloc_page(GFP_KERNEL);
+	page = alloc_pages(GFP_KERNEL, READDIR_PAGES_ORDER);
 	if (!page)
 		return -ENOMEM;
 
 	plus = fuse_use_readdirplus(inode, ctx);
 	ap->args.out_pages = true;
-	ap->num_pages = 1;
+	ap->num_pages = READDIR_PAGES;
 	ap->pages = &page;
 	ap->descs = &desc;
 	if (plus) {
 		attr_version = fuse_get_attr_version(fm->fc);
-		fuse_read_args_fill(&ia, file, ctx->pos, PAGE_SIZE,
+		fuse_read_args_fill(&ia, file, ctx->pos, READDIR_PAGES_SIZE,
 				    FUSE_READDIRPLUS);
 	} else {
-		fuse_read_args_fill(&ia, file, ctx->pos, PAGE_SIZE,
+		fuse_read_args_fill(&ia, file, ctx->pos, READDIR_PAGES_SIZE,
 				    FUSE_READDIR);
 	}
 	locked = fuse_lock_inode(inode);
@@ -367,7 +373,7 @@  static int fuse_readdir_uncached(struct file *file, struct dir_context *ctx)
 		}
 	}
 
-	__free_page(page);
+	__free_pages(page, READDIR_PAGES_ORDER);
 	fuse_invalidate_atime(inode);
 	return res;
 }