Message ID | 20240103105929.1902658-1-houtao@huaweicloud.com (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
Series | virtiofs: limit the length of ITER_KVEC dio by max_nopage_rw | expand |
On Wed, Jan 03, 2024 at 06:59:29PM +0800, Hou Tao wrote: > From: Hou Tao <houtao1@huawei.com> > > When trying to insert a 10MB kernel module kept in a virtiofs with cache > disabled, the following warning was reported: > > ------------[ cut here ]------------ > WARNING: CPU: 2 PID: 439 at mm/page_alloc.c:4544 ...... > Modules linked in: > CPU: 2 PID: 439 Comm: insmod Not tainted 6.7.0-rc7+ #33 > Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), ...... > RIP: 0010:__alloc_pages+0x2c4/0x360 > ...... > Call Trace: > <TASK> > ? __warn+0x8f/0x150 > ? __alloc_pages+0x2c4/0x360 > __kmalloc_large_node+0x86/0x160 > __kmalloc+0xcd/0x140 > virtio_fs_enqueue_req+0x240/0x6d0 > virtio_fs_wake_pending_and_unlock+0x7f/0x190 > queue_request_and_unlock+0x58/0x70 > fuse_simple_request+0x18b/0x2e0 > fuse_direct_io+0x58a/0x850 > fuse_file_read_iter+0xdb/0x130 > __kernel_read+0xf3/0x260 > kernel_read+0x45/0x60 > kernel_read_file+0x1ad/0x2b0 > init_module_from_file+0x6a/0xe0 > idempotent_init_module+0x179/0x230 > __x64_sys_finit_module+0x5d/0xb0 > do_syscall_64+0x36/0xb0 > entry_SYSCALL_64_after_hwframe+0x6e/0x76 > ...... > </TASK> > ---[ end trace 0000000000000000 ]--- > > The warning happened as follow. In copy_args_to_argbuf(), virtiofs uses > kmalloc-ed memory as bound buffer for fuse args, but > fuse_get_user_pages() only limits the length of fuse arg by max_read or > max_write for IOV_KVEC io (e.g., kernel_read_file from finit_module()). > For virtiofs, max_read is UINT_MAX, so a big read request which is about > 10MB is passed to copy_args_to_argbuf(), kmalloc() is called in turn > with len=10MB, and triggers the warning in __alloc_pages(): > WARN_ON_ONCE_GFP(order > MAX_ORDER, gfp)). > > A feasible solution is to limit the value of max_read for virtiofs, so > the length passed to kmalloc() will be limited. However it will affects > the max read size for ITER_IOVEC io and the value of max_write also needs > limitation. So instead of limiting the values of max_read and max_write, > introducing max_nopage_rw to cap both the values of max_read and > max_write when the fuse dio read/write request is initiated from kernel. > > Considering that fuse read/write request from kernel is uncommon and to > decrease the demand for large contiguous pages, set max_nopage_rw as > 256KB instead of KMALLOC_MAX_SIZE - 4096 or similar. > > Fixes: a62a8ef9d97d ("virtio-fs: add virtiofs filesystem") > Signed-off-by: Hou Tao <houtao1@huawei.com> Could this get some acks from virtio fs maintainers pls? > --- > fs/fuse/file.c | 12 +++++++++++- > fs/fuse/fuse_i.h | 3 +++ > fs/fuse/inode.c | 1 + > fs/fuse/virtio_fs.c | 6 ++++++ > 4 files changed, 21 insertions(+), 1 deletion(-) > > diff --git a/fs/fuse/file.c b/fs/fuse/file.c > index a660f1f21540..f1beb7c0b782 100644 > --- a/fs/fuse/file.c > +++ b/fs/fuse/file.c > @@ -1422,6 +1422,16 @@ static int fuse_get_user_pages(struct fuse_args_pages *ap, struct iov_iter *ii, > return ret < 0 ? ret : 0; > } > > +static size_t fuse_max_dio_rw_size(const struct fuse_conn *fc, > + const struct iov_iter *iter, int write) > +{ > + unsigned int nmax = write ? fc->max_write : fc->max_read; > + > + if (iov_iter_is_kvec(iter)) > + nmax = min(nmax, fc->max_nopage_rw); > + return nmax; > +} > + > ssize_t fuse_direct_io(struct fuse_io_priv *io, struct iov_iter *iter, > loff_t *ppos, int flags) > { > @@ -1432,7 +1442,7 @@ ssize_t fuse_direct_io(struct fuse_io_priv *io, struct iov_iter *iter, > struct inode *inode = mapping->host; > struct fuse_file *ff = file->private_data; > struct fuse_conn *fc = ff->fm->fc; > - size_t nmax = write ? fc->max_write : fc->max_read; > + size_t nmax = fuse_max_dio_rw_size(fc, iter, write); > loff_t pos = *ppos; > size_t count = iov_iter_count(iter); > pgoff_t idx_from = pos >> PAGE_SHIFT; > diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h > index 1df83eebda92..fc753cd34211 100644 > --- a/fs/fuse/fuse_i.h > +++ b/fs/fuse/fuse_i.h > @@ -594,6 +594,9 @@ struct fuse_conn { > /** Constrain ->max_pages to this value during feature negotiation */ > unsigned int max_pages_limit; > > + /** Maximum read/write size when there is no page in request */ > + unsigned int max_nopage_rw; > + > /** Input queue */ > struct fuse_iqueue iq; > > diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c > index 2a6d44f91729..4cbbcb4a4b71 100644 > --- a/fs/fuse/inode.c > +++ b/fs/fuse/inode.c > @@ -923,6 +923,7 @@ void fuse_conn_init(struct fuse_conn *fc, struct fuse_mount *fm, > fc->user_ns = get_user_ns(user_ns); > fc->max_pages = FUSE_DEFAULT_MAX_PAGES_PER_REQ; > fc->max_pages_limit = FUSE_MAX_MAX_PAGES; > + fc->max_nopage_rw = UINT_MAX; > > INIT_LIST_HEAD(&fc->mounts); > list_add(&fm->fc_entry, &fc->mounts); > diff --git a/fs/fuse/virtio_fs.c b/fs/fuse/virtio_fs.c > index 5f1be1da92ce..3aac31d45198 100644 > --- a/fs/fuse/virtio_fs.c > +++ b/fs/fuse/virtio_fs.c > @@ -1452,6 +1452,12 @@ static int virtio_fs_get_tree(struct fs_context *fsc) > /* Tell FUSE to split requests that exceed the virtqueue's size */ > fc->max_pages_limit = min_t(unsigned int, fc->max_pages_limit, > virtqueue_size - FUSE_HEADER_OVERHEAD); > + /* copy_args_to_argbuf() uses kmalloc-ed memory as bounce buffer > + * for fuse args, so limit the total size of these args to prevent > + * the warning in __alloc_pages() and decrease the demand for large > + * contiguous pages. > + */ > + fc->max_nopage_rw = min(fc->max_nopage_rw, 256U << 10); > > fsc->s_fs_info = fm; > sb = sget_fc(fsc, virtio_fs_test_super, set_anon_super_fc); > -- > 2.29.2
On 1/3/24 11:59, Hou Tao wrote: > From: Hou Tao <houtao1@huawei.com> > > When trying to insert a 10MB kernel module kept in a virtiofs with cache > disabled, the following warning was reported: > > ------------[ cut here ]------------ > WARNING: CPU: 2 PID: 439 at mm/page_alloc.c:4544 ...... > Modules linked in: > CPU: 2 PID: 439 Comm: insmod Not tainted 6.7.0-rc7+ #33 > Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), ...... > RIP: 0010:__alloc_pages+0x2c4/0x360 > ...... > Call Trace: > <TASK> > ? __warn+0x8f/0x150 > ? __alloc_pages+0x2c4/0x360 > __kmalloc_large_node+0x86/0x160 > __kmalloc+0xcd/0x140 > virtio_fs_enqueue_req+0x240/0x6d0 > virtio_fs_wake_pending_and_unlock+0x7f/0x190 > queue_request_and_unlock+0x58/0x70 > fuse_simple_request+0x18b/0x2e0 > fuse_direct_io+0x58a/0x850 > fuse_file_read_iter+0xdb/0x130 > __kernel_read+0xf3/0x260 > kernel_read+0x45/0x60 > kernel_read_file+0x1ad/0x2b0 > init_module_from_file+0x6a/0xe0 > idempotent_init_module+0x179/0x230 > __x64_sys_finit_module+0x5d/0xb0 > do_syscall_64+0x36/0xb0 > entry_SYSCALL_64_after_hwframe+0x6e/0x76 > ...... > </TASK> > ---[ end trace 0000000000000000 ]--- > > The warning happened as follow. In copy_args_to_argbuf(), virtiofs uses > kmalloc-ed memory as bound buffer for fuse args, but > fuse_get_user_pages() only limits the length of fuse arg by max_read or > max_write for IOV_KVEC io (e.g., kernel_read_file from finit_module()). > For virtiofs, max_read is UINT_MAX, so a big read request which is about I find this part of the explanation a bit confusing. I guess you wanted to write something like fuse_direct_io() -> fuse_get_user_pages() is limited by fc->max_write/fc->max_read and fc->max_pages. For virtiofs max_pages does not apply as ITER_KVEC is used. As virtiofs sets fc->max_read to UINT_MAX basically no limit is applied at all. I also wonder if it wouldn't it make sense to set a sensible limit in virtio_fs_ctx_set_defaults() instead of introducing a new variable? Also, I guess the issue is kmalloc_array() in virtio_fs_enqueue_req? Wouldn't it make sense to use kvm_alloc_array/kvfree in that function? Thanks, Bernd > 10MB is passed to copy_args_to_argbuf(), kmalloc() is called in turn > with len=10MB, and triggers the warning in __alloc_pages(): > WARN_ON_ONCE_GFP(order > MAX_ORDER, gfp)). > > A feasible solution is to limit the value of max_read for virtiofs, so > the length passed to kmalloc() will be limited. However it will affects > the max read size for ITER_IOVEC io and the value of max_write also needs > limitation. So instead of limiting the values of max_read and max_write, > introducing max_nopage_rw to cap both the values of max_read and > max_write when the fuse dio read/write request is initiated from kernel. > > Considering that fuse read/write request from kernel is uncommon and to > decrease the demand for large contiguous pages, set max_nopage_rw as > 256KB instead of KMALLOC_MAX_SIZE - 4096 or similar. > > Fixes: a62a8ef9d97d ("virtio-fs: add virtiofs filesystem") > Signed-off-by: Hou Tao <houtao1@huawei.com> > --- > fs/fuse/file.c | 12 +++++++++++- > fs/fuse/fuse_i.h | 3 +++ > fs/fuse/inode.c | 1 + > fs/fuse/virtio_fs.c | 6 ++++++ > 4 files changed, 21 insertions(+), 1 deletion(-) > > diff --git a/fs/fuse/file.c b/fs/fuse/file.c > index a660f1f21540..f1beb7c0b782 100644 > --- a/fs/fuse/file.c > +++ b/fs/fuse/file.c > @@ -1422,6 +1422,16 @@ static int fuse_get_user_pages(struct fuse_args_pages *ap, struct iov_iter *ii, > return ret < 0 ? ret : 0; > } > > +static size_t fuse_max_dio_rw_size(const struct fuse_conn *fc, > + const struct iov_iter *iter, int write) > +{ > + unsigned int nmax = write ? fc->max_write : fc->max_read; > + > + if (iov_iter_is_kvec(iter)) > + nmax = min(nmax, fc->max_nopage_rw); > + return nmax; > +} > + > ssize_t fuse_direct_io(struct fuse_io_priv *io, struct iov_iter *iter, > loff_t *ppos, int flags) > { > @@ -1432,7 +1442,7 @@ ssize_t fuse_direct_io(struct fuse_io_priv *io, struct iov_iter *iter, > struct inode *inode = mapping->host; > struct fuse_file *ff = file->private_data; > struct fuse_conn *fc = ff->fm->fc; > - size_t nmax = write ? fc->max_write : fc->max_read; > + size_t nmax = fuse_max_dio_rw_size(fc, iter, write); > loff_t pos = *ppos; > size_t count = iov_iter_count(iter); > pgoff_t idx_from = pos >> PAGE_SHIFT; > diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h > index 1df83eebda92..fc753cd34211 100644 > --- a/fs/fuse/fuse_i.h > +++ b/fs/fuse/fuse_i.h > @@ -594,6 +594,9 @@ struct fuse_conn { > /** Constrain ->max_pages to this value during feature negotiation */ > unsigned int max_pages_limit; > > + /** Maximum read/write size when there is no page in request */ > + unsigned int max_nopage_rw; > + > /** Input queue */ > struct fuse_iqueue iq; > > diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c > index 2a6d44f91729..4cbbcb4a4b71 100644 > --- a/fs/fuse/inode.c > +++ b/fs/fuse/inode.c > @@ -923,6 +923,7 @@ void fuse_conn_init(struct fuse_conn *fc, struct fuse_mount *fm, > fc->user_ns = get_user_ns(user_ns); > fc->max_pages = FUSE_DEFAULT_MAX_PAGES_PER_REQ; > fc->max_pages_limit = FUSE_MAX_MAX_PAGES; > + fc->max_nopage_rw = UINT_MAX; > > INIT_LIST_HEAD(&fc->mounts); > list_add(&fm->fc_entry, &fc->mounts); > diff --git a/fs/fuse/virtio_fs.c b/fs/fuse/virtio_fs.c > index 5f1be1da92ce..3aac31d45198 100644 > --- a/fs/fuse/virtio_fs.c > +++ b/fs/fuse/virtio_fs.c > @@ -1452,6 +1452,12 @@ static int virtio_fs_get_tree(struct fs_context *fsc) > /* Tell FUSE to split requests that exceed the virtqueue's size */ > fc->max_pages_limit = min_t(unsigned int, fc->max_pages_limit, > virtqueue_size - FUSE_HEADER_OVERHEAD); > + /* copy_args_to_argbuf() uses kmalloc-ed memory as bounce buffer > + * for fuse args, so limit the total size of these args to prevent > + * the warning in __alloc_pages() and decrease the demand for large > + * contiguous pages. > + */ > + fc->max_nopage_rw = min(fc->max_nopage_rw, 256U << 10); > > fsc->s_fs_info = fm; > sb = sget_fc(fsc, virtio_fs_test_super, set_anon_super_fc);
Hi, On 1/9/2024 9:11 PM, Bernd Schubert wrote: > > > On 1/3/24 11:59, Hou Tao wrote: >> From: Hou Tao <houtao1@huawei.com> >> >> When trying to insert a 10MB kernel module kept in a virtiofs with cache >> disabled, the following warning was reported: >> >> ------------[ cut here ]------------ >> WARNING: CPU: 2 PID: 439 at mm/page_alloc.c:4544 ...... >> Modules linked in: >> CPU: 2 PID: 439 Comm: insmod Not tainted 6.7.0-rc7+ #33 >> Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), ...... >> RIP: 0010:__alloc_pages+0x2c4/0x360 >> ...... >> Call Trace: >> <TASK> >> ? __warn+0x8f/0x150 >> ? __alloc_pages+0x2c4/0x360 >> __kmalloc_large_node+0x86/0x160 >> __kmalloc+0xcd/0x140 >> virtio_fs_enqueue_req+0x240/0x6d0 >> virtio_fs_wake_pending_and_unlock+0x7f/0x190 >> queue_request_and_unlock+0x58/0x70 >> fuse_simple_request+0x18b/0x2e0 >> fuse_direct_io+0x58a/0x850 >> fuse_file_read_iter+0xdb/0x130 >> __kernel_read+0xf3/0x260 >> kernel_read+0x45/0x60 >> kernel_read_file+0x1ad/0x2b0 >> init_module_from_file+0x6a/0xe0 >> idempotent_init_module+0x179/0x230 >> __x64_sys_finit_module+0x5d/0xb0 >> do_syscall_64+0x36/0xb0 >> entry_SYSCALL_64_after_hwframe+0x6e/0x76 >> ...... >> </TASK> >> ---[ end trace 0000000000000000 ]--- >> >> The warning happened as follow. In copy_args_to_argbuf(), virtiofs uses >> kmalloc-ed memory as bound buffer for fuse args, but >> fuse_get_user_pages() only limits the length of fuse arg by max_read or >> max_write for IOV_KVEC io (e.g., kernel_read_file from finit_module()). >> For virtiofs, max_read is UINT_MAX, so a big read request which is about > > > I find this part of the explanation a bit confusing. I guess you > wanted to write something like > > fuse_direct_io() -> fuse_get_user_pages() is limited by > fc->max_write/fc->max_read and fc->max_pages. For virtiofs max_pages > does not apply as ITER_KVEC is used. As virtiofs sets fc->max_read to > UINT_MAX basically no limit is applied at all. Yes, what you said is just as expected but it is not the root cause of the warning. The culprit of the warning is kmalloc() in copy_args_to_argbuf() just as said in commit message. vmalloc() is also not acceptable, because the physical memory needs to be contiguous. For the problem, because there is no page involved, so there will be extra sg available, maybe we can use these sg to break the big read/write request into page. > > I also wonder if it wouldn't it make sense to set a sensible limit in > virtio_fs_ctx_set_defaults() instead of introducing a new variable? As said in the commit message: A feasible solution is to limit the value of max_read for virtiofs, so the length passed to kmalloc() will be limited. However it will affects the max read size for ITER_IOVEC io and the value of max_write also needs limitation. It is a bit hard to set a reasonable value for both max_read and max_write to handle both normal ITER_IOVEC io and ITER_KVEC io. And considering ITER_KVEC io + dio case is uncommon, I think using a new limitation is more reasonable. > > Also, I guess the issue is kmalloc_array() in virtio_fs_enqueue_req? > Wouldn't it make sense to use kvm_alloc_array/kvfree in that function? > > > Thanks, > Bernd > > >> 10MB is passed to copy_args_to_argbuf(), kmalloc() is called in turn >> with len=10MB, and triggers the warning in __alloc_pages(): >> WARN_ON_ONCE_GFP(order > MAX_ORDER, gfp)). >> >> A feasible solution is to limit the value of max_read for virtiofs, so >> the length passed to kmalloc() will be limited. However it will affects >> the max read size for ITER_IOVEC io and the value of max_write also >> needs >> limitation. So instead of limiting the values of max_read and max_write, >> introducing max_nopage_rw to cap both the values of max_read and >> max_write when the fuse dio read/write request is initiated from kernel. >> >> Considering that fuse read/write request from kernel is uncommon and to >> decrease the demand for large contiguous pages, set max_nopage_rw as >> 256KB instead of KMALLOC_MAX_SIZE - 4096 or similar. >> >> Fixes: a62a8ef9d97d ("virtio-fs: add virtiofs filesystem") >> Signed-off-by: Hou Tao <houtao1@huawei.com> >> --- >> fs/fuse/file.c | 12 +++++++++++- >> fs/fuse/fuse_i.h | 3 +++ >> fs/fuse/inode.c | 1 + >> fs/fuse/virtio_fs.c | 6 ++++++ >> 4 files changed, 21 insertions(+), 1 deletion(-) >> >> diff --git a/fs/fuse/file.c b/fs/fuse/file.c >> index a660f1f21540..f1beb7c0b782 100644 >> --- a/fs/fuse/file.c >> +++ b/fs/fuse/file.c >> @@ -1422,6 +1422,16 @@ static int fuse_get_user_pages(struct >> fuse_args_pages *ap, struct iov_iter *ii, >> return ret < 0 ? ret : 0; >> } >> +static size_t fuse_max_dio_rw_size(const struct fuse_conn *fc, >> + const struct iov_iter *iter, int write) >> +{ >> + unsigned int nmax = write ? fc->max_write : fc->max_read; >> + >> + if (iov_iter_is_kvec(iter)) >> + nmax = min(nmax, fc->max_nopage_rw); >> + return nmax; >> +} >> + >> ssize_t fuse_direct_io(struct fuse_io_priv *io, struct iov_iter *iter, >> loff_t *ppos, int flags) >> { >> @@ -1432,7 +1442,7 @@ ssize_t fuse_direct_io(struct fuse_io_priv *io, >> struct iov_iter *iter, >> struct inode *inode = mapping->host; >> struct fuse_file *ff = file->private_data; >> struct fuse_conn *fc = ff->fm->fc; >> - size_t nmax = write ? fc->max_write : fc->max_read; >> + size_t nmax = fuse_max_dio_rw_size(fc, iter, write); >> loff_t pos = *ppos; >> size_t count = iov_iter_count(iter); >> pgoff_t idx_from = pos >> PAGE_SHIFT; >> diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h >> index 1df83eebda92..fc753cd34211 100644 >> --- a/fs/fuse/fuse_i.h >> +++ b/fs/fuse/fuse_i.h >> @@ -594,6 +594,9 @@ struct fuse_conn { >> /** Constrain ->max_pages to this value during feature >> negotiation */ >> unsigned int max_pages_limit; >> + /** Maximum read/write size when there is no page in request */ >> + unsigned int max_nopage_rw; >> + >> /** Input queue */ >> struct fuse_iqueue iq; >> diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c >> index 2a6d44f91729..4cbbcb4a4b71 100644 >> --- a/fs/fuse/inode.c >> +++ b/fs/fuse/inode.c >> @@ -923,6 +923,7 @@ void fuse_conn_init(struct fuse_conn *fc, struct >> fuse_mount *fm, >> fc->user_ns = get_user_ns(user_ns); >> fc->max_pages = FUSE_DEFAULT_MAX_PAGES_PER_REQ; >> fc->max_pages_limit = FUSE_MAX_MAX_PAGES; >> + fc->max_nopage_rw = UINT_MAX; >> INIT_LIST_HEAD(&fc->mounts); >> list_add(&fm->fc_entry, &fc->mounts); >> diff --git a/fs/fuse/virtio_fs.c b/fs/fuse/virtio_fs.c >> index 5f1be1da92ce..3aac31d45198 100644 >> --- a/fs/fuse/virtio_fs.c >> +++ b/fs/fuse/virtio_fs.c >> @@ -1452,6 +1452,12 @@ static int virtio_fs_get_tree(struct >> fs_context *fsc) >> /* Tell FUSE to split requests that exceed the virtqueue's size */ >> fc->max_pages_limit = min_t(unsigned int, fc->max_pages_limit, >> virtqueue_size - FUSE_HEADER_OVERHEAD); >> + /* copy_args_to_argbuf() uses kmalloc-ed memory as bounce buffer >> + * for fuse args, so limit the total size of these args to prevent >> + * the warning in __alloc_pages() and decrease the demand for large >> + * contiguous pages. >> + */ >> + fc->max_nopage_rw = min(fc->max_nopage_rw, 256U << 10); >> fsc->s_fs_info = fm; >> sb = sget_fc(fsc, virtio_fs_test_super, set_anon_super_fc); > .
On 1/10/24 02:16, Hou Tao wrote: > Hi, > > On 1/9/2024 9:11 PM, Bernd Schubert wrote: >> >> >> On 1/3/24 11:59, Hou Tao wrote: >>> From: Hou Tao <houtao1@huawei.com> >>> >>> When trying to insert a 10MB kernel module kept in a virtiofs with cache >>> disabled, the following warning was reported: >>> >>> ------------[ cut here ]------------ >>> WARNING: CPU: 2 PID: 439 at mm/page_alloc.c:4544 ...... >>> Modules linked in: >>> CPU: 2 PID: 439 Comm: insmod Not tainted 6.7.0-rc7+ #33 >>> Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), ...... >>> RIP: 0010:__alloc_pages+0x2c4/0x360 >>> ...... >>> Call Trace: >>> <TASK> >>> ? __warn+0x8f/0x150 >>> ? __alloc_pages+0x2c4/0x360 >>> __kmalloc_large_node+0x86/0x160 >>> __kmalloc+0xcd/0x140 >>> virtio_fs_enqueue_req+0x240/0x6d0 >>> virtio_fs_wake_pending_and_unlock+0x7f/0x190 >>> queue_request_and_unlock+0x58/0x70 >>> fuse_simple_request+0x18b/0x2e0 >>> fuse_direct_io+0x58a/0x850 >>> fuse_file_read_iter+0xdb/0x130 >>> __kernel_read+0xf3/0x260 >>> kernel_read+0x45/0x60 >>> kernel_read_file+0x1ad/0x2b0 >>> init_module_from_file+0x6a/0xe0 >>> idempotent_init_module+0x179/0x230 >>> __x64_sys_finit_module+0x5d/0xb0 >>> do_syscall_64+0x36/0xb0 >>> entry_SYSCALL_64_after_hwframe+0x6e/0x76 >>> ...... >>> </TASK> >>> ---[ end trace 0000000000000000 ]--- >>> >>> The warning happened as follow. In copy_args_to_argbuf(), virtiofs uses >>> kmalloc-ed memory as bound buffer for fuse args, but >>> fuse_get_user_pages() only limits the length of fuse arg by max_read or >>> max_write for IOV_KVEC io (e.g., kernel_read_file from finit_module()). >>> For virtiofs, max_read is UINT_MAX, so a big read request which is about >> >> >> I find this part of the explanation a bit confusing. I guess you >> wanted to write something like >> >> fuse_direct_io() -> fuse_get_user_pages() is limited by >> fc->max_write/fc->max_read and fc->max_pages. For virtiofs max_pages >> does not apply as ITER_KVEC is used. As virtiofs sets fc->max_read to >> UINT_MAX basically no limit is applied at all. > > Yes, what you said is just as expected but it is not the root cause of > the warning. The culprit of the warning is kmalloc() in > copy_args_to_argbuf() just as said in commit message. vmalloc() is also > not acceptable, because the physical memory needs to be contiguous. For > the problem, because there is no page involved, so there will be extra > sg available, maybe we can use these sg to break the big read/write > request into page. Hmm ok, I was hoping that contiguous memory is not needed. I see that ENOMEM is handled, but how that that perform (or even complete) on a really badly fragmented system? I guess splitting into smaller pages or at least adding some reserve kmem_cache (or even mempool) would make sense? >> >> I also wonder if it wouldn't it make sense to set a sensible limit in >> virtio_fs_ctx_set_defaults() instead of introducing a new variable? > > As said in the commit message: > > A feasible solution is to limit the value of max_read for virtiofs, so > the length passed to kmalloc() will be limited. However it will affects > the max read size for ITER_IOVEC io and the value of max_write also needs > limitation. > > It is a bit hard to set a reasonable value for both max_read and > max_write to handle both normal ITER_IOVEC io and ITER_KVEC io. And > considering ITER_KVEC io + dio case is uncommon, I think using a new > limitation is more reasonable. For ITER_IOVEC max_pages applies - which is limited to FUSE_MAX_MAX_PAGES - why can't this be used in virtio_fs_ctx_set_defaults? @Miklos, is there a reason why there is no upper fc->max_{read,write} limit in process_init_reply()? Shouldn't both be limited to (FUSE_MAX_MAX_PAGES * PAGE_SIZE). Or any other reasonable limit? Thanks, Bernd >> >> Also, I guess the issue is kmalloc_array() in virtio_fs_enqueue_req? >> Wouldn't it make sense to use kvm_alloc_array/kvfree in that function? >> >> >> Thanks, >> Bernd >> >> >>> 10MB is passed to copy_args_to_argbuf(), kmalloc() is called in turn >>> with len=10MB, and triggers the warning in __alloc_pages(): >>> WARN_ON_ONCE_GFP(order > MAX_ORDER, gfp)). >>> >>> A feasible solution is to limit the value of max_read for virtiofs, so >>> the length passed to kmalloc() will be limited. However it will affects >>> the max read size for ITER_IOVEC io and the value of max_write also >>> needs >>> limitation. So instead of limiting the values of max_read and max_write, >>> introducing max_nopage_rw to cap both the values of max_read and >>> max_write when the fuse dio read/write request is initiated from kernel. >>> >>> Considering that fuse read/write request from kernel is uncommon and to >>> decrease the demand for large contiguous pages, set max_nopage_rw as >>> 256KB instead of KMALLOC_MAX_SIZE - 4096 or similar. >>> >>> Fixes: a62a8ef9d97d ("virtio-fs: add virtiofs filesystem") >>> Signed-off-by: Hou Tao <houtao1@huawei.com> >>> --- >>> fs/fuse/file.c | 12 +++++++++++- >>> fs/fuse/fuse_i.h | 3 +++ >>> fs/fuse/inode.c | 1 + >>> fs/fuse/virtio_fs.c | 6 ++++++ >>> 4 files changed, 21 insertions(+), 1 deletion(-) >>> >>> diff --git a/fs/fuse/file.c b/fs/fuse/file.c >>> index a660f1f21540..f1beb7c0b782 100644 >>> --- a/fs/fuse/file.c >>> +++ b/fs/fuse/file.c >>> @@ -1422,6 +1422,16 @@ static int fuse_get_user_pages(struct >>> fuse_args_pages *ap, struct iov_iter *ii, >>> return ret < 0 ? ret : 0; >>> } >>> +static size_t fuse_max_dio_rw_size(const struct fuse_conn *fc, >>> + const struct iov_iter *iter, int write) >>> +{ >>> + unsigned int nmax = write ? fc->max_write : fc->max_read; >>> + >>> + if (iov_iter_is_kvec(iter)) >>> + nmax = min(nmax, fc->max_nopage_rw); >>> + return nmax; >>> +} >>> + >>> ssize_t fuse_direct_io(struct fuse_io_priv *io, struct iov_iter *iter, >>> loff_t *ppos, int flags) >>> { >>> @@ -1432,7 +1442,7 @@ ssize_t fuse_direct_io(struct fuse_io_priv *io, >>> struct iov_iter *iter, >>> struct inode *inode = mapping->host; >>> struct fuse_file *ff = file->private_data; >>> struct fuse_conn *fc = ff->fm->fc; >>> - size_t nmax = write ? fc->max_write : fc->max_read; >>> + size_t nmax = fuse_max_dio_rw_size(fc, iter, write); >>> loff_t pos = *ppos; >>> size_t count = iov_iter_count(iter); >>> pgoff_t idx_from = pos >> PAGE_SHIFT; >>> diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h >>> index 1df83eebda92..fc753cd34211 100644 >>> --- a/fs/fuse/fuse_i.h >>> +++ b/fs/fuse/fuse_i.h >>> @@ -594,6 +594,9 @@ struct fuse_conn { >>> /** Constrain ->max_pages to this value during feature >>> negotiation */ >>> unsigned int max_pages_limit; >>> + /** Maximum read/write size when there is no page in request */ >>> + unsigned int max_nopage_rw; >>> + >>> /** Input queue */ >>> struct fuse_iqueue iq; >>> diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c >>> index 2a6d44f91729..4cbbcb4a4b71 100644 >>> --- a/fs/fuse/inode.c >>> +++ b/fs/fuse/inode.c >>> @@ -923,6 +923,7 @@ void fuse_conn_init(struct fuse_conn *fc, struct >>> fuse_mount *fm, >>> fc->user_ns = get_user_ns(user_ns); >>> fc->max_pages = FUSE_DEFAULT_MAX_PAGES_PER_REQ; >>> fc->max_pages_limit = FUSE_MAX_MAX_PAGES; >>> + fc->max_nopage_rw = UINT_MAX; >>> INIT_LIST_HEAD(&fc->mounts); >>> list_add(&fm->fc_entry, &fc->mounts); >>> diff --git a/fs/fuse/virtio_fs.c b/fs/fuse/virtio_fs.c >>> index 5f1be1da92ce..3aac31d45198 100644 >>> --- a/fs/fuse/virtio_fs.c >>> +++ b/fs/fuse/virtio_fs.c >>> @@ -1452,6 +1452,12 @@ static int virtio_fs_get_tree(struct >>> fs_context *fsc) >>> /* Tell FUSE to split requests that exceed the virtqueue's size */ >>> fc->max_pages_limit = min_t(unsigned int, fc->max_pages_limit, >>> virtqueue_size - FUSE_HEADER_OVERHEAD); >>> + /* copy_args_to_argbuf() uses kmalloc-ed memory as bounce buffer >>> + * for fuse args, so limit the total size of these args to prevent >>> + * the warning in __alloc_pages() and decrease the demand for large >>> + * contiguous pages. >>> + */ >>> + fc->max_nopage_rw = min(fc->max_nopage_rw, 256U << 10); >>> fsc->s_fs_info = fm; >>> sb = sget_fc(fsc, virtio_fs_test_super, set_anon_super_fc); >> . > >
Hi, On 1/11/2024 6:34 AM, Bernd Schubert wrote: > > > On 1/10/24 02:16, Hou Tao wrote: >> Hi, >> >> On 1/9/2024 9:11 PM, Bernd Schubert wrote: >>> >>> >>> On 1/3/24 11:59, Hou Tao wrote: >>>> From: Hou Tao <houtao1@huawei.com> >>>> >>>> When trying to insert a 10MB kernel module kept in a virtiofs with >>>> cache >>>> disabled, the following warning was reported: >>>> >>>> ------------[ cut here ]------------ >>>> WARNING: CPU: 2 PID: 439 at mm/page_alloc.c:4544 ...... >>>> Modules linked in: >>>> CPU: 2 PID: 439 Comm: insmod Not tainted 6.7.0-rc7+ #33 >>>> Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), ...... >>>> RIP: 0010:__alloc_pages+0x2c4/0x360 >>>> ...... >>>> Call Trace: >>>> <TASK> >>>> ? __warn+0x8f/0x150 >>>> ? __alloc_pages+0x2c4/0x360 >>>> __kmalloc_large_node+0x86/0x160 >>>> __kmalloc+0xcd/0x140 >>>> virtio_fs_enqueue_req+0x240/0x6d0 >>>> virtio_fs_wake_pending_and_unlock+0x7f/0x190 >>>> queue_request_and_unlock+0x58/0x70 >>>> fuse_simple_request+0x18b/0x2e0 >>>> fuse_direct_io+0x58a/0x850 >>>> fuse_file_read_iter+0xdb/0x130 >>>> __kernel_read+0xf3/0x260 >>>> kernel_read+0x45/0x60 >>>> kernel_read_file+0x1ad/0x2b0 >>>> init_module_from_file+0x6a/0xe0 >>>> idempotent_init_module+0x179/0x230 >>>> __x64_sys_finit_module+0x5d/0xb0 >>>> do_syscall_64+0x36/0xb0 >>>> entry_SYSCALL_64_after_hwframe+0x6e/0x76 >>>> ...... >>>> </TASK> >>>> ---[ end trace 0000000000000000 ]--- >>>> >>>> The warning happened as follow. In copy_args_to_argbuf(), virtiofs >>>> uses >>>> kmalloc-ed memory as bound buffer for fuse args, but >>>> fuse_get_user_pages() only limits the length of fuse arg by >>>> max_read or >>>> max_write for IOV_KVEC io (e.g., kernel_read_file from >>>> finit_module()). >>>> For virtiofs, max_read is UINT_MAX, so a big read request which is >>>> about >>> >>> >>> I find this part of the explanation a bit confusing. I guess you >>> wanted to write something like >>> >>> fuse_direct_io() -> fuse_get_user_pages() is limited by >>> fc->max_write/fc->max_read and fc->max_pages. For virtiofs max_pages >>> does not apply as ITER_KVEC is used. As virtiofs sets fc->max_read to >>> UINT_MAX basically no limit is applied at all. >> >> Yes, what you said is just as expected but it is not the root cause of >> the warning. The culprit of the warning is kmalloc() in >> copy_args_to_argbuf() just as said in commit message. vmalloc() is also >> not acceptable, because the physical memory needs to be contiguous. For >> the problem, because there is no page involved, so there will be extra >> sg available, maybe we can use these sg to break the big read/write >> request into page. > > Hmm ok, I was hoping that contiguous memory is not needed. > I see that ENOMEM is handled, but how that that perform (or even > complete) on a really badly fragmented system? I guess splitting into > smaller pages or at least adding some reserve kmem_cache (or even > mempool) would make sense? I don't think using kmem_cache will help, because direct IO initiated from kernel (ITER_KVEC io) needs big and contiguous memory chunk. I have written a draft patch in which it breaks the ITER_KVEC chunk into pages, uses these pages to initialize extra sgs and passes it to virtiofsd. It works but it is a bit complicated and I am not sure whether it is worthy the complexity. Anyway, I will beautify it and post it as v2. > >>> >>> I also wonder if it wouldn't it make sense to set a sensible limit in >>> virtio_fs_ctx_set_defaults() instead of introducing a new variable? >> >> As said in the commit message: >> >> A feasible solution is to limit the value of max_read for virtiofs, so >> the length passed to kmalloc() will be limited. However it will affects >> the max read size for ITER_IOVEC io and the value of max_write also >> needs >> limitation. >> >> It is a bit hard to set a reasonable value for both max_read and >> max_write to handle both normal ITER_IOVEC io and ITER_KVEC io. And >> considering ITER_KVEC io + dio case is uncommon, I think using a new >> limitation is more reasonable. > > For ITER_IOVEC max_pages applies - which is limited to > FUSE_MAX_MAX_PAGES - why can't this be used in > virtio_fs_ctx_set_defaults? It won't help too much. Under x86-64, max_read will be 256 * 4KB = 1MB, so it will try to do kmalloc(1MB, GFP_ATOMIC) and I think it still creates too much memory pressure for the system. > > @Miklos, is there a reason why there is no upper fc->max_{read,write} > limit in process_init_reply()? Shouldn't both be limited to > (FUSE_MAX_MAX_PAGES * PAGE_SIZE). Or any other reasonable limit? It seems that for all other read/write requests beside ITER_IOVEC direct io, max_pages_limit is honored implicitly. > > > Thanks, > Bernd > > > >>> >>> Also, I guess the issue is kmalloc_array() in virtio_fs_enqueue_req? >>> Wouldn't it make sense to use kvm_alloc_array/kvfree in that function? >>> >>> >>> Thanks, >>> Bernd >>> >>> >>>> 10MB is passed to copy_args_to_argbuf(), kmalloc() is called in turn >>>> with len=10MB, and triggers the warning in __alloc_pages(): >>>> WARN_ON_ONCE_GFP(order > MAX_ORDER, gfp)). >>>> >>>> A feasible solution is to limit the value of max_read for virtiofs, so >>>> the length passed to kmalloc() will be limited. However it will >>>> affects >>>> the max read size for ITER_IOVEC io and the value of max_write also >>>> needs >>>> limitation. So instead of limiting the values of max_read and >>>> max_write, >>>> introducing max_nopage_rw to cap both the values of max_read and >>>> max_write when the fuse dio read/write request is initiated from >>>> kernel. >>>> >>>> Considering that fuse read/write request from kernel is uncommon >>>> and to >>>> decrease the demand for large contiguous pages, set max_nopage_rw as >>>> 256KB instead of KMALLOC_MAX_SIZE - 4096 or similar. >>>> >>>> Fixes: a62a8ef9d97d ("virtio-fs: add virtiofs filesystem") >>>> Signed-off-by: Hou Tao <houtao1@huawei.com> >>>> --- >>>> fs/fuse/file.c | 12 +++++++++++- >>>> fs/fuse/fuse_i.h | 3 +++ >>>> fs/fuse/inode.c | 1 + >>>> fs/fuse/virtio_fs.c | 6 ++++++ >>>> 4 files changed, 21 insertions(+), 1 deletion(-) >>>> >>>> diff --git a/fs/fuse/file.c b/fs/fuse/file.c >>>> index a660f1f21540..f1beb7c0b782 100644 >>>> --- a/fs/fuse/file.c >>>> +++ b/fs/fuse/file.c >>>> @@ -1422,6 +1422,16 @@ static int fuse_get_user_pages(struct >>>> fuse_args_pages *ap, struct iov_iter *ii, >>>> return ret < 0 ? ret : 0; >>>> } >>>> +static size_t fuse_max_dio_rw_size(const struct fuse_conn *fc, >>>> + const struct iov_iter *iter, int write) >>>> +{ >>>> + unsigned int nmax = write ? fc->max_write : fc->max_read; >>>> + >>>> + if (iov_iter_is_kvec(iter)) >>>> + nmax = min(nmax, fc->max_nopage_rw); >>>> + return nmax; >>>> +} >>>> + >>>> ssize_t fuse_direct_io(struct fuse_io_priv *io, struct iov_iter >>>> *iter, >>>> loff_t *ppos, int flags) >>>> { >>>> @@ -1432,7 +1442,7 @@ ssize_t fuse_direct_io(struct fuse_io_priv *io, >>>> struct iov_iter *iter, >>>> struct inode *inode = mapping->host; >>>> struct fuse_file *ff = file->private_data; >>>> struct fuse_conn *fc = ff->fm->fc; >>>> - size_t nmax = write ? fc->max_write : fc->max_read; >>>> + size_t nmax = fuse_max_dio_rw_size(fc, iter, write); >>>> loff_t pos = *ppos; >>>> size_t count = iov_iter_count(iter); >>>> pgoff_t idx_from = pos >> PAGE_SHIFT; >>>> diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h >>>> index 1df83eebda92..fc753cd34211 100644 >>>> --- a/fs/fuse/fuse_i.h >>>> +++ b/fs/fuse/fuse_i.h >>>> @@ -594,6 +594,9 @@ struct fuse_conn { >>>> /** Constrain ->max_pages to this value during feature >>>> negotiation */ >>>> unsigned int max_pages_limit; >>>> + /** Maximum read/write size when there is no page in >>>> request */ >>>> + unsigned int max_nopage_rw; >>>> + >>>> /** Input queue */ >>>> struct fuse_iqueue iq; >>>> diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c >>>> index 2a6d44f91729..4cbbcb4a4b71 100644 >>>> --- a/fs/fuse/inode.c >>>> +++ b/fs/fuse/inode.c >>>> @@ -923,6 +923,7 @@ void fuse_conn_init(struct fuse_conn *fc, struct >>>> fuse_mount *fm, >>>> fc->user_ns = get_user_ns(user_ns); >>>> fc->max_pages = FUSE_DEFAULT_MAX_PAGES_PER_REQ; >>>> fc->max_pages_limit = FUSE_MAX_MAX_PAGES; >>>> + fc->max_nopage_rw = UINT_MAX; >>>> INIT_LIST_HEAD(&fc->mounts); >>>> list_add(&fm->fc_entry, &fc->mounts); >>>> diff --git a/fs/fuse/virtio_fs.c b/fs/fuse/virtio_fs.c >>>> index 5f1be1da92ce..3aac31d45198 100644 >>>> --- a/fs/fuse/virtio_fs.c >>>> +++ b/fs/fuse/virtio_fs.c >>>> @@ -1452,6 +1452,12 @@ static int virtio_fs_get_tree(struct >>>> fs_context *fsc) >>>> /* Tell FUSE to split requests that exceed the virtqueue's >>>> size */ >>>> fc->max_pages_limit = min_t(unsigned int, fc->max_pages_limit, >>>> virtqueue_size - FUSE_HEADER_OVERHEAD); >>>> + /* copy_args_to_argbuf() uses kmalloc-ed memory as bounce buffer >>>> + * for fuse args, so limit the total size of these args to >>>> prevent >>>> + * the warning in __alloc_pages() and decrease the demand for >>>> large >>>> + * contiguous pages. >>>> + */ >>>> + fc->max_nopage_rw = min(fc->max_nopage_rw, 256U << 10); >>>> fsc->s_fs_info = fm; >>>> sb = sget_fc(fsc, virtio_fs_test_super, set_anon_super_fc); >>> . >> >>
On Wed, Jan 03, 2024 at 06:59:29PM +0800, Hou Tao wrote: > From: Hou Tao <houtao1@huawei.com> > > When trying to insert a 10MB kernel module kept in a virtiofs with cache > disabled, the following warning was reported: > > ------------[ cut here ]------------ > WARNING: CPU: 2 PID: 439 at mm/page_alloc.c:4544 ...... > Modules linked in: > CPU: 2 PID: 439 Comm: insmod Not tainted 6.7.0-rc7+ #33 > Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), ...... > RIP: 0010:__alloc_pages+0x2c4/0x360 > ...... > Call Trace: > <TASK> > ? __warn+0x8f/0x150 > ? __alloc_pages+0x2c4/0x360 > __kmalloc_large_node+0x86/0x160 > __kmalloc+0xcd/0x140 > virtio_fs_enqueue_req+0x240/0x6d0 > virtio_fs_wake_pending_and_unlock+0x7f/0x190 > queue_request_and_unlock+0x58/0x70 > fuse_simple_request+0x18b/0x2e0 > fuse_direct_io+0x58a/0x850 > fuse_file_read_iter+0xdb/0x130 > __kernel_read+0xf3/0x260 > kernel_read+0x45/0x60 > kernel_read_file+0x1ad/0x2b0 > init_module_from_file+0x6a/0xe0 > idempotent_init_module+0x179/0x230 > __x64_sys_finit_module+0x5d/0xb0 > do_syscall_64+0x36/0xb0 > entry_SYSCALL_64_after_hwframe+0x6e/0x76 > ...... > </TASK> > ---[ end trace 0000000000000000 ]--- > > The warning happened as follow. In copy_args_to_argbuf(), virtiofs uses > kmalloc-ed memory as bound buffer for fuse args, but > fuse_get_user_pages() only limits the length of fuse arg by max_read or > max_write for IOV_KVEC io (e.g., kernel_read_file from finit_module()). > For virtiofs, max_read is UINT_MAX, so a big read request which is about > 10MB is passed to copy_args_to_argbuf(), kmalloc() is called in turn > with len=10MB, and triggers the warning in __alloc_pages(): > WARN_ON_ONCE_GFP(order > MAX_ORDER, gfp)). > > A feasible solution is to limit the value of max_read for virtiofs, so > the length passed to kmalloc() will be limited. However it will affects > the max read size for ITER_IOVEC io and the value of max_write also needs > limitation. So instead of limiting the values of max_read and max_write, > introducing max_nopage_rw to cap both the values of max_read and > max_write when the fuse dio read/write request is initiated from kernel. > > Considering that fuse read/write request from kernel is uncommon and to > decrease the demand for large contiguous pages, set max_nopage_rw as > 256KB instead of KMALLOC_MAX_SIZE - 4096 or similar. > > Fixes: a62a8ef9d97d ("virtio-fs: add virtiofs filesystem") > Signed-off-by: Hou Tao <houtao1@huawei.com> So what should I do with this patch? It includes fuse changes but of course I can merge too if no one wants to bother either way... > --- > fs/fuse/file.c | 12 +++++++++++- > fs/fuse/fuse_i.h | 3 +++ > fs/fuse/inode.c | 1 + > fs/fuse/virtio_fs.c | 6 ++++++ > 4 files changed, 21 insertions(+), 1 deletion(-) > > diff --git a/fs/fuse/file.c b/fs/fuse/file.c > index a660f1f21540..f1beb7c0b782 100644 > --- a/fs/fuse/file.c > +++ b/fs/fuse/file.c > @@ -1422,6 +1422,16 @@ static int fuse_get_user_pages(struct fuse_args_pages *ap, struct iov_iter *ii, > return ret < 0 ? ret : 0; > } > > +static size_t fuse_max_dio_rw_size(const struct fuse_conn *fc, > + const struct iov_iter *iter, int write) > +{ > + unsigned int nmax = write ? fc->max_write : fc->max_read; > + > + if (iov_iter_is_kvec(iter)) > + nmax = min(nmax, fc->max_nopage_rw); > + return nmax; > +} > + > ssize_t fuse_direct_io(struct fuse_io_priv *io, struct iov_iter *iter, > loff_t *ppos, int flags) > { > @@ -1432,7 +1442,7 @@ ssize_t fuse_direct_io(struct fuse_io_priv *io, struct iov_iter *iter, > struct inode *inode = mapping->host; > struct fuse_file *ff = file->private_data; > struct fuse_conn *fc = ff->fm->fc; > - size_t nmax = write ? fc->max_write : fc->max_read; > + size_t nmax = fuse_max_dio_rw_size(fc, iter, write); > loff_t pos = *ppos; > size_t count = iov_iter_count(iter); > pgoff_t idx_from = pos >> PAGE_SHIFT; > diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h > index 1df83eebda92..fc753cd34211 100644 > --- a/fs/fuse/fuse_i.h > +++ b/fs/fuse/fuse_i.h > @@ -594,6 +594,9 @@ struct fuse_conn { > /** Constrain ->max_pages to this value during feature negotiation */ > unsigned int max_pages_limit; > > + /** Maximum read/write size when there is no page in request */ > + unsigned int max_nopage_rw; > + > /** Input queue */ > struct fuse_iqueue iq; > > diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c > index 2a6d44f91729..4cbbcb4a4b71 100644 > --- a/fs/fuse/inode.c > +++ b/fs/fuse/inode.c > @@ -923,6 +923,7 @@ void fuse_conn_init(struct fuse_conn *fc, struct fuse_mount *fm, > fc->user_ns = get_user_ns(user_ns); > fc->max_pages = FUSE_DEFAULT_MAX_PAGES_PER_REQ; > fc->max_pages_limit = FUSE_MAX_MAX_PAGES; > + fc->max_nopage_rw = UINT_MAX; > > INIT_LIST_HEAD(&fc->mounts); > list_add(&fm->fc_entry, &fc->mounts); > diff --git a/fs/fuse/virtio_fs.c b/fs/fuse/virtio_fs.c > index 5f1be1da92ce..3aac31d45198 100644 > --- a/fs/fuse/virtio_fs.c > +++ b/fs/fuse/virtio_fs.c > @@ -1452,6 +1452,12 @@ static int virtio_fs_get_tree(struct fs_context *fsc) > /* Tell FUSE to split requests that exceed the virtqueue's size */ > fc->max_pages_limit = min_t(unsigned int, fc->max_pages_limit, > virtqueue_size - FUSE_HEADER_OVERHEAD); > + /* copy_args_to_argbuf() uses kmalloc-ed memory as bounce buffer > + * for fuse args, so limit the total size of these args to prevent > + * the warning in __alloc_pages() and decrease the demand for large > + * contiguous pages. > + */ > + fc->max_nopage_rw = min(fc->max_nopage_rw, 256U << 10); > > fsc->s_fs_info = fm; > sb = sget_fc(fsc, virtio_fs_test_super, set_anon_super_fc); > -- > 2.29.2
Hi, On 2/23/2024 3:49 AM, Michael S. Tsirkin wrote: > On Wed, Jan 03, 2024 at 06:59:29PM +0800, Hou Tao wrote: >> From: Hou Tao <houtao1@huawei.com> >> >> When trying to insert a 10MB kernel module kept in a virtiofs with cache >> disabled, the following warning was reported: >> SNIP >> >> A feasible solution is to limit the value of max_read for virtiofs, so >> the length passed to kmalloc() will be limited. However it will affects >> the max read size for ITER_IOVEC io and the value of max_write also needs >> limitation. So instead of limiting the values of max_read and max_write, >> introducing max_nopage_rw to cap both the values of max_read and >> max_write when the fuse dio read/write request is initiated from kernel. >> >> Considering that fuse read/write request from kernel is uncommon and to >> decrease the demand for large contiguous pages, set max_nopage_rw as >> 256KB instead of KMALLOC_MAX_SIZE - 4096 or similar. >> >> Fixes: a62a8ef9d97d ("virtio-fs: add virtiofs filesystem") >> Signed-off-by: Hou Tao <houtao1@huawei.com> > > So what should I do with this patch? It includes fuse changes > but of course I can merge too if no one wants to bother either way... The patch had got some feedback from Bernd Schubert . And I will post v2 before next Thursday.
On Wed, 3 Jan 2024 at 11:58, Hou Tao <houtao@huaweicloud.com> wrote: > > From: Hou Tao <houtao1@huawei.com> > > When trying to insert a 10MB kernel module kept in a virtiofs with cache > disabled, the following warning was reported: > > ------------[ cut here ]------------ > WARNING: CPU: 2 PID: 439 at mm/page_alloc.c:4544 ...... > Modules linked in: > CPU: 2 PID: 439 Comm: insmod Not tainted 6.7.0-rc7+ #33 > Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), ...... > RIP: 0010:__alloc_pages+0x2c4/0x360 > ...... > Call Trace: > <TASK> > ? __warn+0x8f/0x150 > ? __alloc_pages+0x2c4/0x360 > __kmalloc_large_node+0x86/0x160 > __kmalloc+0xcd/0x140 > virtio_fs_enqueue_req+0x240/0x6d0 > virtio_fs_wake_pending_and_unlock+0x7f/0x190 > queue_request_and_unlock+0x58/0x70 > fuse_simple_request+0x18b/0x2e0 > fuse_direct_io+0x58a/0x850 > fuse_file_read_iter+0xdb/0x130 > __kernel_read+0xf3/0x260 > kernel_read+0x45/0x60 > kernel_read_file+0x1ad/0x2b0 > init_module_from_file+0x6a/0xe0 > idempotent_init_module+0x179/0x230 > __x64_sys_finit_module+0x5d/0xb0 > do_syscall_64+0x36/0xb0 > entry_SYSCALL_64_after_hwframe+0x6e/0x76 > ...... > </TASK> > ---[ end trace 0000000000000000 ]--- > > The warning happened as follow. In copy_args_to_argbuf(), virtiofs uses > kmalloc-ed memory as bound buffer for fuse args, but So this seems to be the special case in fuse_get_user_pages() when the read/write requests get a piece of kernel memory. I don't really understand the comment in virtio_fs_enqueue_req(): /* Use a bounce buffer since stack args cannot be mapped */ Stefan, can you explain? What's special about the arg being on the stack? What if the arg is not on the stack (as is probably the case for big args like this)? Do we need the bounce buffer in that case? Thanks, Miklos
Hi, On 2/23/2024 5:42 PM, Miklos Szeredi wrote: > On Wed, 3 Jan 2024 at 11:58, Hou Tao <houtao@huaweicloud.com> wrote: >> From: Hou Tao <houtao1@huawei.com> >> >> When trying to insert a 10MB kernel module kept in a virtiofs with cache >> disabled, the following warning was reported: >> >> ------------[ cut here ]------------ >> WARNING: CPU: 2 PID: 439 at mm/page_alloc.c:4544 ...... >> Modules linked in: >> CPU: 2 PID: 439 Comm: insmod Not tainted 6.7.0-rc7+ #33 >> Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), ...... >> RIP: 0010:__alloc_pages+0x2c4/0x360 >> ...... >> Call Trace: >> <TASK> >> ? __warn+0x8f/0x150 >> ? __alloc_pages+0x2c4/0x360 >> __kmalloc_large_node+0x86/0x160 >> __kmalloc+0xcd/0x140 >> virtio_fs_enqueue_req+0x240/0x6d0 >> virtio_fs_wake_pending_and_unlock+0x7f/0x190 >> queue_request_and_unlock+0x58/0x70 >> fuse_simple_request+0x18b/0x2e0 >> fuse_direct_io+0x58a/0x850 >> fuse_file_read_iter+0xdb/0x130 >> __kernel_read+0xf3/0x260 >> kernel_read+0x45/0x60 >> kernel_read_file+0x1ad/0x2b0 >> init_module_from_file+0x6a/0xe0 >> idempotent_init_module+0x179/0x230 >> __x64_sys_finit_module+0x5d/0xb0 >> do_syscall_64+0x36/0xb0 >> entry_SYSCALL_64_after_hwframe+0x6e/0x76 >> ...... >> </TASK> >> ---[ end trace 0000000000000000 ]--- >> >> The warning happened as follow. In copy_args_to_argbuf(), virtiofs uses >> kmalloc-ed memory as bound buffer for fuse args, but > So this seems to be the special case in fuse_get_user_pages() when the > read/write requests get a piece of kernel memory. > > I don't really understand the comment in virtio_fs_enqueue_req(): /* > Use a bounce buffer since stack args cannot be mapped */ > > Stefan, can you explain? What's special about the arg being on the stack? > > What if the arg is not on the stack (as is probably the case for big > args like this)? Do we need the bounce buffer in that case? I will try to answer these two questions. Correct me if I am wrong. The main reason for the bounce buffer is that virtiofs passes a scatter list to the virtiofsd through virtio eventually, so it needs to get the page (namely struct page) for these args. If the arg is placed in the stack, there is no way to get the page. For ITER_KVEC dio mentioned in the patch, the data buffer is still allocated through vmalloc(), so the bounce buffer is still necessary. > > Thanks, > Miklos
On Fri, Feb 23, 2024 at 10:42:37AM +0100, Miklos Szeredi wrote: > On Wed, 3 Jan 2024 at 11:58, Hou Tao <houtao@huaweicloud.com> wrote: > > > > From: Hou Tao <houtao1@huawei.com> > > > > When trying to insert a 10MB kernel module kept in a virtiofs with cache > > disabled, the following warning was reported: > > > > ------------[ cut here ]------------ > > WARNING: CPU: 2 PID: 439 at mm/page_alloc.c:4544 ...... > > Modules linked in: > > CPU: 2 PID: 439 Comm: insmod Not tainted 6.7.0-rc7+ #33 > > Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), ...... > > RIP: 0010:__alloc_pages+0x2c4/0x360 > > ...... > > Call Trace: > > <TASK> > > ? __warn+0x8f/0x150 > > ? __alloc_pages+0x2c4/0x360 > > __kmalloc_large_node+0x86/0x160 > > __kmalloc+0xcd/0x140 > > virtio_fs_enqueue_req+0x240/0x6d0 > > virtio_fs_wake_pending_and_unlock+0x7f/0x190 > > queue_request_and_unlock+0x58/0x70 > > fuse_simple_request+0x18b/0x2e0 > > fuse_direct_io+0x58a/0x850 > > fuse_file_read_iter+0xdb/0x130 > > __kernel_read+0xf3/0x260 > > kernel_read+0x45/0x60 > > kernel_read_file+0x1ad/0x2b0 > > init_module_from_file+0x6a/0xe0 > > idempotent_init_module+0x179/0x230 > > __x64_sys_finit_module+0x5d/0xb0 > > do_syscall_64+0x36/0xb0 > > entry_SYSCALL_64_after_hwframe+0x6e/0x76 > > ...... > > </TASK> > > ---[ end trace 0000000000000000 ]--- > > > > The warning happened as follow. In copy_args_to_argbuf(), virtiofs uses > > kmalloc-ed memory as bound buffer for fuse args, but > > So this seems to be the special case in fuse_get_user_pages() when the > read/write requests get a piece of kernel memory. > > I don't really understand the comment in virtio_fs_enqueue_req(): /* > Use a bounce buffer since stack args cannot be mapped */ > > Stefan, can you explain? What's special about the arg being on the stack? virtio core wants DMA'able addresses. See Documentation/core-api/dma-api-howto.rst : ... This rule also means that you may use neither kernel image addresses (items in data/text/bss segments), nor module image addresses, nor stack addresses for DMA. > What if the arg is not on the stack (as is probably the case for big > args like this)? Do we need the bounce buffer in that case? > > Thanks, > Miklos
diff --git a/fs/fuse/file.c b/fs/fuse/file.c index a660f1f21540..f1beb7c0b782 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -1422,6 +1422,16 @@ static int fuse_get_user_pages(struct fuse_args_pages *ap, struct iov_iter *ii, return ret < 0 ? ret : 0; } +static size_t fuse_max_dio_rw_size(const struct fuse_conn *fc, + const struct iov_iter *iter, int write) +{ + unsigned int nmax = write ? fc->max_write : fc->max_read; + + if (iov_iter_is_kvec(iter)) + nmax = min(nmax, fc->max_nopage_rw); + return nmax; +} + ssize_t fuse_direct_io(struct fuse_io_priv *io, struct iov_iter *iter, loff_t *ppos, int flags) { @@ -1432,7 +1442,7 @@ ssize_t fuse_direct_io(struct fuse_io_priv *io, struct iov_iter *iter, struct inode *inode = mapping->host; struct fuse_file *ff = file->private_data; struct fuse_conn *fc = ff->fm->fc; - size_t nmax = write ? fc->max_write : fc->max_read; + size_t nmax = fuse_max_dio_rw_size(fc, iter, write); loff_t pos = *ppos; size_t count = iov_iter_count(iter); pgoff_t idx_from = pos >> PAGE_SHIFT; diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index 1df83eebda92..fc753cd34211 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -594,6 +594,9 @@ struct fuse_conn { /** Constrain ->max_pages to this value during feature negotiation */ unsigned int max_pages_limit; + /** Maximum read/write size when there is no page in request */ + unsigned int max_nopage_rw; + /** Input queue */ struct fuse_iqueue iq; diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index 2a6d44f91729..4cbbcb4a4b71 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -923,6 +923,7 @@ void fuse_conn_init(struct fuse_conn *fc, struct fuse_mount *fm, fc->user_ns = get_user_ns(user_ns); fc->max_pages = FUSE_DEFAULT_MAX_PAGES_PER_REQ; fc->max_pages_limit = FUSE_MAX_MAX_PAGES; + fc->max_nopage_rw = UINT_MAX; INIT_LIST_HEAD(&fc->mounts); list_add(&fm->fc_entry, &fc->mounts); diff --git a/fs/fuse/virtio_fs.c b/fs/fuse/virtio_fs.c index 5f1be1da92ce..3aac31d45198 100644 --- a/fs/fuse/virtio_fs.c +++ b/fs/fuse/virtio_fs.c @@ -1452,6 +1452,12 @@ static int virtio_fs_get_tree(struct fs_context *fsc) /* Tell FUSE to split requests that exceed the virtqueue's size */ fc->max_pages_limit = min_t(unsigned int, fc->max_pages_limit, virtqueue_size - FUSE_HEADER_OVERHEAD); + /* copy_args_to_argbuf() uses kmalloc-ed memory as bounce buffer + * for fuse args, so limit the total size of these args to prevent + * the warning in __alloc_pages() and decrease the demand for large + * contiguous pages. + */ + fc->max_nopage_rw = min(fc->max_nopage_rw, 256U << 10); fsc->s_fs_info = fm; sb = sget_fc(fsc, virtio_fs_test_super, set_anon_super_fc);