diff mbox series

[1/3] mm: Add kvrealloc()

Message ID 20210630061431.1750745-2-david@fromorbit.com (mailing list archive)
State New
Headers show
Series mm, xfs: memory allocation improvements | expand

Commit Message

Dave Chinner June 30, 2021, 6:14 a.m. UTC
From: Dave Chinner <dchinner@redhat.com>

During log recovery of an XFS filesystem with 64kB directory
buffers, rebuilding a buffer split across two log records results
in a memory allocation warning from krealloc like this:

xfs filesystem being mounted at /mnt/scratch supports timestamps until 2038 (0x7fffffff)
XFS (dm-0): Unmounting Filesystem
XFS (dm-0): Mounting V5 Filesystem
XFS (dm-0): Starting recovery (logdev: internal)
------------[ cut here ]------------
WARNING: CPU: 5 PID: 3435170 at mm/page_alloc.c:3539 get_page_from_freelist+0xdee/0xe40
.....
RIP: 0010:get_page_from_freelist+0xdee/0xe40
Call Trace:
 ? complete+0x3f/0x50
 __alloc_pages+0x16f/0x300
 alloc_pages+0x87/0x110
 kmalloc_order+0x2c/0x90
 kmalloc_order_trace+0x1d/0x90
 __kmalloc_track_caller+0x215/0x270
 ? xlog_recover_add_to_cont_trans+0x63/0x1f0
 krealloc+0x54/0xb0
 xlog_recover_add_to_cont_trans+0x63/0x1f0
 xlog_recovery_process_trans+0xc1/0xd0
 xlog_recover_process_ophdr+0x86/0x130
 xlog_recover_process_data+0x9f/0x160
 xlog_recover_process+0xa2/0x120
 xlog_do_recovery_pass+0x40b/0x7d0
 ? __irq_work_queue_local+0x4f/0x60
 ? irq_work_queue+0x3a/0x50
 xlog_do_log_recovery+0x70/0x150
 xlog_do_recover+0x38/0x1d0
 xlog_recover+0xd8/0x170
 xfs_log_mount+0x181/0x300
 xfs_mountfs+0x4a1/0x9b0
 xfs_fs_fill_super+0x3c0/0x7b0
 get_tree_bdev+0x171/0x270
 ? suffix_kstrtoint.constprop.0+0xf0/0xf0
 xfs_fs_get_tree+0x15/0x20
 vfs_get_tree+0x24/0xc0
 path_mount+0x2f5/0xaf0
 __x64_sys_mount+0x108/0x140
 do_syscall_64+0x3a/0x70
 entry_SYSCALL_64_after_hwframe+0x44/0xae

Essentially, we are taking a multi-order allocation from kmem_alloc()
(which has an open coded no fail, no warn loop) and then
reallocating it out to 64kB using krealloc(__GFP_NOFAIL) and that is
then triggering the above warning.

This is a regression caused by converting this code from an open
coded no fail/no warn reallocation loop to using __GFP_NOFAIL.

What we actually need here is kvrealloc(), so that if contiguous
page allocation fails we fall back to vmalloc() and we don't
get nasty warnings happening in XFS.

Fixes: 771915c4f688 ("xfs: remove kmem_realloc()")
Signed-off-by: Dave Chinner <dchinner@redhat.com>
---
 fs/xfs/xfs_log_recover.c |  2 +-
 include/linux/mm.h       |  2 ++
 mm/util.c                | 15 +++++++++++++++
 3 files changed, 18 insertions(+), 1 deletion(-)

Comments

Mel Gorman June 30, 2021, 10:04 a.m. UTC | #1
On Wed, Jun 30, 2021 at 04:14:29PM +1000, Dave Chinner wrote:
> From: Dave Chinner <dchinner@redhat.com>
> 
> During log recovery of an XFS filesystem with 64kB directory
> buffers, rebuilding a buffer split across two log records results
> in a memory allocation warning from krealloc like this:
> 
> xfs filesystem being mounted at /mnt/scratch supports timestamps until 2038 (0x7fffffff)
> XFS (dm-0): Unmounting Filesystem
> XFS (dm-0): Mounting V5 Filesystem
> XFS (dm-0): Starting recovery (logdev: internal)
> ------------[ cut here ]------------
> WARNING: CPU: 5 PID: 3435170 at mm/page_alloc.c:3539 get_page_from_freelist+0xdee/0xe40
> .....
> RIP: 0010:get_page_from_freelist+0xdee/0xe40
> Call Trace:
>  ? complete+0x3f/0x50
>  __alloc_pages+0x16f/0x300
>  alloc_pages+0x87/0x110
>  kmalloc_order+0x2c/0x90
>  kmalloc_order_trace+0x1d/0x90
>  __kmalloc_track_caller+0x215/0x270
>  ? xlog_recover_add_to_cont_trans+0x63/0x1f0
>  krealloc+0x54/0xb0
>  xlog_recover_add_to_cont_trans+0x63/0x1f0
>  xlog_recovery_process_trans+0xc1/0xd0
>  xlog_recover_process_ophdr+0x86/0x130
>  xlog_recover_process_data+0x9f/0x160
>  xlog_recover_process+0xa2/0x120
>  xlog_do_recovery_pass+0x40b/0x7d0
>  ? __irq_work_queue_local+0x4f/0x60
>  ? irq_work_queue+0x3a/0x50
>  xlog_do_log_recovery+0x70/0x150
>  xlog_do_recover+0x38/0x1d0
>  xlog_recover+0xd8/0x170
>  xfs_log_mount+0x181/0x300
>  xfs_mountfs+0x4a1/0x9b0
>  xfs_fs_fill_super+0x3c0/0x7b0
>  get_tree_bdev+0x171/0x270
>  ? suffix_kstrtoint.constprop.0+0xf0/0xf0
>  xfs_fs_get_tree+0x15/0x20
>  vfs_get_tree+0x24/0xc0
>  path_mount+0x2f5/0xaf0
>  __x64_sys_mount+0x108/0x140
>  do_syscall_64+0x3a/0x70
>  entry_SYSCALL_64_after_hwframe+0x44/0xae
> 
> Essentially, we are taking a multi-order allocation from kmem_alloc()
> (which has an open coded no fail, no warn loop) and then
> reallocating it out to 64kB using krealloc(__GFP_NOFAIL) and that is
> then triggering the above warning.
> 
> This is a regression caused by converting this code from an open
> coded no fail/no warn reallocation loop to using __GFP_NOFAIL.
> 
> What we actually need here is kvrealloc(), so that if contiguous
> page allocation fails we fall back to vmalloc() and we don't
> get nasty warnings happening in XFS.
> 
> Fixes: 771915c4f688 ("xfs: remove kmem_realloc()")
> Signed-off-by: Dave Chinner <dchinner@redhat.com>
> ---
>  fs/xfs/xfs_log_recover.c |  2 +-
>  include/linux/mm.h       |  2 ++
>  mm/util.c                | 15 +++++++++++++++
>  3 files changed, 18 insertions(+), 1 deletion(-)
> 
> diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
> index 1721fce2ec94..fee4fbadea0a 100644
> --- a/fs/xfs/xfs_log_recover.c
> +++ b/fs/xfs/xfs_log_recover.c
> @@ -2062,7 +2062,7 @@ xlog_recover_add_to_cont_trans(
>  	old_ptr = item->ri_buf[item->ri_cnt-1].i_addr;
>  	old_len = item->ri_buf[item->ri_cnt-1].i_len;
>  
> -	ptr = krealloc(old_ptr, len + old_len, GFP_KERNEL | __GFP_NOFAIL);
> +	ptr = kvrealloc(old_ptr, old_len, len + old_len, GFP_KERNEL);
>  	memcpy(&ptr[old_len], dp, len);
>  	item->ri_buf[item->ri_cnt-1].i_len += len;
>  	item->ri_buf[item->ri_cnt-1].i_addr = ptr;

While this is an improvement, note that this still potentially fail if,
for example, a vm_struct cannot be allocated for the vmalloc area,
a virtual range is not available, or the pages cannot be allocated
to back the vmalloc range. The last one is potentially problematic if
you look at __vmalloc_node_range and __vmalloc_area_node. vmalloc has
changed a lot since I last familiar with the code but I think it should
not attempt a high-order allocation for 64K but __vmalloc_area_node will
call alloc_pages_node with a GFP mask without __GFP_NOFAIL. In most cases,
it may still succeed but it could fail if the system is OOM. Did I miss
something?

I didn't do a full audit to determine what fallout, if any, there is
to ultimately passing __GFP_NOFAIL to kvmalloc although kvmalloc_node
explicitly notes that __GFP_NOFAIL is not supported. Adding Michal Hocko
to the cc to see if he remembers why __GFP_NOFAIL was problematic.

Absent being able to pass in __GFP_NOFAIL to kvmalloc_node, the new
helper kvrealloc may need to understand __GFP_NOFAIL, avoid passing it
to kvmalloc and indefintiely retry instead to avoid an XFS log recovery
hitting a NULL pointer exception when the memcpy is tried :(
Michal Hocko June 30, 2021, 12:05 p.m. UTC | #2
On Wed 30-06-21 11:04:55, Mel Gorman wrote:
[...]
> I didn't do a full audit to determine what fallout, if any, there is
> to ultimately passing __GFP_NOFAIL to kvmalloc although kvmalloc_node
> explicitly notes that __GFP_NOFAIL is not supported. Adding Michal Hocko
> to the cc to see if he remembers why __GFP_NOFAIL was problematic.

This is because there are allocations in the vmalloc path which do not
get the full gfp context. E.g. page table allocations. This could be
likely handled somewhere at the vmalloc layer and retry but I do not
remember anybody would be really requesting the support.
Darrick J. Wong June 30, 2021, 4:08 p.m. UTC | #3
On Wed, Jun 30, 2021 at 04:14:29PM +1000, Dave Chinner wrote:
> From: Dave Chinner <dchinner@redhat.com>
> 
> During log recovery of an XFS filesystem with 64kB directory
> buffers, rebuilding a buffer split across two log records results
> in a memory allocation warning from krealloc like this:
> 
> xfs filesystem being mounted at /mnt/scratch supports timestamps until 2038 (0x7fffffff)
> XFS (dm-0): Unmounting Filesystem
> XFS (dm-0): Mounting V5 Filesystem
> XFS (dm-0): Starting recovery (logdev: internal)
> ------------[ cut here ]------------
> WARNING: CPU: 5 PID: 3435170 at mm/page_alloc.c:3539 get_page_from_freelist+0xdee/0xe40
> .....
> RIP: 0010:get_page_from_freelist+0xdee/0xe40
> Call Trace:
>  ? complete+0x3f/0x50
>  __alloc_pages+0x16f/0x300
>  alloc_pages+0x87/0x110
>  kmalloc_order+0x2c/0x90
>  kmalloc_order_trace+0x1d/0x90
>  __kmalloc_track_caller+0x215/0x270
>  ? xlog_recover_add_to_cont_trans+0x63/0x1f0
>  krealloc+0x54/0xb0
>  xlog_recover_add_to_cont_trans+0x63/0x1f0
>  xlog_recovery_process_trans+0xc1/0xd0
>  xlog_recover_process_ophdr+0x86/0x130
>  xlog_recover_process_data+0x9f/0x160
>  xlog_recover_process+0xa2/0x120
>  xlog_do_recovery_pass+0x40b/0x7d0
>  ? __irq_work_queue_local+0x4f/0x60
>  ? irq_work_queue+0x3a/0x50
>  xlog_do_log_recovery+0x70/0x150
>  xlog_do_recover+0x38/0x1d0
>  xlog_recover+0xd8/0x170
>  xfs_log_mount+0x181/0x300
>  xfs_mountfs+0x4a1/0x9b0
>  xfs_fs_fill_super+0x3c0/0x7b0
>  get_tree_bdev+0x171/0x270
>  ? suffix_kstrtoint.constprop.0+0xf0/0xf0
>  xfs_fs_get_tree+0x15/0x20
>  vfs_get_tree+0x24/0xc0
>  path_mount+0x2f5/0xaf0
>  __x64_sys_mount+0x108/0x140
>  do_syscall_64+0x3a/0x70
>  entry_SYSCALL_64_after_hwframe+0x44/0xae
> 
> Essentially, we are taking a multi-order allocation from kmem_alloc()
> (which has an open coded no fail, no warn loop) and then
> reallocating it out to 64kB using krealloc(__GFP_NOFAIL) and that is
> then triggering the above warning.
> 
> This is a regression caused by converting this code from an open
> coded no fail/no warn reallocation loop to using __GFP_NOFAIL.
> 
> What we actually need here is kvrealloc(), so that if contiguous
> page allocation fails we fall back to vmalloc() and we don't
> get nasty warnings happening in XFS.
> 
> Fixes: 771915c4f688 ("xfs: remove kmem_realloc()")
> Signed-off-by: Dave Chinner <dchinner@redhat.com>
> ---
>  fs/xfs/xfs_log_recover.c |  2 +-
>  include/linux/mm.h       |  2 ++
>  mm/util.c                | 15 +++++++++++++++
>  3 files changed, 18 insertions(+), 1 deletion(-)
> 
> diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
> index 1721fce2ec94..fee4fbadea0a 100644
> --- a/fs/xfs/xfs_log_recover.c
> +++ b/fs/xfs/xfs_log_recover.c
> @@ -2062,7 +2062,7 @@ xlog_recover_add_to_cont_trans(
>  	old_ptr = item->ri_buf[item->ri_cnt-1].i_addr;
>  	old_len = item->ri_buf[item->ri_cnt-1].i_len;
>  
> -	ptr = krealloc(old_ptr, len + old_len, GFP_KERNEL | __GFP_NOFAIL);
> +	ptr = kvrealloc(old_ptr, old_len, len + old_len, GFP_KERNEL);

kvrealloc can return null, so this needs to check for that and -ENOMEM,
right?  It'll suck that log recovery fails, but such is life.

--D

>  	memcpy(&ptr[old_len], dp, len);
>  	item->ri_buf[item->ri_cnt-1].i_len += len;
>  	item->ri_buf[item->ri_cnt-1].i_addr = ptr;
> diff --git a/include/linux/mm.h b/include/linux/mm.h
> index 8ae31622deef..aa720bc0a1d0 100644
> --- a/include/linux/mm.h
> +++ b/include/linux/mm.h
> @@ -827,6 +827,8 @@ static inline void *kvcalloc(size_t n, size_t size, gfp_t flags)
>  	return kvmalloc_array(n, size, flags | __GFP_ZERO);
>  }
>  
> +extern void *kvrealloc(const void *p, size_t oldsize, size_t newsize,
> +		gfp_t flags);
>  extern void kvfree(const void *addr);
>  extern void kvfree_sensitive(const void *addr, size_t len);
>  
> diff --git a/mm/util.c b/mm/util.c
> index a8bf17f18a81..1104339ad2ca 100644
> --- a/mm/util.c
> +++ b/mm/util.c
> @@ -635,6 +635,21 @@ void kvfree_sensitive(const void *addr, size_t len)
>  }
>  EXPORT_SYMBOL(kvfree_sensitive);
>  
> +void *kvrealloc(const void *p, size_t oldsize, size_t newsize, gfp_t flags)
> +{
> +	void *newp;
> +
> +	if (oldsize >= newsize)
> +		return (void *)p;
> +	newp = kvmalloc(newsize, flags);
> +	if (!newp)
> +		return NULL;
> +	memcpy(newp, p, oldsize);
> +	kvfree(p);
> +	return newp;
> +}
> +EXPORT_SYMBOL(kvrealloc);
> +
>  static inline void *__page_rmapping(struct page *page)
>  {
>  	unsigned long mapping;
> -- 
> 2.31.1
>
Dave Chinner June 30, 2021, 9:49 p.m. UTC | #4
On Wed, Jun 30, 2021 at 09:08:43AM -0700, Darrick J. Wong wrote:
> On Wed, Jun 30, 2021 at 04:14:29PM +1000, Dave Chinner wrote:
> > From: Dave Chinner <dchinner@redhat.com>
> > diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
> > index 1721fce2ec94..fee4fbadea0a 100644
> > --- a/fs/xfs/xfs_log_recover.c
> > +++ b/fs/xfs/xfs_log_recover.c
> > @@ -2062,7 +2062,7 @@ xlog_recover_add_to_cont_trans(
> >  	old_ptr = item->ri_buf[item->ri_cnt-1].i_addr;
> >  	old_len = item->ri_buf[item->ri_cnt-1].i_len;
> >  
> > -	ptr = krealloc(old_ptr, len + old_len, GFP_KERNEL | __GFP_NOFAIL);
> > +	ptr = kvrealloc(old_ptr, old_len, len + old_len, GFP_KERNEL);
> 
> kvrealloc can return null, so this needs to check for that and -ENOMEM,
> right?  It'll suck that log recovery fails, but such is life.

Ok, looking through the code it seems that returning -ENOMEM here
is a non-destructive (i.e. retry-able) failure. It will simply stop
processing the log at the point this occurs and abort all pending
objects that haven't been processed. The error message is less than
stellar ("log mount/recovery failed: error %d") but at least no
other damage will be done. I'll update it.

Cheers,

Dave.
diff mbox series

Patch

diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
index 1721fce2ec94..fee4fbadea0a 100644
--- a/fs/xfs/xfs_log_recover.c
+++ b/fs/xfs/xfs_log_recover.c
@@ -2062,7 +2062,7 @@  xlog_recover_add_to_cont_trans(
 	old_ptr = item->ri_buf[item->ri_cnt-1].i_addr;
 	old_len = item->ri_buf[item->ri_cnt-1].i_len;
 
-	ptr = krealloc(old_ptr, len + old_len, GFP_KERNEL | __GFP_NOFAIL);
+	ptr = kvrealloc(old_ptr, old_len, len + old_len, GFP_KERNEL);
 	memcpy(&ptr[old_len], dp, len);
 	item->ri_buf[item->ri_cnt-1].i_len += len;
 	item->ri_buf[item->ri_cnt-1].i_addr = ptr;
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 8ae31622deef..aa720bc0a1d0 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -827,6 +827,8 @@  static inline void *kvcalloc(size_t n, size_t size, gfp_t flags)
 	return kvmalloc_array(n, size, flags | __GFP_ZERO);
 }
 
+extern void *kvrealloc(const void *p, size_t oldsize, size_t newsize,
+		gfp_t flags);
 extern void kvfree(const void *addr);
 extern void kvfree_sensitive(const void *addr, size_t len);
 
diff --git a/mm/util.c b/mm/util.c
index a8bf17f18a81..1104339ad2ca 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -635,6 +635,21 @@  void kvfree_sensitive(const void *addr, size_t len)
 }
 EXPORT_SYMBOL(kvfree_sensitive);
 
+void *kvrealloc(const void *p, size_t oldsize, size_t newsize, gfp_t flags)
+{
+	void *newp;
+
+	if (oldsize >= newsize)
+		return (void *)p;
+	newp = kvmalloc(newsize, flags);
+	if (!newp)
+		return NULL;
+	memcpy(newp, p, oldsize);
+	kvfree(p);
+	return newp;
+}
+EXPORT_SYMBOL(kvrealloc);
+
 static inline void *__page_rmapping(struct page *page)
 {
 	unsigned long mapping;