diff mbox series

[v2] netfs: fix kernel BUG in iov_iter_revert()

Message ID 20241202093943.227786-1-dmantipov@yandex.ru (mailing list archive)
State New
Headers show
Series [v2] netfs: fix kernel BUG in iov_iter_revert() | expand

Commit Message

Dmitry Antipov Dec. 2, 2024, 9:39 a.m. UTC
Syzbot has reported the following splat triggered by fault injection:

kernel BUG at lib/iov_iter.c:624!
Oops: invalid opcode: 0000 [#1] PREEMPT SMP KASAN PTI
CPU: 0 UID: 0 PID: 5767 Comm: repro Not tainted 6.12.0-rc4-syzkaller-00261-g850925a8133c #0
Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.16.3-3.fc41 04/01/2014
RIP: 0010:iov_iter_revert+0x420/0x590
Code: 42 80 3c 20 00 48 8b 1c 24 74 08 48 89 df e8 17 07 43 fd 4c 89 2b e9 04 01 00 00 45 85 ed 48 8b 3c 24 75 16 e8 41 48 d9 fc 90 <0f> 0b 41 83 fd 05 48 8b 3c 24 0f 84 58 01 00 00 48 89 f8 48 c1 e8
RSP: 0018:ffffc90002f4f740 EFLAGS: 00010293
RAX: ffffffff84bba22f RBX: 000000000001e098 RCX: ffff888026309cc0
RDX: 0000000000000000 RSI: ffffffff8f098180 RDI: ffff888024f92df0
RBP: 0000000000000000 R08: 0000000000000001 R09: ffffffff84bb9f14
R10: 0000000000000004 R11: ffff888026309cc0 R12: dffffc0000000000
R13: 0000000000000000 R14: ffff888024f92de0 R15: fffffffffffe1f68
FS:  00007f2c11757600(0000) GS:ffff888062800000(0000) knlGS:0000000000000000
CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
CR2: 0000557cafd1eb48 CR3: 0000000024e1a000 CR4: 00000000000006f0
Call Trace:
 <TASK>
 ? __die_body+0x5f/0xb0
 ? die+0x9e/0xc0
 ? do_trap+0x15a/0x3a0
 ? iov_iter_revert+0x420/0x590
 ? do_error_trap+0x1dc/0x2c0
 ? iov_iter_revert+0x420/0x590
 ? __pfx_do_error_trap+0x10/0x10
 ? handle_invalid_op+0x34/0x40
 ? iov_iter_revert+0x420/0x590
 ? exc_invalid_op+0x38/0x50
 ? asm_exc_invalid_op+0x1a/0x20
 ? iov_iter_revert+0x104/0x590
 ? iov_iter_revert+0x41f/0x590
 ? iov_iter_revert+0x420/0x590
 netfs_reset_iter+0xce/0x130
 netfs_read_subreq_terminated+0x1fe/0xad0
 netfs_read_to_pagecache+0x628/0x900
 netfs_readahead+0x7e9/0x9d0
 ? __pfx_netfs_readahead+0x10/0x10
 ? blk_start_plug+0x70/0x1b0
 read_pages+0x180/0x840
 ? __pfx_read_pages+0x10/0x10
 ? filemap_add_folio+0x26d/0x650
 ? __pfx_filemap_add_folio+0x10/0x10
 ? rcu_read_lock_any_held+0xb7/0x160
 ? __pfx_rcu_read_lock_any_held+0x10/0x10
 ? __pfx_proc_fail_nth_write+0x10/0x10
 page_cache_ra_unbounded+0x774/0x8a0
 force_page_cache_ra+0x280/0x2f0
 generic_fadvise+0x522/0x830
 ? __pfx_generic_fadvise+0x10/0x10
 ? lockdep_hardirqs_on_prepare+0x43d/0x780
 ? __pfx_lockdep_hardirqs_on_prepare+0x10/0x10
 ? vfs_fadvise+0x99/0xc0
 __x64_sys_readahead+0x1ac/0x230
 do_syscall_64+0xf3/0x230
 entry_SYSCALL_64_after_hwframe+0x77/0x7f
RIP: 0033:0x7f2c116736a9
Code: 5c c3 66 2e 0f 1f 84 00 00 00 00 00 0f 1f 44 00 00 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 8b 0d 4f 37 0d 00 f7 d8 64 89 01 48
RSP: 002b:00007ffc78181c88 EFLAGS: 00000246 ORIG_RAX: 00000000000000bb
RAX: ffffffffffffffda RBX: 0000000000000002 RCX: 00007f2c116736a9
RDX: 000800000000000d RSI: 0000000000000005 RDI: 0000000000000006
RBP: 00000000004029d8 R08: 00007ffc78181658 R09: 00007f0038363735
R10: 0000000000000000 R11: 0000000000000246 R12: 00007ffc78181cb0
R13: 00007ffc78181ec8 R14: 0000000000401120 R15: 00007f2c1178ca80
 </TASK>

This happens just because 'netfs_prepare_read_iterator()' may return
-ENOMEM (which is actually triggered by the fault injection) but such
a cases are not consistently handled in 'netfs_read_to_pagecache()'.
So wrap 'netfs_prepare_read_iterator()' to handle all possible
-ENOMEM cases and mark the corresponding subrequest as cancelled.

Reported-by: syzbot+404b4b745080b6210c6c@syzkaller.appspotmail.com
Closes: https://syzkaller.appspot.com/bug?extid=404b4b745080b6210c6c
Fixes: ee4cdf7ba857 ("netfs: Speed up buffered reading")
Tested-by: syzbot+404b4b745080b6210c6c@syzkaller.appspotmail.com
Signed-off-by: Dmitry Antipov <dmantipov@yandex.ru>
---
v2: adjusted to match 6.13.0-rc1
---
 fs/netfs/buffered_read.c | 31 ++++++++++++++++++++++++++-----
 1 file changed, 26 insertions(+), 5 deletions(-)

Comments

David Howells Dec. 4, 2024, 9:56 a.m. UTC | #1
Interesting...  The test program also causes fuse to oops (see attached) over
without even getting to netfslib.  The BUG is in iov_iter_revert():

	if (iov_iter_is_xarray(i) || iter_is_ubuf(i)) {
		BUG(); /* We should never go beyond the start of the specified
			* range since we might then be straying into pages that
			* aren't pinned.
			*/

I was trying the C reproducer from here:

	https://syzkaller.appspot.com/bug?extid=404b4b745080b6210c6c

David
---
 FAULT_INJECTION: forcing a failure.
 name failslab, interval 1, probability 0, space 0, times 1
 CPU: 3 UID: 0 PID: 5926 Comm: repro-12bc200f9 Not tainted 6.13.0-rc1-build3+ #5189
 Hardware name: ASUS All Series/H97-PLUS, BIOS 2306 10/09/2014
 Call Trace:
  <TASK>
  dump_stack_lvl+0x47/0x70
  should_fail_ex+0x12e/0x160
  should_failslab+0x50/0x60
  __kmalloc_noprof+0x9f/0x290
  ? fuse_get_user_pages+0xad/0x290
  ? kmem_cache_debug_flags+0xc/0x20
  fuse_get_user_pages+0xad/0x290
  ? fuse_folios_alloc+0x13/0x30
  fuse_direct_io+0x220/0x460
  fuse_direct_write_iter+0x10f/0x180
  vfs_write+0x142/0x1e0
  ? __pfx_fuse_file_write_iter+0x10/0x10
  ksys_write+0x6d/0xc0
  do_syscall_64+0x80/0xe0
  entry_SYSCALL_64_after_hwframe+0x71/0x79
 RIP: 0033:0x7fdd71f0f85d
 Code: ff c3 66 2e 0f 1f 84 00 00 00 00 00 90 f3 0f 1e fa 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 8b 0d a3 45 0c 00 f7 d8 64 89 01 48
 RSP: 002b:00007fdd71df7e68 EFLAGS: 00000246 ORIG_RAX: 0000000000000001
 RAX: ffffffffffffffda RBX: 00007fdd71df8cdc RCX: 00007fdd71f0f85d
 RDX: 000000000000000f RSI: 0000000020000280 RDI: 0000000000000005
 RBP: 00007fdd71df7e90 R08: 0000000000000000 R09: 0000000000000000
 R10: 0000000000000000 R11: 0000000000000246 R12: 00007fdd71df86c0
 R13: ffffffffffffff88 R14: 0000000000000002 R15: 00007fffe6090bd0
  </TASK>
 ------------[ cut here ]------------
 kernel BUG at lib/iov_iter.c:626!
 Oops: invalid opcode: 0000 [#1] SMP PTI
 CPU: 3 UID: 0 PID: 5926 Comm: repro-12bc200f9 Not tainted 6.13.0-rc1-build3+ #5189
 Hardware name: ASUS All Series/H97-PLUS, BIOS 2306 10/09/2014
 RIP: 0010:iov_iter_revert+0x4c/0xf0
 Code: 80 f9 06 0f 84 bf 00 00 00 48 8b 47 08 48 39 f0 72 0c 48 29 f0 48 89 47 08 c3 cc cc cc cc 48 29 c6 80 f9 05 74 04 84 c9 75 02 <0f> 0b 80 f9 02 48 8b 47 10 75 15 8b 48 f8 48 83 e8 10 48 ff 42 20
 RSP: 0018:ffff88811f16fd00 EFLAGS: 00010246
 RAX: 0000000000000000 RBX: ffff888107504900 RCX: 0000000000001600
 RDX: ffff88811f16fe60 RSI: 000000000000000f RDI: ffff88811f16fe60
 RBP: 0000000000000000 R08: ffff88840fb9fa40 R09: ffff88840fb9fa60
 R10: 0000000000000006 R11: 0000000000000020 R12: ffff88810b481400
 R13: 00000000fffffff4 R14: ffff88811ce26cc0 R15: ffff88811f16fdb8
 FS:  00007fdd71df86c0(0000) GS:ffff88840fb80000(0000) knlGS:0000000000000000
 CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
 CR2: 0000560d9a217710 CR3: 000000010760e003 CR4: 00000000001706f0
 Call Trace:
  <TASK>
  ? __die_body+0x1a/0x60
  ? die+0x30/0x50
  ? do_trap+0x7a/0x100
  ? iov_iter_revert+0x4c/0xf0
  ? iov_iter_revert+0x4c/0xf0
  ? do_error_trap+0x6e/0xa0
  ? iov_iter_revert+0x4c/0xf0
  ? exc_invalid_op+0x49/0x60
  ? iov_iter_revert+0x4c/0xf0
  ? asm_exc_invalid_op+0x16/0x20
  ? iov_iter_revert+0x4c/0xf0
  fuse_direct_io+0x398/0x460
  fuse_direct_write_iter+0x10f/0x180
  vfs_write+0x142/0x1e0
  ? __pfx_fuse_file_write_iter+0x10/0x10
  ksys_write+0x6d/0xc0
  do_syscall_64+0x80/0xe0
  entry_SYSCALL_64_after_hwframe+0x71/0x79
Miklos Szeredi Dec. 4, 2024, 12:41 p.m. UTC | #2
On Wed, 4 Dec 2024 at 10:56, David Howells <dhowells@redhat.com> wrote:
>
> Interesting...  The test program also causes fuse to oops (see attached) over
> without even getting to netfslib.  The BUG is in iov_iter_revert():
>
>         if (iov_iter_is_xarray(i) || iter_is_ubuf(i)) {
>                 BUG(); /* We should never go beyond the start of the specified
>                         * range since we might then be straying into pages that
>                         * aren't pinned.
>                         */

Can you please test this?

--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -1541,8 +1541,10 @@ static int fuse_get_user_pages(struct
fuse_args_pages *ap, struct iov_iter *ii,
         */
        struct page **pages = kzalloc(max_pages * sizeof(struct page *),
                                      GFP_KERNEL);
-       if (!pages)
+       if (!pages) {
+               *nbytesp = 0;
                return -ENOMEM;
+       }

        while (nbytes < *nbytesp && nr_pages < max_pages) {
                unsigned nfolios, i;

(Also attaching patch without whitespace damage.)

Thanks,
Miklos
Dmitry Antipov Dec. 4, 2024, 1:13 p.m. UTC | #3
On 12/4/24 3:41 PM, Miklos Szeredi wrote:

> Can you please test this?

Tested-by: Dmitry Antipov <dmantipov@yandex.ru>

Dmitry
David Howells Dec. 4, 2024, 1:41 p.m. UTC | #4
I seems to work, so you can add:

Tested-by: David Howells <dhowells@redhat.com>

Please don't mark it as closing the syzbot report as that pertains to
netfslib, not fuse.  syzbot attached both reproducers to the same overall
report when it really ought to have split them.

Note that a reproducer zombie is left behind and the filesystem can't be
unmounted as it is busy (which may be due to the zombie).  However, the zombie
goes away and is replaced with another if the reproducer is run again.  I
wonder if I'm seeing a systemd bug as systemd should be reaping the zombies.

David
David Howells Dec. 4, 2024, 2:03 p.m. UTC | #5
Hi Dmitry,

> So wrap 'netfs_prepare_read_iterator()' to handle all possible
> -ENOMEM cases and mark the corresponding subrequest as cancelled.

I think there's a slightly better way to do it.  There are three users of
netfs_prepare_read_iterator() but there's also the call to ->prepare_read()
which I think can all be combined.

(And I've just realised that the patch I asked syzbot to test has a bug in it)

David
Bernd Schubert Dec. 6, 2024, 12:41 p.m. UTC | #6
On 12/4/24 13:41, Miklos Szeredi wrote:
> On Wed, 4 Dec 2024 at 10:56, David Howells <dhowells@redhat.com> wrote:
>>
>> Interesting...  The test program also causes fuse to oops (see attached) over
>> without even getting to netfslib.  The BUG is in iov_iter_revert():
>>
>>         if (iov_iter_is_xarray(i) || iter_is_ubuf(i)) {
>>                 BUG(); /* We should never go beyond the start of the specified
>>                         * range since we might then be straying into pages that
>>                         * aren't pinned.
>>                         */
> 
> Can you please test this?
> 
> --- a/fs/fuse/file.c
> +++ b/fs/fuse/file.c
> @@ -1541,8 +1541,10 @@ static int fuse_get_user_pages(struct
> fuse_args_pages *ap, struct iov_iter *ii,
>          */
>         struct page **pages = kzalloc(max_pages * sizeof(struct page *),
>                                       GFP_KERNEL);
> -       if (!pages)
> +       if (!pages) {
> +               *nbytesp = 0;
>                 return -ENOMEM;
> +       }
> 
>         while (nbytes < *nbytesp && nr_pages < max_pages) {
>                 unsigned nfolios, i;
> 
> (Also attaching patch without whitespace damage.)

I had already posted a patch on Monday.

https://lore.kernel.org/r/20241203-fix-fuse_get_user_pages-v2-1-acce8a29d06b@ddn.com

@David, is that the same sysbot report or another one?


Thanks,
Bernd
Miklos Szeredi Dec. 13, 2024, 4:13 p.m. UTC | #7
On Fri, 6 Dec 2024 at 13:41, Bernd Schubert <bernd.schubert@fastmail.fm> wrote:

> I had already posted a patch on Monday.
>
> https://lore.kernel.org/r/20241203-fix-fuse_get_user_pages-v2-1-acce8a29d06b@ddn.com

Sorry, missed that.  Applied your version with the above test-by's added.

> @David, is that the same sysbot report or another one?

It's a different one, assigned to netfs, not fuse.

Thanks,
Miklos
diff mbox series

Patch

diff --git a/fs/netfs/buffered_read.c b/fs/netfs/buffered_read.c
index 7ac34550c403..4404f3c6ec3e 100644
--- a/fs/netfs/buffered_read.c
+++ b/fs/netfs/buffered_read.c
@@ -95,7 +95,7 @@  static size_t netfs_load_buffer_from_ra(struct netfs_io_request *rreq,
 }
 
 /*
- * netfs_prepare_read_iterator - Prepare the subreq iterator for I/O
+ * __netfs_prepare_read_iterator - Prepare the subreq iterator for I/O
  * @subreq: The subrequest to be set up
  *
  * Prepare the I/O iterator representing the read buffer on a subrequest for
@@ -109,7 +109,7 @@  static size_t netfs_load_buffer_from_ra(struct netfs_io_request *rreq,
  * [!] NOTE: This must be run in the same thread as ->issue_read() was called
  * in as we access the readahead_control struct.
  */
-static ssize_t netfs_prepare_read_iterator(struct netfs_io_subrequest *subreq)
+static ssize_t __netfs_prepare_read_iterator(struct netfs_io_subrequest *subreq)
 {
 	struct netfs_io_request *rreq = subreq->rreq;
 	size_t rsize = subreq->len;
@@ -174,6 +174,21 @@  static ssize_t netfs_prepare_read_iterator(struct netfs_io_subrequest *subreq)
 	return subreq->len;
 }
 
+/* Wrap the above by handling possible -ENOMEM and
+ * marking the corresponding subrequest as cancelled.
+ */
+static inline ssize_t netfs_prepare_read_iterator(struct netfs_io_subrequest *subreq)
+{
+	struct netfs_io_request *rreq = subreq->rreq;
+	ssize_t slice = __netfs_prepare_read_iterator(subreq);
+
+	if (unlikely(slice < 0)) {
+		atomic_dec(&rreq->nr_outstanding);
+		netfs_put_subrequest(subreq, false, netfs_sreq_trace_put_cancel);
+	}
+	return slice;
+}
+
 static enum netfs_io_source netfs_cache_prepare_read(struct netfs_io_request *rreq,
 						     struct netfs_io_subrequest *subreq,
 						     loff_t i_size)
@@ -285,9 +300,7 @@  static void netfs_read_to_pagecache(struct netfs_io_request *rreq)
 			}
 
 			slice = netfs_prepare_read_iterator(subreq);
-			if (slice < 0) {
-				atomic_dec(&rreq->nr_outstanding);
-				netfs_put_subrequest(subreq, false, netfs_sreq_trace_put_cancel);
+			if (unlikely(slice < 0)) {
 				ret = slice;
 				break;
 			}
@@ -302,6 +315,10 @@  static void netfs_read_to_pagecache(struct netfs_io_request *rreq)
 			trace_netfs_sreq(subreq, netfs_sreq_trace_submit);
 			netfs_stat(&netfs_n_rh_zero);
 			slice = netfs_prepare_read_iterator(subreq);
+			if (unlikely(slice < 0)) {
+				ret = slice;
+				break;
+			}
 			__set_bit(NETFS_SREQ_CLEAR_TAIL, &subreq->flags);
 			netfs_read_subreq_terminated(subreq, 0, false);
 			goto done;
@@ -310,6 +327,10 @@  static void netfs_read_to_pagecache(struct netfs_io_request *rreq)
 		if (source == NETFS_READ_FROM_CACHE) {
 			trace_netfs_sreq(subreq, netfs_sreq_trace_submit);
 			slice = netfs_prepare_read_iterator(subreq);
+			if (unlikely(slice < 0)) {
+				ret = slice;
+				break;
+			}
 			netfs_read_cache_to_pagecache(rreq, subreq);
 			goto done;
 		}