Message ID | MW2PR2101MB1033FFF044A258F84AEAA584F1C9A@MW2PR2101MB1033.namprd21.prod.outlook.com (mailing list archive) |
---|---|
State | New |
Headers | show |
Series | audit: io_uring openat triggers audit reference count underflow in worker thread | expand |
On 10/6/23 2:09 PM, Dan Clash wrote: > diff --git a/fs/namei.c b/fs/namei.c > index 2a8baa6ce3e8..4f7ac131c9d1 100644 > --- a/fs/namei.c > +++ b/fs/namei.c > @@ -187,7 +187,7 @@ getname_flags(const char __user *filename, int flags, int *empty) > } > } > > - result->refcnt = 1; > + refcount_set(&result->refcnt, 1); > /* The empty path is special. */ > if (unlikely(!len)) { > if (empty) > @@ -248,7 +248,7 @@ getname_kernel(const char * filename) > memcpy((char *)result->name, filename, len); > result->uptr = NULL; > result->aname = NULL; > - result->refcnt = 1; > + refcount_set(&result->refcnt, 1); > audit_getname(result); > > return result; > @@ -259,9 +259,10 @@ void putname(struct filename *name) > if (IS_ERR(name)) > return; > > - BUG_ON(name->refcnt <= 0); > + BUG_ON(refcount_read(&name->refcnt) == 0); > + BUG_ON(refcount_read(&name->refcnt) == REFCOUNT_SATURATED); > > - if (--name->refcnt > 0) > + if (!refcount_dec_and_test(&name->refcnt)) > return; > > if (name->name != name->iname) { > diff --git a/include/linux/fs.h b/include/linux/fs.h > index d0a54e9aac7a..8217e07726d4 100644 > --- a/include/linux/fs.h > +++ b/include/linux/fs.h > @@ -2719,7 +2719,7 @@ struct audit_names; > struct filename { > const char *name; /* pointer to actual string */ > const __user char *uptr; /* original userland pointer */ > - int refcnt; > + refcount_t refcnt; > struct audit_names *aname; > const char iname[]; > }; > diff --git a/kernel/auditsc.c b/kernel/auditsc.c > index 37cded22497e..232e0be9f6d9 100644 > --- a/kernel/auditsc.c > +++ b/kernel/auditsc.c > @@ -2188,7 +2188,7 @@ __audit_reusename(const __user char *uptr) > if (!n->name) > continue; > if (n->name->uptr == uptr) { > - n->name->refcnt++; > + refcount_inc(&n->name->refcnt); > return n->name; > } > } > @@ -2217,7 +2217,7 @@ void __audit_getname(struct filename *name) > n->name = name; > n->name_len = AUDIT_NAME_FULL; > name->aname = n; > - name->refcnt++; > + refcount_inc(&name->refcnt); > } > > static inline int audit_copy_fcaps(struct audit_names *name, > @@ -2349,7 +2349,7 @@ void __audit_inode(struct filename *name, const struct dentry *dentry, > return; > if (name) { > n->name = name; > - name->refcnt++; > + refcount_inc(&name->refcnt); > } > > out: > @@ -2474,7 +2474,7 @@ void __audit_inode_child(struct inode *parent, > if (found_parent) { > found_child->name = found_parent->name; > found_child->name_len = AUDIT_NAME_FULL; > - found_child->name->refcnt++; > + refcount_inc(&found_child->name->refcnt); > } > } I'm not fully aware of what audit is doing with struct filename outside of needing it for the audit log. Rather than impose the atomic references for everyone, would it be doable to simply dupe the struct instead of grabbing the (non-atomic) reference to the existing one? If not, since there's no over/underflow handling right now, it'd certainly be cheaper to use an atomic_t here rather than a full refcount.
On 10/6/23 8:32 PM, Jens Axboe wrote: > On 10/6/23 2:09 PM, Dan Clash wrote: >> diff --git a/fs/namei.c b/fs/namei.c >> index 2a8baa6ce3e8..4f7ac131c9d1 100644 >> --- a/fs/namei.c >> +++ b/fs/namei.c >> @@ -187,7 +187,7 @@ getname_flags(const char __user *filename, int flags, int *empty) >> } >> } >> >> - result->refcnt = 1; >> + refcount_set(&result->refcnt, 1); >> /* The empty path is special. */ >> if (unlikely(!len)) { >> if (empty) >> @@ -248,7 +248,7 @@ getname_kernel(const char * filename) >> memcpy((char *)result->name, filename, len); >> result->uptr = NULL; >> result->aname = NULL; >> - result->refcnt = 1; >> + refcount_set(&result->refcnt, 1); >> audit_getname(result); >> >> return result; >> @@ -259,9 +259,10 @@ void putname(struct filename *name) >> if (IS_ERR(name)) >> return; >> >> - BUG_ON(name->refcnt <= 0); >> + BUG_ON(refcount_read(&name->refcnt) == 0); >> + BUG_ON(refcount_read(&name->refcnt) == REFCOUNT_SATURATED); >> >> - if (--name->refcnt > 0) >> + if (!refcount_dec_and_test(&name->refcnt)) >> return; >> >> if (name->name != name->iname) { >> diff --git a/include/linux/fs.h b/include/linux/fs.h >> index d0a54e9aac7a..8217e07726d4 100644 >> --- a/include/linux/fs.h >> +++ b/include/linux/fs.h >> @@ -2719,7 +2719,7 @@ struct audit_names; >> struct filename { >> const char *name; /* pointer to actual string */ >> const __user char *uptr; /* original userland pointer */ >> - int refcnt; >> + refcount_t refcnt; >> struct audit_names *aname; >> const char iname[]; >> }; >> diff --git a/kernel/auditsc.c b/kernel/auditsc.c >> index 37cded22497e..232e0be9f6d9 100644 >> --- a/kernel/auditsc.c >> +++ b/kernel/auditsc.c >> @@ -2188,7 +2188,7 @@ __audit_reusename(const __user char *uptr) >> if (!n->name) >> continue; >> if (n->name->uptr == uptr) { >> - n->name->refcnt++; >> + refcount_inc(&n->name->refcnt); >> return n->name; >> } >> } >> @@ -2217,7 +2217,7 @@ void __audit_getname(struct filename *name) >> n->name = name; >> n->name_len = AUDIT_NAME_FULL; >> name->aname = n; >> - name->refcnt++; >> + refcount_inc(&name->refcnt); >> } >> >> static inline int audit_copy_fcaps(struct audit_names *name, >> @@ -2349,7 +2349,7 @@ void __audit_inode(struct filename *name, const struct dentry *dentry, >> return; >> if (name) { >> n->name = name; >> - name->refcnt++; >> + refcount_inc(&name->refcnt); >> } >> >> out: >> @@ -2474,7 +2474,7 @@ void __audit_inode_child(struct inode *parent, >> if (found_parent) { >> found_child->name = found_parent->name; >> found_child->name_len = AUDIT_NAME_FULL; >> - found_child->name->refcnt++; >> + refcount_inc(&found_child->name->refcnt); >> } >> } > > I'm not fully aware of what audit is doing with struct filename outside > of needing it for the audit log. Rather than impose the atomic > references for everyone, would it be doable to simply dupe the struct > instead of grabbing the (non-atomic) reference to the existing one? > > If not, since there's no over/underflow handling right now, it'd > certainly be cheaper to use an atomic_t here rather than a full > refcount. After taking a closer look at this, I think the best course of action would be to make the struct filename refcnt and atomic_t. With audit in the picture, it's quite possible to have multiple threads manipulating the filename refcnt at the same time, which is obviously not currently safe. Dan, would you mind sending that as a patch? Include a link to your original email: Link: https://lore.kernel.org/lkml/MW2PR2101MB1033FFF044A258F84AEAA584F1C9A@MW2PR2101MB1033.namprd21.prod.outlook.com/ and a Fixes tag as well: Fixes: 5bd2182d58e9 ("audit,io_uring,io-wq: add some basic audit support to io_uring") and CC linux-fsdevel@vger.kernel.org and Christian Brauner <brauner@kernel.org> as well. Thanks!
On Sat, Oct 7, 2023 at 9:11 AM Jens Axboe <axboe@kernel.dk> wrote: > On 10/6/23 8:32 PM, Jens Axboe wrote: > > On 10/6/23 2:09 PM, Dan Clash wrote: ... > > I'm not fully aware of what audit is doing with struct filename outside > > of needing it for the audit log. Rather than impose the atomic > > references for everyone, would it be doable to simply dupe the struct > > instead of grabbing the (non-atomic) reference to the existing one? > > > > If not, since there's no over/underflow handling right now, it'd > > certainly be cheaper to use an atomic_t here rather than a full > > refcount. > > After taking a closer look at this, I think the best course of action > would be to make the struct filename refcnt and atomic_t. With audit in > the picture, it's quite possible to have multiple threads manipulating > the filename refcnt at the same time, which is obviously not currently > safe. Thanks Jens. I personally would feel a bit better with the additional safety provided by refount_t, but I agree that there is little chance of an overflow/underflow in this case so the additional refcount_t checking is not likely to be needed here. For the record, this should only be an issue when audit is combined with io_uring, prior to io_uring there wasn't an issue with multiple threads attempting to cleanup the filename objects so we didn't have to worry about racing on filename::refcnt updates. However, for those systems where both audit and io_uring are in use we definitely have a problem and will need the upcoming fix from Dan to ensure the safety of the system. Thanks for spotting this Dan and doing the initial investigation into the problem, if you run into any problems with the patch or need a hand let me know, I'm happy to help. > Dan, would you mind sending that as a patch? Include a link to your > original email: > > Link: https://lore.kernel.org/lkml/MW2PR2101MB1033FFF044A258F84AEAA584F1C9A@MW2PR2101MB1033.namprd21.prod.outlook.com/ > > and a Fixes tag as well: > > Fixes: 5bd2182d58e9 ("audit,io_uring,io-wq: add some basic audit support to io_uring") > > and CC linux-fsdevel@vger.kernel.org and > Christian Brauner <brauner@kernel.org> as well. I'm going to CC Christian on this email just so he has a heads-up about the problem and knows to expect a patch. -- paul-moore.com
I retested with the following change as a sanity check: - BUG_ON(name->refcnt <= 0); + BUG_ON(atomic_read(&name->refcnt) <= 0); checkpatch.pl suggests using WARN_ON_ONCE rather than BUG. devvm ~ $ ~/linux/scripts/checkpatch.pl --patch ~/io_uring_audit_hang_atomic.patch WARNING: Do not crash the kernel unless it is absolutely unavoidable --use WARN_ON_ONCE() plus recovery code (if feasible) instead of BUG() or variants #28: FILE: fs/namei.c:262: + BUG_ON(atomic_read(&name->refcnt) <= 0); ... refcount_t uses WARN_ON_ONCE. I can think of three choices: 1. Use atomic_t and remove the BUG line. 2. Use refcount_t and remove the BUG line. 3. Use atomic_t and partially implement the warn behavior of refcount_t. Choice 1 and 2 seem better than choice 3.
On 10/8/23 8:38 PM, Dan Clash wrote: > I retested with the following change as a sanity check: > > - BUG_ON(name->refcnt <= 0); > + BUG_ON(atomic_read(&name->refcnt) <= 0); > > checkpatch.pl suggests using WARN_ON_ONCE rather than BUG. > > devvm ~ $ ~/linux/scripts/checkpatch.pl --patch ~/io_uring_audit_hang_atomic.patch > WARNING: Do not crash the kernel unless it is absolutely unavoidable > --use WARN_ON_ONCE() plus recovery code (if feasible) instead of BUG() or variants > #28: FILE: fs/namei.c:262: > + BUG_ON(atomic_read(&name->refcnt) <= 0); > ... > > refcount_t uses WARN_ON_ONCE. > > I can think of three choices: > > 1. Use atomic_t and remove the BUG line. > 2. Use refcount_t and remove the BUG line. > 3. Use atomic_t and partially implement the warn behavior of refcount_t. > > Choice 1 and 2 seem better than choice 3. I'd probably just make it: if (WARN_ON_ONCE(!atomic_read(name->refcnt))) return; to make it a straightforward conversion.
The patch will be sent from my alternate email address daclash@linux.microsoft.com. Paul and Jens, thank you for your guidance so far. Dan
diff --git a/fs/namei.c b/fs/namei.c index 2a8baa6ce3e8..4f7ac131c9d1 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -187,7 +187,7 @@ getname_flags(const char __user *filename, int flags, int *empty) } } - result->refcnt = 1; + refcount_set(&result->refcnt, 1); /* The empty path is special. */ if (unlikely(!len)) { if (empty) @@ -248,7 +248,7 @@ getname_kernel(const char * filename) memcpy((char *)result->name, filename, len); result->uptr = NULL; result->aname = NULL; - result->refcnt = 1; + refcount_set(&result->refcnt, 1); audit_getname(result); return result; @@ -259,9 +259,10 @@ void putname(struct filename *name) if (IS_ERR(name)) return; - BUG_ON(name->refcnt <= 0); + BUG_ON(refcount_read(&name->refcnt) == 0); + BUG_ON(refcount_read(&name->refcnt) == REFCOUNT_SATURATED); - if (--name->refcnt > 0) + if (!refcount_dec_and_test(&name->refcnt)) return; if (name->name != name->iname) { diff --git a/include/linux/fs.h b/include/linux/fs.h index d0a54e9aac7a..8217e07726d4 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2719,7 +2719,7 @@ struct audit_names; struct filename { const char *name; /* pointer to actual string */ const __user char *uptr; /* original userland pointer */ - int refcnt; + refcount_t refcnt; struct audit_names *aname; const char iname[]; }; diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 37cded22497e..232e0be9f6d9 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -2188,7 +2188,7 @@ __audit_reusename(const __user char *uptr) if (!n->name) continue; if (n->name->uptr == uptr) { - n->name->refcnt++; + refcount_inc(&n->name->refcnt); return n->name; } } @@ -2217,7 +2217,7 @@ void __audit_getname(struct filename *name) n->name = name; n->name_len = AUDIT_NAME_FULL; name->aname = n; - name->refcnt++; + refcount_inc(&name->refcnt); } static inline int audit_copy_fcaps(struct audit_names *name, @@ -2349,7 +2349,7 @@ void __audit_inode(struct filename *name, const struct dentry *dentry, return; if (name) { n->name = name; - name->refcnt++; + refcount_inc(&name->refcnt); } out: @@ -2474,7 +2474,7 @@ void __audit_inode_child(struct inode *parent, if (found_parent) { found_child->name = found_parent->name; found_child->name_len = AUDIT_NAME_FULL; - found_child->name->refcnt++; + refcount_inc(&found_child->name->refcnt); } }
This discussion is about how to fix an audit reference count decrement race between two io_uring threads. Original discussion link: https : / / github . com / axboe / liburing / issues / 958 Details: The test program below hangs indefinitely waiting for an openat cqe. The reproduction is with a distro kernel Ubuntu-azure-6.2-6.2.0-1012.12_22.04.1. However, the bug seems possible with an upstream kernel. An experiment of changing the reference count in struct filename from int to refcount_t allows the test program to complete. The bug did not occur with this test program until a kernel containing commit 5bd2182d58e9 was used. I have not found a matching reported issue or upstream commit yet. The dmseg log shows an audit related path: [27883.992550] kernel BUG at fs/namei.c:262! [27883.994051] invalid opcode: 0000 [#15] SMP PTI [27883.995719] CPU: 3 PID: 84988 Comm: iou-wrk-84835 Tainted: G D 6.2.0-1012-azure #12~22.04.1-Ubuntu [27883.999064] Hardware name: Microsoft Corporation Virtual Machine/Virtual Machine, BIOS Hyper-V UEFI Release v4.1 05/09/2022 [27884.002734] RIP: 0010:putname+0x68/0x70 ... [27884.032893] Call Trace: [27884.034032] <TASK> [27884.035117] ? show_regs+0x6a/0x80 [27884.036763] ? die+0x38/0xa0 [27884.038023] ? do_trap+0xd0/0xf0 [27884.039359] ? do_error_trap+0x70/0x90 [27884.040861] ? putname+0x68/0x70 [27884.042201] ? exc_invalid_op+0x53/0x70 [27884.043698] ? putname+0x68/0x70 [27884.045076] ? asm_exc_invalid_op+0x1b/0x20 [27884.047051] ? putname+0x68/0x70 [27884.048415] audit_reset_context.part.0.constprop.0+0xe1/0x300 [27884.050511] __audit_uring_exit+0xda/0x1c0 [27884.052100] io_issue_sqe+0x1f3/0x450 [27884.053702] ? lock_timer_base+0x3b/0xd0 [27884.055283] io_wq_submit_work+0x8d/0x2b0 [27884.056848] ? __try_to_del_timer_sync+0x67/0xa0 [27884.058577] io_worker_handle_work+0x17c/0x2b0 [27884.060267] io_wqe_worker+0x10a/0x350 [27884.061714] ? raw_spin_rq_unlock+0x10/0x30 [27884.063295] ? finish_task_switch.isra.0+0x8b/0x2c0 [27884.065537] ? __pfx_io_wqe_worker+0x10/0x10 [27884.067215] ret_from_fork+0x2c/0x50 [27884.068733] RIP: 0033:0x0 ... Test program usage: ./io_uring_open_close_audit_hang --directory /tmp/deleteme --count 10000 Test program source: // Note: The test program is C++ but could be converted to C. #include <cassert> #include <fcntl.h> #include <filesystem> #include <getopt.h> #include <iostream> #include <liburing.h> // open and close a file. the file is created if it does not exist. void openClose(struct io_uring& ring, std::string fileName) { int ret; struct io_uring_cqe* cqe {}; struct io_uring_sqe* sqe {}; int fd {}; int flags {O_RDWR | O_CREAT}; mode_t mode {0666}; // openat2 sqe = io_uring_get_sqe(&ring); assert(sqe != nullptr); io_uring_prep_openat(sqe, AT_FDCWD, fileName.data(), flags, mode); io_uring_sqe_set_flags(sqe, IOSQE_ASYNC); ret = io_uring_submit(&ring); assert(ret == 1); ret = io_uring_wait_cqe(&ring, &cqe); assert(ret == 0); fd = cqe->res; assert(fd > 0); io_uring_cqe_seen(&ring, cqe); // close sqe = io_uring_get_sqe(&ring); assert(sqe != nullptr); io_uring_prep_close(sqe, fd); io_uring_sqe_set_flags(sqe, IOSQE_ASYNC); ret = io_uring_submit(&ring); assert(ret == 1); // wait for the close to complete. ret = io_uring_wait_cqe(&ring, &cqe); assert(ret == 0); // verify that close succeeded. assert(cqe->res == 0); io_uring_cqe_seen(&ring, cqe); } // create 100 files and then open each file twice. void openCloseHang(std::string filePath) { int ret; struct io_uring ring; ret = io_uring_queue_init(8, &ring, 0); assert(0 == ret); int repeat {3}; int numFiles {100}; std::filesystem::create_directory(filePath); // files of length 0 are created in the j==0 iteration below. // those files are opened and closed during the j>0 iteraions. // a repeat of 3 results in a fairly reliable reproduction. for (int j = 0; j < repeat; j += 1) { for (int i = 0; i < numFiles; i += 1) { std::string fileName(filePath + "/file" + std::to_string(i)); openClose(ring, fileName); } } std::filesystem::remove_all(filePath); io_uring_queue_exit(&ring); } int main(int argc, char** argv) { std::string filePath {}; int iterations {}; struct option options[] { {"help", no_argument, 0, 'h'}, {"directory", required_argument, 0, 'd'}, {"count", required_argument, 0, 'c'}, { 0, 0, 0, 0 } }; bool printUsage {false}; int val {}; while ((val = getopt_long_only(argc, argv, "", options, nullptr)) != -1) { if (val == 'h') { printUsage = true; } else if (val == 'd') { filePath = optarg; if (std::filesystem::exists(filePath)) { printUsage = true; std::cerr << "directory must not exist" << std::endl; } } else if (val == 'c') { iterations = atoi(optarg); if (0 == iterations) { printUsage = true; } } else { printUsage = true; } } if ((0 == iterations) || (filePath.empty())) { printUsage = true; } if (printUsage || (optind < argc)) { std::cerr << "io_uring_open_close_audit_hang.cc --directory DIR --count COUNT" << std::endl; exit(1); } for (int i = 0; i < iterations; i += 1) { if (0 == (i % 100)) { std::cout << "i=" << std::to_string(i) << std::endl; } openCloseHang(filePath); } return 0; } Changing the reference count from int to refcount_t allows the test program to complete using the v6.2 distro kernel. The patch applies and builds on the upstream v6.1.55 kernel. Signed-off-by: Dan Clash <dan.clash@microsoft.com> ---