Message ID | 20200622192900.22757-4-minchan@kernel.org (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
Series | introduce memory hinting API for external process | expand |
On Mon, 22 Jun 2020, Minchan Kim wrote: > diff --git a/mm/madvise.c b/mm/madvise.c > index 551ed816eefe..23abca3f93fa 100644 > --- a/mm/madvise.c > +++ b/mm/madvise.c > @@ -17,6 +17,7 @@ > #include <linux/falloc.h> > #include <linux/fadvise.h> > #include <linux/sched.h> > +#include <linux/sched/mm.h> > #include <linux/ksm.h> > #include <linux/fs.h> > #include <linux/file.h> > @@ -995,6 +996,18 @@ madvise_behavior_valid(int behavior) > } > } > > +static bool > +process_madvise_behavior_valid(int behavior) > +{ > + switch (behavior) { > + case MADV_COLD: > + case MADV_PAGEOUT: > + return true; > + default: > + return false; > + } > +} > + > /* > * The madvise(2) system call. > * > @@ -1042,6 +1055,11 @@ madvise_behavior_valid(int behavior) > * MADV_DONTDUMP - the application wants to prevent pages in the given range > * from being included in its core dump. > * MADV_DODUMP - cancel MADV_DONTDUMP: no longer exclude from core dump. > + * MADV_COLD - the application is not expected to use this memory soon, > + * deactivate pages in this range so that they can be reclaimed > + * easily if memory pressure happens. > + * MADV_PAGEOUT - the application is not expected to use this memory soon, > + * page out the pages in this range immediately. 
> * > * return values: > * zero - success > @@ -1176,3 +1194,106 @@ SYSCALL_DEFINE3(madvise, unsigned long, start, size_t, len_in, int, behavior) > { > return do_madvise(current, current->mm, start, len_in, behavior); > } > + > +static int process_madvise_vec(struct task_struct *target_task, > + struct mm_struct *mm, struct iov_iter *iter, int behavior) > +{ > + struct iovec iovec; > + int ret = 0; > + > + while (iov_iter_count(iter)) { > + iovec = iov_iter_iovec(iter); > + ret = do_madvise(target_task, mm, (unsigned long)iovec.iov_base, > + iovec.iov_len, behavior); > + if (ret < 0) > + break; > + iov_iter_advance(iter, iovec.iov_len); > + } > + > + return ret; > +} > + > +static ssize_t do_process_madvise(int pidfd, struct iov_iter *iter, > + int behavior, unsigned int flags) > +{ > + ssize_t ret; > + struct pid *pid; > + struct task_struct *task; > + struct mm_struct *mm; > + size_t total_len = iov_iter_count(iter); > + > + if (flags != 0) > + return -EINVAL; > + > + pid = pidfd_get_pid(pidfd); > + if (IS_ERR(pid)) > + return PTR_ERR(pid); > + > + task = get_pid_task(pid, PIDTYPE_PID); > + if (!task) { > + ret = -ESRCH; > + goto put_pid; > + } > + > + if (task->mm != current->mm && > + !process_madvise_behavior_valid(behavior)) { > + ret = -EINVAL; > + goto release_task; > + } > + > + mm = mm_access(task, PTRACE_MODE_ATTACH_FSCREDS); > + if (IS_ERR_OR_NULL(mm)) { > + ret = IS_ERR(mm) ? PTR_ERR(mm) : -ESRCH; > + goto release_task; > + } > mm is always task->mm right? I'm wondering if it would be better to find the mm directly in process_madvise_vec() rather than passing it into the function. I'm not sure why we'd pass both task and mm here. 
+ > + ret = process_madvise_vec(task, mm, iter, behavior); > + if (ret >= 0) > + ret = total_len - iov_iter_count(iter); > + > + mmput(mm); > +release_task: > + put_task_struct(task); > +put_pid: > + put_pid(pid); > + return ret; > +} > + > +SYSCALL_DEFINE5(process_madvise, int, pidfd, const struct iovec __user *, vec, > + unsigned long, vlen, int, behavior, unsigned int, flags) I love the idea of adding the flags parameter here and I can think of an immediate use case for MADV_HUGEPAGE, which is overloaded. Today, MADV_HUGEPAGE controls enablement depending on system config and controls defrag behavior based on system config. It also cannot be opted out of without setting MADV_NOHUGEPAGE :) I was thinking of a flag that users could use to trigger an immediate collapse in process context regardless of the system config. So I'm a big advocate of this flags parameter and consider it an absolute must for the API. Acked-by: David Rientjes <rientjes@google.com>
On Wed, Jun 24, 2020 at 01:00:14PM -0700, David Rientjes wrote: > On Mon, 22 Jun 2020, Minchan Kim wrote: > > > diff --git a/mm/madvise.c b/mm/madvise.c > > index 551ed816eefe..23abca3f93fa 100644 > > --- a/mm/madvise.c > > +++ b/mm/madvise.c > > @@ -17,6 +17,7 @@ > > #include <linux/falloc.h> > > #include <linux/fadvise.h> > > #include <linux/sched.h> > > +#include <linux/sched/mm.h> > > #include <linux/ksm.h> > > #include <linux/fs.h> > > #include <linux/file.h> > > @@ -995,6 +996,18 @@ madvise_behavior_valid(int behavior) > > } > > } > > > > +static bool > > +process_madvise_behavior_valid(int behavior) > > +{ > > + switch (behavior) { > > + case MADV_COLD: > > + case MADV_PAGEOUT: > > + return true; > > + default: > > + return false; > > + } > > +} > > + > > /* > > * The madvise(2) system call. > > * > > @@ -1042,6 +1055,11 @@ madvise_behavior_valid(int behavior) > > * MADV_DONTDUMP - the application wants to prevent pages in the given range > > * from being included in its core dump. > > * MADV_DODUMP - cancel MADV_DONTDUMP: no longer exclude from core dump. > > + * MADV_COLD - the application is not expected to use this memory soon, > > + * deactivate pages in this range so that they can be reclaimed > > + * easily if memory pressure hanppens. > > + * MADV_PAGEOUT - the application is not expected to use this memory soon, > > + * page out the pages in this range immediately. 
> > * > > * return values: > > * zero - success > > @@ -1176,3 +1194,106 @@ SYSCALL_DEFINE3(madvise, unsigned long, start, size_t, len_in, int, behavior) > > { > > return do_madvise(current, current->mm, start, len_in, behavior); > > } > > + > > +static int process_madvise_vec(struct task_struct *target_task, > > + struct mm_struct *mm, struct iov_iter *iter, int behavior) > > +{ > > + struct iovec iovec; > > + int ret = 0; > > + > > + while (iov_iter_count(iter)) { > > + iovec = iov_iter_iovec(iter); > > + ret = do_madvise(target_task, mm, (unsigned long)iovec.iov_base, > > + iovec.iov_len, behavior); > > + if (ret < 0) > > + break; > > + iov_iter_advance(iter, iovec.iov_len); > > + } > > + > > + return ret; > > +} > > + > > +static ssize_t do_process_madvise(int pidfd, struct iov_iter *iter, > > + int behavior, unsigned int flags) > > +{ > > + ssize_t ret; > > + struct pid *pid; > > + struct task_struct *task; > > + struct mm_struct *mm; > > + size_t total_len = iov_iter_count(iter); > > + > > + if (flags != 0) > > + return -EINVAL; > > + > > + pid = pidfd_get_pid(pidfd); > > + if (IS_ERR(pid)) > > + return PTR_ERR(pid); > > + > > + task = get_pid_task(pid, PIDTYPE_PID); > > + if (!task) { > > + ret = -ESRCH; > > + goto put_pid; > > + } > > + > > + if (task->mm != current->mm && > > + !process_madvise_behavior_valid(behavior)) { > > + ret = -EINVAL; > > + goto release_task; > > + } > > + > > + mm = mm_access(task, PTRACE_MODE_ATTACH_FSCREDS); > > + if (IS_ERR_OR_NULL(mm)) { > > + ret = IS_ERR(mm) ? PTR_ERR(mm) : -ESRCH; > > + goto release_task; > > + } > > > > mm is always task->mm right? I'm wondering if it would be better to find > the mm directly in process_madvise_vec() rather than passing it into the > function. I'm not sure why we'd pass both task and mm here. That's because of hint Jann provided in the past version. https://lore.kernel.org/linux-api/CAG48ez27=pwm5m_N_988xT1huO7g7h6arTQL44zev6TD-h-7Tg@mail.gmail.com/ Thanks for the review, David.
On Mon, Jun 22, 2020 at 9:29 PM Minchan Kim <minchan@kernel.org> wrote: > So finally, the API is as follows, > > ssize_t process_madvise(int pidfd, const struct iovec *iovec, > unsigned long vlen, int advice, unsigned int flags); I had not followed the discussion earlier and only now came across the syscall in linux-next, sorry for stirring things up this late. > diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl > index 94bf4958d114..8f959d90338a 100644 > --- a/arch/x86/entry/syscalls/syscall_64.tbl > +++ b/arch/x86/entry/syscalls/syscall_64.tbl > @@ -364,6 +364,7 @@ > 440 common watch_mount sys_watch_mount > 441 common watch_sb sys_watch_sb > 442 common fsinfo sys_fsinfo > +443 64 process_madvise sys_process_madvise > > # > # x32-specific system call numbers start at 512 to avoid cache impact > @@ -407,3 +408,4 @@ > 545 x32 execveat compat_sys_execveat > 546 x32 preadv2 compat_sys_preadv64v2 > 547 x32 pwritev2 compat_sys_pwritev64v2 > +548 x32 process_madvise compat_sys_process_madvise I think we should not add any new x32-specific syscalls. Instead I think the compat_sys_process_madvise/sys_process_madvise can be merged into one. > + mm = mm_access(task, PTRACE_MODE_ATTACH_FSCREDS); > + if (IS_ERR_OR_NULL(mm)) { > + ret = IS_ERR(mm) ? PTR_ERR(mm) : -ESRCH; > + goto release_task; > + } Minor point: Having to use IS_ERR_OR_NULL() tends to be fragile, and I would try to avoid that. Can mm_access() be changed to itself return PTR_ERR(-ESRCH) instead of NULL to improve its calling conventions? I see there are only three other callers. > + ret = import_iovec(READ, vec, vlen, ARRAY_SIZE(iovstack), &iov, &iter); > + if (ret >= 0) { > + ret = do_process_madvise(pidfd, &iter, behavior, flags); > + kfree(iov); > + } > + return ret; > +} > + > +#ifdef CONFIG_COMPAT ... 
> + > + ret = compat_import_iovec(READ, vec, vlen, ARRAY_SIZE(iovstack), > + &iov, &iter); > + if (ret >= 0) { > + ret = do_process_madvise(pidfd, &iter, behavior, flags); > + kfree(iov); > + } Every syscall that passes an iovec seems to do this. If we make import_iovec() handle both cases directly, this syscall and a number of others can be simplified, and you avoid the x32 entry point I mentioned above Something like (untested) index dad8d0cfaaf7..0de4ddff24c1 100644 --- a/lib/iov_iter.c +++ b/lib/iov_iter.c @@ -1683,8 +1683,13 @@ ssize_t import_iovec(int type, const struct iovec __user * uvector, { ssize_t n; struct iovec *p; - n = rw_copy_check_uvector(type, uvector, nr_segs, fast_segs, - *iov, &p); + + if (in_compat_syscall()) + n = compat_rw_copy_check_uvector(type, uvector, nr_segs, + fast_segs, *iov, &p); + else + n = rw_copy_check_uvector(type, uvector, nr_segs, + fast_segs, *iov, &p); if (n < 0) { if (p != *iov) kfree(p); Arnd
On 8/28/20 11:40 AM, Arnd Bergmann wrote: > On Mon, Jun 22, 2020 at 9:29 PM Minchan Kim <minchan@kernel.org> wrote: >> So finally, the API is as follows, >> >> ssize_t process_madvise(int pidfd, const struct iovec *iovec, >> unsigned long vlen, int advice, unsigned int flags); > > I had not followed the discussion earlier and only now came across > the syscall in linux-next, sorry for stirring things up this late. > >> diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl >> index 94bf4958d114..8f959d90338a 100644 >> --- a/arch/x86/entry/syscalls/syscall_64.tbl >> +++ b/arch/x86/entry/syscalls/syscall_64.tbl >> @@ -364,6 +364,7 @@ >> 440 common watch_mount sys_watch_mount >> 441 common watch_sb sys_watch_sb >> 442 common fsinfo sys_fsinfo >> +443 64 process_madvise sys_process_madvise >> >> # >> # x32-specific system call numbers start at 512 to avoid cache impact >> @@ -407,3 +408,4 @@ >> 545 x32 execveat compat_sys_execveat >> 546 x32 preadv2 compat_sys_preadv64v2 >> 547 x32 pwritev2 compat_sys_pwritev64v2 >> +548 x32 process_madvise compat_sys_process_madvise > > I think we should not add any new x32-specific syscalls. Instead I think > the compat_sys_process_madvise/sys_process_madvise can be > merged into one. > >> + mm = mm_access(task, PTRACE_MODE_ATTACH_FSCREDS); >> + if (IS_ERR_OR_NULL(mm)) { >> + ret = IS_ERR(mm) ? PTR_ERR(mm) : -ESRCH; >> + goto release_task; >> + } > > Minor point: Having to use IS_ERR_OR_NULL() tends to be fragile, > and I would try to avoid that. Can mm_access() be changed to > itself return PTR_ERR(-ESRCH) instead of NULL to improve its > calling conventions? I see there are only three other callers. > > >> + ret = import_iovec(READ, vec, vlen, ARRAY_SIZE(iovstack), &iov, &iter); >> + if (ret >= 0) { >> + ret = do_process_madvise(pidfd, &iter, behavior, flags); >> + kfree(iov); >> + } >> + return ret; >> +} >> + >> +#ifdef CONFIG_COMPAT > ... 
>> + >> + ret = compat_import_iovec(READ, vec, vlen, ARRAY_SIZE(iovstack), >> + &iov, &iter); >> + if (ret >= 0) { >> + ret = do_process_madvise(pidfd, &iter, behavior, flags); >> + kfree(iov); >> + } > > Every syscall that passes an iovec seems to do this. If we make import_iovec() > handle both cases directly, this syscall and a number of others can > be simplified, and you avoid the x32 entry point I mentioned above > > Something like (untested) > > index dad8d0cfaaf7..0de4ddff24c1 100644 > --- a/lib/iov_iter.c > +++ b/lib/iov_iter.c > @@ -1683,8 +1683,13 @@ ssize_t import_iovec(int type, const struct > iovec __user * uvector, > { > ssize_t n; > struct iovec *p; > - n = rw_copy_check_uvector(type, uvector, nr_segs, fast_segs, > - *iov, &p); > + > + if (in_compat_syscall()) > + n = compat_rw_copy_check_uvector(type, uvector, nr_segs, > + fast_segs, *iov, &p); > + else > + n = rw_copy_check_uvector(type, uvector, nr_segs, > + fast_segs, *iov, &p); > if (n < 0) { > if (p != *iov) > kfree(p); Doesn't work for the async case, where you want to be holding on to the allocated iovec. But in general I think it's a good helper for the sync case, which is by far the majority.
On Fri, Aug 28, 2020 at 8:24 PM Jens Axboe <axboe@kernel.dk> wrote: > > On 8/28/20 11:40 AM, Arnd Bergmann wrote: > > On Mon, Jun 22, 2020 at 9:29 PM Minchan Kim <minchan@kernel.org> wrote: > >> So finally, the API is as follows, > >> > >> ssize_t process_madvise(int pidfd, const struct iovec *iovec, > >> unsigned long vlen, int advice, unsigned int flags); > > > > I had not followed the discussion earlier and only now came across > > the syscall in linux-next, sorry for stirring things up this late. > > > >> diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl > >> index 94bf4958d114..8f959d90338a 100644 > >> --- a/arch/x86/entry/syscalls/syscall_64.tbl > >> +++ b/arch/x86/entry/syscalls/syscall_64.tbl > >> @@ -364,6 +364,7 @@ > >> 440 common watch_mount sys_watch_mount > >> 441 common watch_sb sys_watch_sb > >> 442 common fsinfo sys_fsinfo > >> +443 64 process_madvise sys_process_madvise > >> > >> # > >> # x32-specific system call numbers start at 512 to avoid cache impact > >> @@ -407,3 +408,4 @@ > >> 545 x32 execveat compat_sys_execveat > >> 546 x32 preadv2 compat_sys_preadv64v2 > >> 547 x32 pwritev2 compat_sys_pwritev64v2 > >> +548 x32 process_madvise compat_sys_process_madvise > > > > I think we should not add any new x32-specific syscalls. Instead I think > > the compat_sys_process_madvise/sys_process_madvise can be > > merged into one. > > > >> + mm = mm_access(task, PTRACE_MODE_ATTACH_FSCREDS); > >> + if (IS_ERR_OR_NULL(mm)) { > >> + ret = IS_ERR(mm) ? PTR_ERR(mm) : -ESRCH; > >> + goto release_task; > >> + } > > > > Minor point: Having to use IS_ERR_OR_NULL() tends to be fragile, > > and I would try to avoid that. Can mm_access() be changed to > > itself return PTR_ERR(-ESRCH) instead of NULL to improve its > > calling conventions? I see there are only three other callers. 
> > > > > >> + ret = import_iovec(READ, vec, vlen, ARRAY_SIZE(iovstack), &iov, &iter); > >> + if (ret >= 0) { > >> + ret = do_process_madvise(pidfd, &iter, behavior, flags); > >> + kfree(iov); > >> + } > >> + return ret; > >> +} > >> + > >> +#ifdef CONFIG_COMPAT > > ... > >> + > >> + ret = compat_import_iovec(READ, vec, vlen, ARRAY_SIZE(iovstack), > >> + &iov, &iter); > >> + if (ret >= 0) { > >> + ret = do_process_madvise(pidfd, &iter, behavior, flags); > >> + kfree(iov); > >> + } > > > > > > Every syscall that passes an iovec seems to do this. If we make import_iovec() > > > handle both cases directly, this syscall and a number of others can > > > be simplified, and you avoid the x32 entry point I mentioned above > > > > > > Something like (untested) > > > > > > index dad8d0cfaaf7..0de4ddff24c1 100644 > > > --- a/lib/iov_iter.c > > > +++ b/lib/iov_iter.c > > > @@ -1683,8 +1683,13 @@ ssize_t import_iovec(int type, const struct > > > iovec __user * uvector, > > > { > > > ssize_t n; > > > struct iovec *p; > > > - n = rw_copy_check_uvector(type, uvector, nr_segs, fast_segs, > > > - *iov, &p); > > > + > > > + if (in_compat_syscall()) I suggested the exact same solutions roughly 1.5 weeks ago. :) Fun when I saw you mentioning this in BBB I knew exactly what you were referring to. :) Christian
On Fri, Aug 28, 2020 at 08:25:34PM +0200, Christian Brauner wrote: > On Fri, Aug 28, 2020 at 8:24 PM Jens Axboe <axboe@kernel.dk> wrote: > > > > On 8/28/20 11:40 AM, Arnd Bergmann wrote: > > > On Mon, Jun 22, 2020 at 9:29 PM Minchan Kim <minchan@kernel.org> wrote: > > >> So finally, the API is as follows, > > >> > > >> ssize_t process_madvise(int pidfd, const struct iovec *iovec, > > >> unsigned long vlen, int advice, unsigned int flags); > > > > > > I had not followed the discussion earlier and only now came across > > > the syscall in linux-next, sorry for stirring things up this late. > > > > > >> diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl > > >> index 94bf4958d114..8f959d90338a 100644 > > >> --- a/arch/x86/entry/syscalls/syscall_64.tbl > > >> +++ b/arch/x86/entry/syscalls/syscall_64.tbl > > >> @@ -364,6 +364,7 @@ > > >> 440 common watch_mount sys_watch_mount > > >> 441 common watch_sb sys_watch_sb > > >> 442 common fsinfo sys_fsinfo > > >> +443 64 process_madvise sys_process_madvise > > >> > > >> # > > >> # x32-specific system call numbers start at 512 to avoid cache impact > > >> @@ -407,3 +408,4 @@ > > >> 545 x32 execveat compat_sys_execveat > > >> 546 x32 preadv2 compat_sys_preadv64v2 > > >> 547 x32 pwritev2 compat_sys_pwritev64v2 > > >> +548 x32 process_madvise compat_sys_process_madvise > > > > > > I think we should not add any new x32-specific syscalls. Instead I think > > > the compat_sys_process_madvise/sys_process_madvise can be > > > merged into one. > > > > > >> + mm = mm_access(task, PTRACE_MODE_ATTACH_FSCREDS); > > >> + if (IS_ERR_OR_NULL(mm)) { > > >> + ret = IS_ERR(mm) ? PTR_ERR(mm) : -ESRCH; > > >> + goto release_task; > > >> + } > > > > > > Minor point: Having to use IS_ERR_OR_NULL() tends to be fragile, > > > and I would try to avoid that. Can mm_access() be changed to > > > itself return PTR_ERR(-ESRCH) instead of NULL to improve its > > > calling conventions? 
I see there are only three other callers. > > > > > > > > >> + ret = import_iovec(READ, vec, vlen, ARRAY_SIZE(iovstack), &iov, &iter); > > >> + if (ret >= 0) { > > >> + ret = do_process_madvise(pidfd, &iter, behavior, flags); > > >> + kfree(iov); > > >> + } > > >> + return ret; > > >> +} > > >> + > > >> +#ifdef CONFIG_COMPAT > > > ... > > >> + > > >> + ret = compat_import_iovec(READ, vec, vlen, ARRAY_SIZE(iovstack), > > >> + &iov, &iter); > > >> + if (ret >= 0) { > > >> + ret = do_process_madvise(pidfd, &iter, behavior, flags); > > >> + kfree(iov); > > >> + } > > > > > > Every syscall that passes an iovec seems to do this. If we make import_iovec() > > > handle both cases directly, this syscall and a number of others can > > > be simplified, and you avoid the x32 entry point I mentioned above > > > > > > Something like (untested) > > > > > > index dad8d0cfaaf7..0de4ddff24c1 100644 > > > --- a/lib/iov_iter.c > > > +++ b/lib/iov_iter.c > > > @@ -1683,8 +1683,13 @@ ssize_t import_iovec(int type, const struct > > > iovec __user * uvector, > > > { > > > ssize_t n; > > > struct iovec *p; > > > - n = rw_copy_check_uvector(type, uvector, nr_segs, fast_segs, > > > - *iov, &p); > > > + > > > + if (in_compat_syscall()) > > I suggested the exact same solutions roughly 1.5 weeks ago. :) > Fun when I saw you mentioning this in BBB I knew exactly what you were > referring too. :) > https://lore.kernel.org/linux-man/20200816081227.ngw3l45c5uncesmr@wittgenstein/ Yes, Christian suggested the idea but mostly for only this new syscall. I don't have the time to revise the patchset yet but may have next week. I will follow Christian's suggestion. Thanks.
On 8/28/20 12:24 PM, Jens Axboe wrote: > On 8/28/20 11:40 AM, Arnd Bergmann wrote: >> On Mon, Jun 22, 2020 at 9:29 PM Minchan Kim <minchan@kernel.org> wrote: >>> So finally, the API is as follows, >>> >>> ssize_t process_madvise(int pidfd, const struct iovec *iovec, >>> unsigned long vlen, int advice, unsigned int flags); >> >> I had not followed the discussion earlier and only now came across >> the syscall in linux-next, sorry for stirring things up this late. >> >>> diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl >>> index 94bf4958d114..8f959d90338a 100644 >>> --- a/arch/x86/entry/syscalls/syscall_64.tbl >>> +++ b/arch/x86/entry/syscalls/syscall_64.tbl >>> @@ -364,6 +364,7 @@ >>> 440 common watch_mount sys_watch_mount >>> 441 common watch_sb sys_watch_sb >>> 442 common fsinfo sys_fsinfo >>> +443 64 process_madvise sys_process_madvise >>> >>> # >>> # x32-specific system call numbers start at 512 to avoid cache impact >>> @@ -407,3 +408,4 @@ >>> 545 x32 execveat compat_sys_execveat >>> 546 x32 preadv2 compat_sys_preadv64v2 >>> 547 x32 pwritev2 compat_sys_pwritev64v2 >>> +548 x32 process_madvise compat_sys_process_madvise >> >> I think we should not add any new x32-specific syscalls. Instead I think >> the compat_sys_process_madvise/sys_process_madvise can be >> merged into one. >> >>> + mm = mm_access(task, PTRACE_MODE_ATTACH_FSCREDS); >>> + if (IS_ERR_OR_NULL(mm)) { >>> + ret = IS_ERR(mm) ? PTR_ERR(mm) : -ESRCH; >>> + goto release_task; >>> + } >> >> Minor point: Having to use IS_ERR_OR_NULL() tends to be fragile, >> and I would try to avoid that. Can mm_access() be changed to >> itself return PTR_ERR(-ESRCH) instead of NULL to improve its >> calling conventions? I see there are only three other callers. 
>> >> >>> + ret = import_iovec(READ, vec, vlen, ARRAY_SIZE(iovstack), &iov, &iter); >>> + if (ret >= 0) { >>> + ret = do_process_madvise(pidfd, &iter, behavior, flags); >>> + kfree(iov); >>> + } >>> + return ret; >>> +} >>> + >>> +#ifdef CONFIG_COMPAT >> ... >>> + >>> + ret = compat_import_iovec(READ, vec, vlen, ARRAY_SIZE(iovstack), >>> + &iov, &iter); >>> + if (ret >= 0) { >>> + ret = do_process_madvise(pidfd, &iter, behavior, flags); >>> + kfree(iov); >>> + } >> >> Every syscall that passes an iovec seems to do this. If we make import_iovec() >> handle both cases directly, this syscall and a number of others can >> be simplified, and you avoid the x32 entry point I mentioned above >> >> Something like (untested) >> >> index dad8d0cfaaf7..0de4ddff24c1 100644 >> --- a/lib/iov_iter.c >> +++ b/lib/iov_iter.c >> @@ -1683,8 +1683,13 @@ ssize_t import_iovec(int type, const struct >> iovec __user * uvector, >> { >> ssize_t n; >> struct iovec *p; >> - n = rw_copy_check_uvector(type, uvector, nr_segs, fast_segs, >> - *iov, &p); >> + >> + if (in_compat_syscall()) >> + n = compat_rw_copy_check_uvector(type, uvector, nr_segs, >> + fast_segs, *iov, &p); >> + else >> + n = rw_copy_check_uvector(type, uvector, nr_segs, >> + fast_segs, *iov, &p); >> if (n < 0) { >> if (p != *iov) >> kfree(p); > > Doesn't work for the async case, where you want to be holding on to the > allocated iovec. But in general I think it's a good helper for the sync > case, which is by far the majority. Nevermind, I'm an idiot for reading this totally wrong.
On Fri, Aug 28, 2020 at 9:27 PM Jens Axboe <axboe@kernel.dk> wrote: > On 8/28/20 12:24 PM, Jens Axboe wrote: > >> @@ -1683,8 +1683,13 @@ ssize_t import_iovec(int type, const struct > >> iovec __user * uvector, > >> { > >> ssize_t n; > >> struct iovec *p; > >> - n = rw_copy_check_uvector(type, uvector, nr_segs, fast_segs, > >> - *iov, &p); > >> + > >> + if (in_compat_syscall()) > >> + n = compat_rw_copy_check_uvector(type, uvector, nr_segs, > >> + fast_segs, *iov, &p); > >> + else > >> + n = rw_copy_check_uvector(type, uvector, nr_segs, > >> + fast_segs, *iov, &p); > >> if (n < 0) { > >> if (p != *iov) > >> kfree(p); > > > > Doesn't work for the async case, where you want to be holding on to the > > allocated iovec. But in general I think it's a good helper for the sync > > case, which is by far the majority. > > Nevermind, I'm an idiot for reading this totally wrong. > I think you are right about the need to pick the compat vs native behavior based on req->ctx->compat instead of in_compat_syscall() inside of io_import_iovec(). That one can probably call a lower-level version and when all other callers get changed to calling ssize_t import_iovec(int type, const struct iovec __user * uvector, unsigned nr_segs, unsigned fast_segs, struct iovec **iov, struct iov_iter *i) { return __import_iovec(type, uvector, nr_segs, fast_segs, iov, i, in_compat_syscall()); } Arnd
On Fri, Aug 28, 2020 at 07:40:08PM +0200, Arnd Bergmann wrote: > Every syscall that passes an iovec seems to do this. If we make import_iovec() > handle both cases directly, this syscall and a number of others can > be simplified, and you avoid the x32 entry point I mentioned above FYI, I do have a series that does this (even tested) and kills tons of compat syscalls by that. But by doing that I found the problem that compat syscalls issued by io_uring don't trigger in_compat_syscall(). I need to go back to fixing the io_uring vs in_compat_syscall() issue (probably for 5.9) and then submit the whole thing.
diff --git a/arch/alpha/kernel/syscalls/syscall.tbl b/arch/alpha/kernel/syscalls/syscall.tbl index 0aea820a4851..2e156975f573 100644 --- a/arch/alpha/kernel/syscalls/syscall.tbl +++ b/arch/alpha/kernel/syscalls/syscall.tbl @@ -482,3 +482,4 @@ 550 common watch_mount sys_watch_mount 551 common watch_sb sys_watch_sb 552 common fsinfo sys_fsinfo +553 common process_madvise sys_process_madvise diff --git a/arch/arm/tools/syscall.tbl b/arch/arm/tools/syscall.tbl index 74fec675e2fe..b166a5383a60 100644 --- a/arch/arm/tools/syscall.tbl +++ b/arch/arm/tools/syscall.tbl @@ -456,3 +456,4 @@ 440 common watch_mount sys_watch_mount 441 common watch_sb sys_watch_sb 442 common fsinfo sys_fsinfo +443 common process_madvise sys_process_madvise diff --git a/arch/arm64/include/asm/unistd.h b/arch/arm64/include/asm/unistd.h index 949788f5ba40..d1f7d35f986e 100644 --- a/arch/arm64/include/asm/unistd.h +++ b/arch/arm64/include/asm/unistd.h @@ -38,7 +38,7 @@ #define __ARM_NR_compat_set_tls (__ARM_NR_COMPAT_BASE + 5) #define __ARM_NR_COMPAT_END (__ARM_NR_COMPAT_BASE + 0x800) -#define __NR_compat_syscalls 443 +#define __NR_compat_syscalls 444 #endif #define __ARCH_WANT_SYS_CLONE diff --git a/arch/arm64/include/asm/unistd32.h b/arch/arm64/include/asm/unistd32.h index 629fe05d7e7d..a377aff42d39 100644 --- a/arch/arm64/include/asm/unistd32.h +++ b/arch/arm64/include/asm/unistd32.h @@ -893,6 +893,8 @@ __SYSCALL(__NR_watch_mount, sys_watch_mount) __SYSCALL(__NR_watch_sb, sys_watch_sb) #define __NR_fsinfo 442 __SYSCALL(__NR_fsinfo, sys_fsinfo) +#define __NR_fsinfo 443 +__SYSCALL(__NR_process_madvise, compat_sys_process_madvise) /* * Please add new compat syscalls above this comment and update diff --git a/arch/ia64/kernel/syscalls/syscall.tbl b/arch/ia64/kernel/syscalls/syscall.tbl index 7c9e0dba2647..71337e11f01c 100644 --- a/arch/ia64/kernel/syscalls/syscall.tbl +++ b/arch/ia64/kernel/syscalls/syscall.tbl @@ -363,3 +363,4 @@ 440 common watch_mount sys_watch_mount 441 common watch_sb 
sys_watch_sb 442 common fsinfo sys_fsinfo +443 common process_madvise sys_process_madvise diff --git a/arch/m68k/kernel/syscalls/syscall.tbl b/arch/m68k/kernel/syscalls/syscall.tbl index 0516e5eee098..460fb0f9bb4b 100644 --- a/arch/m68k/kernel/syscalls/syscall.tbl +++ b/arch/m68k/kernel/syscalls/syscall.tbl @@ -442,3 +442,4 @@ 440 common watch_mount sys_watch_mount 441 common watch_sb sys_watch_sb 442 common fsinfo sys_fsinfo +443 common process_madvise sys_process_madvise diff --git a/arch/microblaze/kernel/syscalls/syscall.tbl b/arch/microblaze/kernel/syscalls/syscall.tbl index 669584129d71..a95897a4ea76 100644 --- a/arch/microblaze/kernel/syscalls/syscall.tbl +++ b/arch/microblaze/kernel/syscalls/syscall.tbl @@ -448,3 +448,4 @@ 440 common watch_mount sys_watch_mount 441 common watch_sb sys_watch_sb 442 common fsinfo sys_fsinfo +443 common process_madvise sys_process_madvise diff --git a/arch/mips/kernel/syscalls/syscall_n32.tbl b/arch/mips/kernel/syscalls/syscall_n32.tbl index 2aac2722ca74..5ede2681f4e1 100644 --- a/arch/mips/kernel/syscalls/syscall_n32.tbl +++ b/arch/mips/kernel/syscalls/syscall_n32.tbl @@ -381,3 +381,4 @@ 440 n32 watch_mount sys_watch_mount 441 n32 watch_sb sys_watch_sb 442 n32 fsinfo sys_fsinfo +443 n32 process_madvise compat_sys_process_madvise diff --git a/arch/mips/kernel/syscalls/syscall_n64.tbl b/arch/mips/kernel/syscalls/syscall_n64.tbl index 1f854c23c5b5..daa607c4afe6 100644 --- a/arch/mips/kernel/syscalls/syscall_n64.tbl +++ b/arch/mips/kernel/syscalls/syscall_n64.tbl @@ -357,3 +357,4 @@ 440 n64 watch_mount sys_watch_mount 441 n64 watch_sb sys_watch_sb 442 n64 fsinfo sys_fsinfo +443 n64 process_madvise sys_process_madvise diff --git a/arch/mips/kernel/syscalls/syscall_o32.tbl b/arch/mips/kernel/syscalls/syscall_o32.tbl index 0b59ec2dbfcb..0dffd81fb345 100644 --- a/arch/mips/kernel/syscalls/syscall_o32.tbl +++ b/arch/mips/kernel/syscalls/syscall_o32.tbl @@ -430,3 +430,4 @@ 440 o32 watch_mount sys_watch_mount 441 o32 watch_sb 
sys_watch_sb 442 o32 fsinfo sys_fsinfo +443 o32 process_madvise sys_process_madvise compat_sys_process_madvise diff --git a/arch/parisc/kernel/syscalls/syscall.tbl b/arch/parisc/kernel/syscalls/syscall.tbl index 68d10778b7ae..09ac0b4aac30 100644 --- a/arch/parisc/kernel/syscalls/syscall.tbl +++ b/arch/parisc/kernel/syscalls/syscall.tbl @@ -440,3 +440,4 @@ 440 common watch_mount sys_watch_mount 441 common watch_sb sys_watch_sb 442 common fsinfo sys_fsinfo +443 common process_madvise sys_process_madvise compat_sys_process_madvise diff --git a/arch/powerpc/kernel/syscalls/syscall.tbl b/arch/powerpc/kernel/syscalls/syscall.tbl index 1adfad158267..3a1fecc30987 100644 --- a/arch/powerpc/kernel/syscalls/syscall.tbl +++ b/arch/powerpc/kernel/syscalls/syscall.tbl @@ -487,3 +487,4 @@ 440 common watch_mount sys_watch_mount 441 common watch_sb sys_watch_sb 442 common fsinfo sys_fsinfo +443 common process_madvise sys_process_madvise compat_sys_process_madvise diff --git a/arch/s390/kernel/syscalls/syscall.tbl b/arch/s390/kernel/syscalls/syscall.tbl index 9104f034129d..068310185c50 100644 --- a/arch/s390/kernel/syscalls/syscall.tbl +++ b/arch/s390/kernel/syscalls/syscall.tbl @@ -445,3 +445,4 @@ 440 common watch_mount sys_watch_mount sys_watch_mount 441 common watch_sb sys_watch_sb sys_watch_sb 442 common fsinfo sys_fsinfo sys_fsinfo +443 common process_madvise sys_process_madvise compat_sys_process_madvise diff --git a/arch/sh/kernel/syscalls/syscall.tbl b/arch/sh/kernel/syscalls/syscall.tbl index 1ce9c9473904..792539111ed8 100644 --- a/arch/sh/kernel/syscalls/syscall.tbl +++ b/arch/sh/kernel/syscalls/syscall.tbl @@ -445,3 +445,4 @@ 440 common watch_mount sys_watch_mount 441 common watch_sb sys_watch_sb 442 common fsinfo sys_fsinfo +443 common process_madvise sys_process_madvise diff --git a/arch/sparc/kernel/syscalls/syscall.tbl b/arch/sparc/kernel/syscalls/syscall.tbl index 7c0d97dffd35..4f8eebfcd07e 100644 --- a/arch/sparc/kernel/syscalls/syscall.tbl +++ 
b/arch/sparc/kernel/syscalls/syscall.tbl @@ -488,3 +488,4 @@ 440 common watch_mount sys_watch_mount 441 common watch_sb sys_watch_sb 442 common fsinfo sys_fsinfo +443 common process_madvise sys_process_madvise compat_sys_process_madvise diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl index f1295eae4ba8..29e49a70c99e 100644 --- a/arch/x86/entry/syscalls/syscall_32.tbl +++ b/arch/x86/entry/syscalls/syscall_32.tbl @@ -447,3 +447,4 @@ 440 i386 watch_mount sys_watch_mount 441 i386 watch_sb sys_watch_sb 442 i386 fsinfo sys_fsinfo +443 i386 process_madvise sys_process_madvise compat_sys_process_madvise diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl index 94bf4958d114..8f959d90338a 100644 --- a/arch/x86/entry/syscalls/syscall_64.tbl +++ b/arch/x86/entry/syscalls/syscall_64.tbl @@ -364,6 +364,7 @@ 440 common watch_mount sys_watch_mount 441 common watch_sb sys_watch_sb 442 common fsinfo sys_fsinfo +443 64 process_madvise sys_process_madvise # # x32-specific system call numbers start at 512 to avoid cache impact @@ -407,3 +408,4 @@ 545 x32 execveat compat_sys_execveat 546 x32 preadv2 compat_sys_preadv64v2 547 x32 pwritev2 compat_sys_pwritev64v2 +548 x32 process_madvise compat_sys_process_madvise diff --git a/arch/xtensa/kernel/syscalls/syscall.tbl b/arch/xtensa/kernel/syscalls/syscall.tbl index 7eb1d01127f4..173bd27f61dd 100644 --- a/arch/xtensa/kernel/syscalls/syscall.tbl +++ b/arch/xtensa/kernel/syscalls/syscall.tbl @@ -413,3 +413,4 @@ 440 common watch_mount sys_watch_mount 441 common watch_sb sys_watch_sb 442 common fsinfo sys_fsinfo +443 common process_madvise sys_process_madvise diff --git a/include/linux/compat.h b/include/linux/compat.h index 605a95fc5b31..4b48b6c49637 100644 --- a/include/linux/compat.h +++ b/include/linux/compat.h @@ -827,6 +827,10 @@ asmlinkage long compat_sys_pwritev64v2(unsigned long fd, unsigned long vlen, loff_t pos, rwf_t flags); #endif +asmlinkage 
ssize_t compat_sys_process_madvise(compat_int_t pidfd, + const struct compat_iovec __user *vec, + compat_ulong_t vlen, compat_int_t behavior, + compat_uint_t flags); /* * Deprecated system calls which are still defined in diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index 3b922deee72e..35cd2c0e7665 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -880,6 +880,8 @@ asmlinkage long sys_munlockall(void); asmlinkage long sys_mincore(unsigned long start, size_t len, unsigned char __user * vec); asmlinkage long sys_madvise(unsigned long start, size_t len, int behavior); +asmlinkage long sys_process_madvise(int pidfd, const struct iovec __user *vec, + unsigned long vlen, int behavior, unsigned int flags); asmlinkage long sys_remap_file_pages(unsigned long start, unsigned long size, unsigned long prot, unsigned long pgoff, unsigned long flags); diff --git a/include/uapi/asm-generic/unistd.h b/include/uapi/asm-generic/unistd.h index 25b1bdfb3e97..367cf21d0292 100644 --- a/include/uapi/asm-generic/unistd.h +++ b/include/uapi/asm-generic/unistd.h @@ -865,9 +865,11 @@ __SYSCALL(__NR_watch_mount, sys_watch_mount) __SYSCALL(__NR_watch_sb, sys_watch_sb) #define __NR_fsinfo 442 __SYSCALL(__NR_fsinfo, sys_fsinfo) +#define __NR_process_madvise 443 +__SC_COMP(__NR_process_madvise, sys_process_madvise, compat_sys_process_madvise) #undef __NR_syscalls -#define __NR_syscalls 443 +#define __NR_syscalls 444 /* * 32 bit systems traditionally used different diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c index f51a1e1a3c32..c935c1819ba3 100644 --- a/kernel/sys_ni.c +++ b/kernel/sys_ni.c @@ -287,6 +287,8 @@ COND_SYSCALL(mlockall); COND_SYSCALL(munlockall); COND_SYSCALL(mincore); COND_SYSCALL(madvise); +COND_SYSCALL(process_madvise); +COND_SYSCALL_COMPAT(process_madvise); COND_SYSCALL(remap_file_pages); COND_SYSCALL(mbind); COND_SYSCALL_COMPAT(mbind); diff --git a/mm/madvise.c b/mm/madvise.c index 551ed816eefe..23abca3f93fa 100644 --- a/mm/madvise.c +++ 
b/mm/madvise.c @@ -17,6 +17,7 @@ #include <linux/falloc.h> #include <linux/fadvise.h> #include <linux/sched.h> +#include <linux/sched/mm.h> #include <linux/ksm.h> #include <linux/fs.h> #include <linux/file.h> @@ -995,6 +996,18 @@ madvise_behavior_valid(int behavior) } } +static bool +process_madvise_behavior_valid(int behavior) +{ + switch (behavior) { + case MADV_COLD: + case MADV_PAGEOUT: + return true; + default: + return false; + } +} + /* * The madvise(2) system call. * @@ -1042,6 +1055,11 @@ madvise_behavior_valid(int behavior) * MADV_DONTDUMP - the application wants to prevent pages in the given range * from being included in its core dump. * MADV_DODUMP - cancel MADV_DONTDUMP: no longer exclude from core dump. + * MADV_COLD - the application is not expected to use this memory soon, + * deactivate pages in this range so that they can be reclaimed + * easily if memory pressure happens. + * MADV_PAGEOUT - the application is not expected to use this memory soon, + * page out the pages in this range immediately. 
* * return values: * zero - success @@ -1176,3 +1194,106 @@ SYSCALL_DEFINE3(madvise, unsigned long, start, size_t, len_in, int, behavior) { return do_madvise(current, current->mm, start, len_in, behavior); } + +static int process_madvise_vec(struct task_struct *target_task, + struct mm_struct *mm, struct iov_iter *iter, int behavior) +{ + struct iovec iovec; + int ret = 0; + + while (iov_iter_count(iter)) { + iovec = iov_iter_iovec(iter); + ret = do_madvise(target_task, mm, (unsigned long)iovec.iov_base, + iovec.iov_len, behavior); + if (ret < 0) + break; + iov_iter_advance(iter, iovec.iov_len); + } + + return ret; +} + +static ssize_t do_process_madvise(int pidfd, struct iov_iter *iter, + int behavior, unsigned int flags) +{ + ssize_t ret; + struct pid *pid; + struct task_struct *task; + struct mm_struct *mm; + size_t total_len = iov_iter_count(iter); + + if (flags != 0) + return -EINVAL; + + pid = pidfd_get_pid(pidfd); + if (IS_ERR(pid)) + return PTR_ERR(pid); + + task = get_pid_task(pid, PIDTYPE_PID); + if (!task) { + ret = -ESRCH; + goto put_pid; + } + + if (task->mm != current->mm && + !process_madvise_behavior_valid(behavior)) { + ret = -EINVAL; + goto release_task; + } + + mm = mm_access(task, PTRACE_MODE_ATTACH_FSCREDS); + if (IS_ERR_OR_NULL(mm)) { + ret = IS_ERR(mm) ? 
PTR_ERR(mm) : -ESRCH; + goto release_task; + } + + ret = process_madvise_vec(task, mm, iter, behavior); + if (ret >= 0) + ret = total_len - iov_iter_count(iter); + + mmput(mm); +release_task: + put_task_struct(task); +put_pid: + put_pid(pid); + return ret; +} + +SYSCALL_DEFINE5(process_madvise, int, pidfd, const struct iovec __user *, vec, + unsigned long, vlen, int, behavior, unsigned int, flags) +{ + ssize_t ret; + struct iovec iovstack[UIO_FASTIOV]; + struct iovec *iov = iovstack; + struct iov_iter iter; + + ret = import_iovec(READ, vec, vlen, ARRAY_SIZE(iovstack), &iov, &iter); + if (ret >= 0) { + ret = do_process_madvise(pidfd, &iter, behavior, flags); + kfree(iov); + } + return ret; +} + +#ifdef CONFIG_COMPAT +COMPAT_SYSCALL_DEFINE5(process_madvise, compat_int_t, pidfd, + const struct compat_iovec __user *, vec, + compat_ulong_t, vlen, + compat_int_t, behavior, + compat_uint_t, flags) + +{ + ssize_t ret; + struct iovec iovstack[UIO_FASTIOV]; + struct iovec *iov = iovstack; + struct iov_iter iter; + + ret = compat_import_iovec(READ, vec, vlen, ARRAY_SIZE(iovstack), + &iov, &iter); + if (ret >= 0) { + ret = do_process_madvise(pidfd, &iter, behavior, flags); + kfree(iov); + } + return ret; +} +#endif