Message ID | 20220621144205.158452-1-iii@linux.ibm.com (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
Series | [v2] linux-user: Add partial support for MADV_DONTNEED | expand |
Le 21/06/2022 à 16:42, Ilya Leoshkevich a écrit : > Currently QEMU ignores madvise(MADV_DONTNEED), which break apps that > rely on this for zeroing out memory [1]. Improve the situation by doing > a passthrough when the range in question is a host-page-aligned > anonymous mapping. > > This is based on the patches from Simon Hausmann [2] and Chris Fallin > [3]. The structure is taken from Simon's patch. The PAGE_MAP_ANONYMOUS > bits are superseded by commit 26bab757d41b ("linux-user: Introduce > PAGE_ANON"). In the end the patch acts like the one from Chris: we > either pass-through the entire syscall, or do nothing, since doing this > only partially would not help the affected applications much. Finally, > add some extra checks to match the behavior of the Linux kernel [4]. > > [1] https://gitlab.com/qemu-project/qemu/-/issues/326 > [2] https://patchew.org/QEMU/20180827084037.25316-1-simon.hausmann@qt.io/ > [3] https://github.com/bytecodealliance/wasmtime/blob/v0.37.0/ci/qemu-madvise.patch > [4] https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/mm/madvise.c?h=v5.19-rc3#n1368 > > Signed-off-by: Ilya Leoshkevich <iii@linux.ibm.com> > Reviewed-by: Laurent Vivier <laurent@vivier.eu> > --- > > v1: https://lists.gnu.org/archive/html/qemu-devel/2022-06/msg03572.html > v1 -> v2: > * Make get_errno() extern. > * Simplify errno handling (Laurent). > > linux-user/mmap.c | 64 +++++++++++++++++++++++++++++++++++++ > linux-user/syscall.c | 8 ++--- > linux-user/user-internals.h | 1 + > linux-user/user-mmap.h | 1 + > 4 files changed, 68 insertions(+), 6 deletions(-) > > diff --git a/linux-user/mmap.c b/linux-user/mmap.c > index 48e1373796..4e7a6be6ee 100644 > --- a/linux-user/mmap.c > +++ b/linux-user/mmap.c > @@ -835,3 +835,67 @@ abi_long target_mremap(abi_ulong old_addr, abi_ulong old_size, > mmap_unlock(); > return new_addr; > } > + > +static bool can_passthrough_madv_dontneed(abi_ulong start, abi_ulong end) > +{ > + ulong addr; > + > + if ((start | end) & ~qemu_host_page_mask) { > + return false; > + } > + > + for (addr = start; addr < end; addr += TARGET_PAGE_SIZE) { > + if (!(page_get_flags(addr) & PAGE_ANON)) { > + return false; > + } > + } > + > + return true; > +} > + > +abi_long target_madvise(abi_ulong start, abi_ulong len_in, int advice) > +{ > + abi_ulong len, end; > + int ret = 0; > + > + if (start & ~TARGET_PAGE_MASK) { > + return -TARGET_EINVAL; > + } > + len = TARGET_PAGE_ALIGN(len_in); > + > + if (len_in && !len) { > + return -TARGET_EINVAL; > + } > + > + end = start + len; > + if (end < start) { > + return -TARGET_EINVAL; > + } > + > + if (end == start) { > + return 0; > + } > + > + if (!guest_range_valid_untagged(start, len)) { > + return -TARGET_EINVAL; > + } > + > + /* > + * A straight passthrough may not be safe because qemu sometimes turns > + * private file-backed mappings into anonymous mappings. > + * > + * This is a hint, so ignoring and returning success is ok. > + * > + * This breaks MADV_DONTNEED, completely implementing which is quite > + * complicated. However, there is one low-hanging fruit: host-page-aligned > + * anonymous mappings. In this case passthrough is safe, so do it. > + */ > + mmap_lock(); > + if ((advice & MADV_DONTNEED) && > + can_passthrough_madv_dontneed(start, end)) { > + ret = get_errno(madvise(g2h_untagged(start), len, MADV_DONTNEED)); > + } > + mmap_unlock(); > + > + return ret; > +} > diff --git a/linux-user/syscall.c b/linux-user/syscall.c > index f55cdebee5..8f68f255c0 100644 > --- a/linux-user/syscall.c > +++ b/linux-user/syscall.c > @@ -538,7 +538,7 @@ static inline int target_to_host_errno(int target_errno) > } > } > > -static inline abi_long get_errno(abi_long ret) > +abi_long get_errno(abi_long ret) > { > if (ret == -1) > return -host_to_target_errno(errno); > @@ -11807,11 +11807,7 @@ static abi_long do_syscall1(CPUArchState *cpu_env, int num, abi_long arg1, > > #ifdef TARGET_NR_madvise > case TARGET_NR_madvise: > - /* A straight passthrough may not be safe because qemu sometimes > - turns private file-backed mappings into anonymous mappings. > - This will break MADV_DONTNEED. > - This is a hint, so ignoring and returning success is ok. */ > - return 0; > + return target_madvise(arg1, arg2, arg3); > #endif > #ifdef TARGET_NR_fcntl64 > case TARGET_NR_fcntl64: > diff --git a/linux-user/user-internals.h b/linux-user/user-internals.h > index 6175ce53db..0280e76add 100644 > --- a/linux-user/user-internals.h > +++ b/linux-user/user-internals.h > @@ -65,6 +65,7 @@ abi_long do_syscall(CPUArchState *cpu_env, int num, abi_long arg1, > abi_long arg8); > extern __thread CPUState *thread_cpu; > G_NORETURN void cpu_loop(CPUArchState *env); > +abi_long get_errno(abi_long ret); > const char *target_strerror(int err); > int get_osversion(void); > void init_qemu_uname_release(void); > diff --git a/linux-user/user-mmap.h b/linux-user/user-mmap.h > index d1dec99c02..480ce1c114 100644 > --- a/linux-user/user-mmap.h > +++ b/linux-user/user-mmap.h > @@ -25,6 +25,7 @@ int target_munmap(abi_ulong start, abi_ulong len); > abi_long target_mremap(abi_ulong old_addr, abi_ulong old_size, > abi_ulong new_size, unsigned long flags, > abi_ulong new_addr); > +abi_long target_madvise(abi_ulong start, abi_ulong len_in, int advice); > extern unsigned long last_brk; > extern abi_ulong mmap_next_start; > abi_ulong mmap_find_vma(abi_ulong, abi_ulong, abi_ulong); Applied to my linux-user-for-7.1 branch. Thanks, Laurent
diff --git a/linux-user/mmap.c b/linux-user/mmap.c index 48e1373796..4e7a6be6ee 100644 --- a/linux-user/mmap.c +++ b/linux-user/mmap.c @@ -835,3 +835,67 @@ abi_long target_mremap(abi_ulong old_addr, abi_ulong old_size, mmap_unlock(); return new_addr; } + +static bool can_passthrough_madv_dontneed(abi_ulong start, abi_ulong end) +{ + ulong addr; + + if ((start | end) & ~qemu_host_page_mask) { + return false; + } + + for (addr = start; addr < end; addr += TARGET_PAGE_SIZE) { + if (!(page_get_flags(addr) & PAGE_ANON)) { + return false; + } + } + + return true; +} + +abi_long target_madvise(abi_ulong start, abi_ulong len_in, int advice) +{ + abi_ulong len, end; + int ret = 0; + + if (start & ~TARGET_PAGE_MASK) { + return -TARGET_EINVAL; + } + len = TARGET_PAGE_ALIGN(len_in); + + if (len_in && !len) { + return -TARGET_EINVAL; + } + + end = start + len; + if (end < start) { + return -TARGET_EINVAL; + } + + if (end == start) { + return 0; + } + + if (!guest_range_valid_untagged(start, len)) { + return -TARGET_EINVAL; + } + + /* + * A straight passthrough may not be safe because qemu sometimes turns + * private file-backed mappings into anonymous mappings. + * + * This is a hint, so ignoring and returning success is ok. + * + * This breaks MADV_DONTNEED, completely implementing which is quite + * complicated. However, there is one low-hanging fruit: host-page-aligned + * anonymous mappings. In this case passthrough is safe, so do it. + */ + mmap_lock(); + if ((advice & MADV_DONTNEED) && + can_passthrough_madv_dontneed(start, end)) { + ret = get_errno(madvise(g2h_untagged(start), len, MADV_DONTNEED)); + } + mmap_unlock(); + + return ret; +} diff --git a/linux-user/syscall.c b/linux-user/syscall.c index f55cdebee5..8f68f255c0 100644 --- a/linux-user/syscall.c +++ b/linux-user/syscall.c @@ -538,7 +538,7 @@ static inline int target_to_host_errno(int target_errno) } } -static inline abi_long get_errno(abi_long ret) +abi_long get_errno(abi_long ret) { if (ret == -1) return -host_to_target_errno(errno); @@ -11807,11 +11807,7 @@ static abi_long do_syscall1(CPUArchState *cpu_env, int num, abi_long arg1, #ifdef TARGET_NR_madvise case TARGET_NR_madvise: - /* A straight passthrough may not be safe because qemu sometimes - turns private file-backed mappings into anonymous mappings. - This will break MADV_DONTNEED. - This is a hint, so ignoring and returning success is ok. */ - return 0; + return target_madvise(arg1, arg2, arg3); #endif #ifdef TARGET_NR_fcntl64 case TARGET_NR_fcntl64: diff --git a/linux-user/user-internals.h b/linux-user/user-internals.h index 6175ce53db..0280e76add 100644 --- a/linux-user/user-internals.h +++ b/linux-user/user-internals.h @@ -65,6 +65,7 @@ abi_long do_syscall(CPUArchState *cpu_env, int num, abi_long arg1, abi_long arg8); extern __thread CPUState *thread_cpu; G_NORETURN void cpu_loop(CPUArchState *env); +abi_long get_errno(abi_long ret); const char *target_strerror(int err); int get_osversion(void); void init_qemu_uname_release(void); diff --git a/linux-user/user-mmap.h b/linux-user/user-mmap.h index d1dec99c02..480ce1c114 100644 --- a/linux-user/user-mmap.h +++ b/linux-user/user-mmap.h @@ -25,6 +25,7 @@ int target_munmap(abi_ulong start, abi_ulong len); abi_long target_mremap(abi_ulong old_addr, abi_ulong old_size, abi_ulong new_size, unsigned long flags, abi_ulong new_addr); +abi_long target_madvise(abi_ulong start, abi_ulong len_in, int advice); extern unsigned long last_brk; extern abi_ulong mmap_next_start; abi_ulong mmap_find_vma(abi_ulong, abi_ulong, abi_ulong);