Message ID | 1529364856-49589-3-git-send-email-yang.shi@linux.alibaba.com (mailing list archive)
---|---
State | New, archived
On Tue, Jun 19, 2018 at 07:34:16AM +0800, Yang Shi wrote: > diff --git a/mm/mmap.c b/mm/mmap.c > index fc41c05..e84f80c 100644 > --- a/mm/mmap.c > +++ b/mm/mmap.c > @@ -2686,6 +2686,141 @@ int split_vma(struct mm_struct *mm, struct vm_area_struct *vma, > return __split_vma(mm, vma, addr, new_below); > } > > +/* Consider PUD size or 1GB mapping as large mapping */ > +#ifdef HPAGE_PUD_SIZE > +#define LARGE_MAP_THRESH HPAGE_PUD_SIZE > +#else > +#define LARGE_MAP_THRESH (1 * 1024 * 1024 * 1024) > +#endif > + > +/* Unmap large mapping early with acquiring read mmap_sem */ > +static int do_munmap_zap_early(struct mm_struct *mm, unsigned long start, > + size_t len, struct list_head *uf) > +{ > + unsigned long end = 0; > + struct vm_area_struct *vma = NULL, *prev, *last, *tmp; > + bool success = false; > + int ret = 0; > + > + if ((offset_in_page(start)) || start > TASK_SIZE || len > TASK_SIZE - start) > + return -EINVAL; > + > + len = (PAGE_ALIGN(len)); > + if (len == 0) > + return -EINVAL; > + > + /* Just deal with uf in regular path */ > + if (unlikely(uf)) > + goto regular_path; > + > + if (len >= LARGE_MAP_THRESH) { > + down_read(&mm->mmap_sem); > + vma = find_vma(mm, start); > + if (!vma) { > + up_read(&mm->mmap_sem); > + return 0; > + } > + > + prev = vma->vm_prev; > + > + end = start + len; > + if (vma->vm_start > end) { > + up_read(&mm->mmap_sem); > + return 0; > + } > + > + if (start > vma->vm_start) { > + int error; > + > + if (end < vma->vm_end && > + mm->map_count > sysctl_max_map_count) { > + up_read(&mm->mmap_sem); > + return -ENOMEM; > + } > + > + error = __split_vma(mm, vma, start, 0); > + if (error) { > + up_read(&mm->mmap_sem); > + return error; > + } > + prev = vma; > + } > + > + last = find_vma(mm, end); > + if (last && end > last->vm_start) { > + int error = __split_vma(mm, last, end, 1); > + > + if (error) { > + up_read(&mm->mmap_sem); > + return error; > + } > + } > + vma = prev ? prev->vm_next : mm->mmap; Hold up, two things: you having to copy most of do_munmap() didn't seem to suggest a helper function? And second, since when are we allowed to split VMAs under a read lock?
On 6/19/18 3:02 AM, Peter Zijlstra wrote:
> On Tue, Jun 19, 2018 at 07:34:16AM +0800, Yang Shi wrote:
>> diff --git a/mm/mmap.c b/mm/mmap.c
>> index fc41c05..e84f80c 100644
>> --- a/mm/mmap.c
>> +++ b/mm/mmap.c
>> (snip)
>> + vma = prev ? prev->vm_next : mm->mmap;
> Hold up, two things: you having to copy most of do_munmap() didn't seem
> to suggest a helper function? And second, since when are we allowed to
> split VMAs under a read lock?

Yes, they will be extracted into a helper function in the next version.

My bad, I don't think it is allowed. We could rework this to:

acquire write mmap_sem
vma lookup (split vmas)
release write mmap_sem

acquire read mmap_sem
zap pages
release read mmap_sem

I suppose this is safe, as Michal said before.

Thanks,
Yang
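For reference, a minimal sketch of that reworked sequence against a 4.x-era mm. The helper name munmap_split_then_zap() and the simplified error handling are illustrative only, not the actual patch, and the sketch skips the VM_LOCKED/VM_HUGETLB/VM_PFNMAP/uprobes special cases the real code has to handle:

static int munmap_split_then_zap(struct mm_struct *mm, unsigned long start,
				 size_t len)
{
	unsigned long end = start + PAGE_ALIGN(len);
	struct vm_area_struct *vma;
	int ret = 0;

	/* Phase 1: splitting VMAs still needs the write lock. */
	if (down_write_killable(&mm->mmap_sem))
		return -EINTR;
	vma = find_vma(mm, start);
	if (vma && start > vma->vm_start)
		ret = __split_vma(mm, vma, start, 0);
	/* ... and split at 'end' too, the same way do_munmap() does ... */
	up_write(&mm->mmap_sem);
	if (ret)
		return ret;

	/*
	 * Phase 2: zap the page range under the read lock, MADV_DONTNEED
	 * style.  Note that the address space may change between the
	 * phases, which is exactly the consistency concern discussed below.
	 */
	down_read(&mm->mmap_sem);
	for (vma = find_vma(mm, start); vma && vma->vm_start < end;
	     vma = vma->vm_next)
		zap_page_range(vma, max(start, vma->vm_start),
			       min(end, vma->vm_end) - max(start, vma->vm_start));
	up_read(&mm->mmap_sem);

	/* Phase 3: retake the write lock to detach and free the VMAs. */
	if (down_write_killable(&mm->mmap_sem))
		return -EINTR;
	ret = do_munmap(mm, start, len, NULL);
	up_write(&mm->mmap_sem);
	return ret;
}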
at 4:34 PM, Yang Shi <yang.shi@linux.alibaba.com> wrote: > When running some mmap/munmap scalability tests with large memory (i.e. >> 300GB), the below hung task issue may happen occasionally. > > INFO: task ps:14018 blocked for more than 120 seconds. > Tainted: G E 4.9.79-009.ali3000.alios7.x86_64 #1 > "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this > message. > ps D 0 14018 1 0x00000004 > (snip) > > Zapping pages is the most time consuming part, according to the > suggestion from Michal Hock [1], zapping pages can be done with holding > read mmap_sem, like what MADV_DONTNEED does. Then re-acquire write > mmap_sem to manipulate vmas. Does munmap() == MADV_DONTNEED + munmap() ? For example, what happens with userfaultfd in this case? Can you get an extra #PF, which would be visible to userspace, before the munmap is finished? In addition, would it be ok for the user to potentially get a zeroed page in the time window after the MADV_DONTNEED finished removing a PTE and before the munmap() is done? Regards, Nadav
On 6/19/18 3:17 PM, Nadav Amit wrote: > at 4:34 PM, Yang Shi <yang.shi@linux.alibaba.com> wrote: > >> When running some mmap/munmap scalability tests with large memory (i.e. >>> 300GB), the below hung task issue may happen occasionally. >> INFO: task ps:14018 blocked for more than 120 seconds. >> Tainted: G E 4.9.79-009.ali3000.alios7.x86_64 #1 >> "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this >> message. >> ps D 0 14018 1 0x00000004 >> > (snip) > >> Zapping pages is the most time consuming part, according to the >> suggestion from Michal Hock [1], zapping pages can be done with holding >> read mmap_sem, like what MADV_DONTNEED does. Then re-acquire write >> mmap_sem to manipulate vmas. > Does munmap() == MADV_DONTNEED + munmap() ? Not exactly the same. So, I basically copied the page zapping used by munmap instead of calling MADV_DONTNEED. > > For example, what happens with userfaultfd in this case? Can you get an > extra #PF, which would be visible to userspace, before the munmap is > finished? userfaultfd is handled by regular munmap path. So, no change to userfaultfd part. > > In addition, would it be ok for the user to potentially get a zeroed page in > the time window after the MADV_DONTNEED finished removing a PTE and before > the munmap() is done? This should be undefined behavior according to Michal. This has been discussed in https://lwn.net/Articles/753269/. Thanks, Yang > > Regards, > Nadav
at 4:08 PM, Yang Shi <yang.shi@linux.alibaba.com> wrote: > > > On 6/19/18 3:17 PM, Nadav Amit wrote: >> at 4:34 PM, Yang Shi <yang.shi@linux.alibaba.com> >> wrote: >> >> >>> When running some mmap/munmap scalability tests with large memory (i.e. >>> >>>> 300GB), the below hung task issue may happen occasionally. >>>> >>> INFO: task ps:14018 blocked for more than 120 seconds. >>> Tainted: G E 4.9.79-009.ali3000.alios7.x86_64 #1 >>> "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this >>> message. >>> ps D 0 14018 1 0x00000004 >>> >>> >> (snip) >> >> >>> Zapping pages is the most time consuming part, according to the >>> suggestion from Michal Hock [1], zapping pages can be done with holding >>> read mmap_sem, like what MADV_DONTNEED does. Then re-acquire write >>> mmap_sem to manipulate vmas. >>> >> Does munmap() == MADV_DONTNEED + munmap() ? > > Not exactly the same. So, I basically copied the page zapping used by munmap instead of calling MADV_DONTNEED. > >> >> For example, what happens with userfaultfd in this case? Can you get an >> extra #PF, which would be visible to userspace, before the munmap is >> finished? >> > > userfaultfd is handled by regular munmap path. So, no change to userfaultfd part. Right. I see it now. > >> >> In addition, would it be ok for the user to potentially get a zeroed page in >> the time window after the MADV_DONTNEED finished removing a PTE and before >> the munmap() is done? >> > > This should be undefined behavior according to Michal. This has been discussed in https://lwn.net/Articles/753269/. Thanks for the reference. Reading the man page I see: "All pages containing a part of the indicated range are unmapped, and subsequent references to these pages will generate SIGSEGV.” To me it sounds pretty well-defined, and this implementation does not follow this definition. I would expect the man page to be updated and indicate that the behavior has changed. Regards, Nadav
On Tue 19-06-18 14:13:05, Yang Shi wrote: > > > On 6/19/18 3:02 AM, Peter Zijlstra wrote: [...] > > Hold up, two things: you having to copy most of do_munmap() didn't seem > > to suggest a helper function? And second, since when are we allowed to > > Yes, they will be extracted into a helper function in the next version. > > May bad, I don't think it is allowed. We could reform this to: > > acquire write mmap_sem > vma lookup (split vmas) > release write mmap_sem > > acquire read mmap_sem > zap pages > release read mmap_sem > > I'm supposed this is safe as what Michal said before. I didn't get to read your patches carefully yet but I am wondering why do you need to split in the first place. Why cannot you simply unmap the range (madvise(DONTNEED)) under the read lock and then take the lock for write to finish the rest?
On Tue 19-06-18 17:31:27, Nadav Amit wrote:
> at 4:08 PM, Yang Shi <yang.shi@linux.alibaba.com> wrote:
>> On 6/19/18 3:17 PM, Nadav Amit wrote:
>> (snip)
>>> In addition, would it be ok for the user to potentially get a zeroed page in
>>> the time window after the MADV_DONTNEED finished removing a PTE and before
>>> the munmap() is done?
>>
>> This should be undefined behavior according to Michal. This has been discussed in https://lwn.net/Articles/753269/.
>
> Thanks for the reference.
>
> Reading the man page I see: "All pages containing a part of the indicated
> range are unmapped, and subsequent references to these pages will generate
> SIGSEGV."

Yes, this is true, but I guess what Yang Shi meant was that a userspace
access racing with munmap is not well defined. You never know whether
you get your data, #PF or SEGV because it depends on timing. The user
visible change might be that you lose content and get the zero page instead
if you hit the race window while we are unmapping, which was not possible
before. But wouldn't such an access pattern be buggy anyway? You need
some form of external synchronization AFAICS.

But maybe some userspace depends on "getting right data or get SEGV"
semantic. If we have to preserve that then we can come up with a VM_DEAD
flag set before we tear it down and force the SEGV on the #PF path.
Something similar we already do for MMF_UNSTABLE.
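To make the VM_DEAD idea concrete, the check in the fault path could look roughly like the sketch below, modelled on how MMF_UNSTABLE is already handled. The flag itself, the helper names and the hook placement are all hypothetical at this point:

static inline bool vma_is_dead(struct vm_area_struct *vma)
{
	/* VM_DEAD would be a new vm_flags bit set by munmap() before zapping. */
	return unlikely(vma->vm_flags & VM_DEAD);
}

/* Called early in the fault path, right after the VMA lookup: */
static int check_vm_dead(struct vm_area_struct *vma)
{
	if (vma_is_dead(vma))
		/* The range is being unmapped; surface it as a SEGV. */
		return VM_FAULT_SIGSEGV;
	return 0;
}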
On 6/20/18 12:17 AM, Michal Hocko wrote: > On Tue 19-06-18 14:13:05, Yang Shi wrote: >> >> On 6/19/18 3:02 AM, Peter Zijlstra wrote: > [...] >>> Hold up, two things: you having to copy most of do_munmap() didn't seem >>> to suggest a helper function? And second, since when are we allowed to >> Yes, they will be extracted into a helper function in the next version. >> >> May bad, I don't think it is allowed. We could reform this to: >> >> acquire write mmap_sem >> vma lookup (split vmas) >> release write mmap_sem >> >> acquire read mmap_sem >> zap pages >> release read mmap_sem >> >> I'm supposed this is safe as what Michal said before. > I didn't get to read your patches carefully yet but I am wondering why > do you need to split in the first place. Why cannot you simply unmap the > range (madvise(DONTNEED)) under the read lock and then take the lock for > write to finish the rest? Yes, we can. I just thought splitting vma up-front sounds more straight forward. But, I neglected the write mmap_sem issue. Will move the vma split into later write mmap_sem in the next version. Thanks, Yang
at 12:18 AM, Michal Hocko <mhocko@kernel.org> wrote: > On Tue 19-06-18 17:31:27, Nadav Amit wrote: >> at 4:08 PM, Yang Shi <yang.shi@linux.alibaba.com> wrote: >> >>> On 6/19/18 3:17 PM, Nadav Amit wrote: >>>> at 4:34 PM, Yang Shi <yang.shi@linux.alibaba.com> >>>> wrote: >>>> >>>> >>>>> When running some mmap/munmap scalability tests with large memory (i.e. >>>>> >>>>>> 300GB), the below hung task issue may happen occasionally. >>>>> INFO: task ps:14018 blocked for more than 120 seconds. >>>>> Tainted: G E 4.9.79-009.ali3000.alios7.x86_64 #1 >>>>> "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this >>>>> message. >>>>> ps D 0 14018 1 0x00000004 >>>> (snip) >>>> >>>> >>>>> Zapping pages is the most time consuming part, according to the >>>>> suggestion from Michal Hock [1], zapping pages can be done with holding >>>>> read mmap_sem, like what MADV_DONTNEED does. Then re-acquire write >>>>> mmap_sem to manipulate vmas. >>>> Does munmap() == MADV_DONTNEED + munmap() ? >>> >>> Not exactly the same. So, I basically copied the page zapping used by munmap instead of calling MADV_DONTNEED. >>> >>>> For example, what happens with userfaultfd in this case? Can you get an >>>> extra #PF, which would be visible to userspace, before the munmap is >>>> finished? >>> >>> userfaultfd is handled by regular munmap path. So, no change to userfaultfd part. >> >> Right. I see it now. >> >>>> In addition, would it be ok for the user to potentially get a zeroed page in >>>> the time window after the MADV_DONTNEED finished removing a PTE and before >>>> the munmap() is done? >>> >>> This should be undefined behavior according to Michal. This has been discussed in https://lwn.net/Articles/753269/. >> >> Thanks for the reference. >> >> Reading the man page I see: "All pages containing a part of the indicated >> range are unmapped, and subsequent references to these pages will generate >> SIGSEGV.” > > Yes, this is true but I guess what Yang Shi meant was that an userspace > access racing with munmap is not well defined. You never know whether > you get your data, #PTF or SEGV because it depends on timing. The user > visible change might be that you lose content and get zero page instead > if you hit the race window while we are unmapping which was not possible > before. But whouldn't such an access pattern be buggy anyway? You need > some form of external synchronization AFAICS. It seems to follow the specifications, so it is not clearly buggy IMHO. I don’t know of such a use-case, but if somebody does so - the proposed change might even cause a security vulnerability. > But maybe some userspace depends on "getting right data or get SEGV" > semantic. If we have to preserve that then we can come up with a VM_DEAD > flag set before we tear it down and force the SEGV on the #PF path. > Something similar we already do for MMF_UNSTABLE. That seems reasonable. Regards, Nadav
On 6/20/18 12:18 AM, Michal Hocko wrote: > On Tue 19-06-18 17:31:27, Nadav Amit wrote: >> at 4:08 PM, Yang Shi <yang.shi@linux.alibaba.com> wrote: >> >>> >>> On 6/19/18 3:17 PM, Nadav Amit wrote: >>>> at 4:34 PM, Yang Shi <yang.shi@linux.alibaba.com> >>>> wrote: >>>> >>>> >>>>> When running some mmap/munmap scalability tests with large memory (i.e. >>>>> >>>>>> 300GB), the below hung task issue may happen occasionally. >>>>>> >>>>> INFO: task ps:14018 blocked for more than 120 seconds. >>>>> Tainted: G E 4.9.79-009.ali3000.alios7.x86_64 #1 >>>>> "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this >>>>> message. >>>>> ps D 0 14018 1 0x00000004 >>>>> >>>>> >>>> (snip) >>>> >>>> >>>>> Zapping pages is the most time consuming part, according to the >>>>> suggestion from Michal Hock [1], zapping pages can be done with holding >>>>> read mmap_sem, like what MADV_DONTNEED does. Then re-acquire write >>>>> mmap_sem to manipulate vmas. >>>>> >>>> Does munmap() == MADV_DONTNEED + munmap() ? >>> Not exactly the same. So, I basically copied the page zapping used by munmap instead of calling MADV_DONTNEED. >>> >>>> For example, what happens with userfaultfd in this case? Can you get an >>>> extra #PF, which would be visible to userspace, before the munmap is >>>> finished? >>>> >>> userfaultfd is handled by regular munmap path. So, no change to userfaultfd part. >> Right. I see it now. >> >>>> In addition, would it be ok for the user to potentially get a zeroed page in >>>> the time window after the MADV_DONTNEED finished removing a PTE and before >>>> the munmap() is done? >>>> >>> This should be undefined behavior according to Michal. This has been discussed in https://lwn.net/Articles/753269/. >> Thanks for the reference. >> >> Reading the man page I see: "All pages containing a part of the indicated >> range are unmapped, and subsequent references to these pages will generate >> SIGSEGV.” > Yes, this is true but I guess what Yang Shi meant was that an userspace > access racing with munmap is not well defined. You never know whether > you get your data, #PTF or SEGV because it depends on timing. The user > visible change might be that you lose content and get zero page instead > if you hit the race window while we are unmapping which was not possible > before. But whouldn't such an access pattern be buggy anyway? You need > some form of external synchronization AFAICS. > > But maybe some userspace depends on "getting right data or get SEGV" > semantic. If we have to preserve that then we can come up with a VM_DEAD > flag set before we tear it down and force the SEGV on the #PF path. > Something similar we already do for MMF_UNSTABLE. Set VM_DEAD with read mmap_sem held? It should be ok since this is the only place to set this flag for this unique special case. Yang
>> Yes, this is true but I guess what Yang Shi meant was that an userspace
>> access racing with munmap is not well defined. You never know whether
>> you get your data, #PTF or SEGV because it depends on timing. The user
>> visible change might be that you lose content and get zero page instead
>> if you hit the race window while we are unmapping which was not possible
>> before. But whouldn't such an access pattern be buggy anyway? You need
>> some form of external synchronization AFAICS.
>>
>> But maybe some userspace depends on "getting right data or get SEGV"
>> semantic. If we have to preserve that then we can come up with a VM_DEAD
>> flag set before we tear it down and force the SEGV on the #PF path.
>> Something similar we already do for MMF_UNSTABLE.
>
> Set VM_DEAD with read mmap_sem held? It should be ok since this is the
> only place to set this flag for this unique special case.

BTW, it looks like the vm_flags bits have all been used up on 32-bit. If we
really need VM_DEAD, it has to be available for both 32-bit and 64-bit.
Any suggestion?

Thanks,
Yang
On Fri 22-06-18 18:01:08, Yang Shi wrote: > Yes, this is true but I guess what Yang Shi meant was that an userspace > > > access racing with munmap is not well defined. You never know whether > > > you get your data, #PTF or SEGV because it depends on timing. The user > > > visible change might be that you lose content and get zero page instead > > > if you hit the race window while we are unmapping which was not possible > > > before. But whouldn't such an access pattern be buggy anyway? You need > > > some form of external synchronization AFAICS. > > > > > > But maybe some userspace depends on "getting right data or get SEGV" > > > semantic. If we have to preserve that then we can come up with a VM_DEAD > > > flag set before we tear it down and force the SEGV on the #PF path. > > > Something similar we already do for MMF_UNSTABLE. > > > > Set VM_DEAD with read mmap_sem held? It should be ok since this is the > > only place to set this flag for this unique special case. > > BTW, it looks the vm flags have used up in 32 bit. If we really need > VM_DEAD, it should be for both 32-bit and 64-bit. Do we really need any special handling for 32b? Who is going to create GB mappings for all this to be worth doing?
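If the flag is only ever needed on the large-mapping path, one way to dodge the exhausted 32-bit vm_flags space would be to make it 64-bit only. The sketch below is illustrative; the bit number is not a claim about which vm_flags bits are actually free:

#ifdef CONFIG_64BIT
#define VM_DEAD		BIT(37)		/* hypothetical high vm_flags bit, 64-bit only */
#else
#define VM_DEAD		0		/* 32-bit never takes the large-unmap path */
#endif

/* With VM_DEAD == 0 on 32-bit, 'vma->vm_flags & VM_DEAD' is compile-time false. */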
On 6/20/18 12:18 AM, Michal Hocko wrote: > On Tue 19-06-18 17:31:27, Nadav Amit wrote: >> at 4:08 PM, Yang Shi <yang.shi@linux.alibaba.com> wrote: >> >>> >>> On 6/19/18 3:17 PM, Nadav Amit wrote: >>>> at 4:34 PM, Yang Shi <yang.shi@linux.alibaba.com> >>>> wrote: >>>> >>>> >>>>> When running some mmap/munmap scalability tests with large memory (i.e. >>>>> >>>>>> 300GB), the below hung task issue may happen occasionally. >>>>>> >>>>> INFO: task ps:14018 blocked for more than 120 seconds. >>>>> Tainted: G E 4.9.79-009.ali3000.alios7.x86_64 #1 >>>>> "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this >>>>> message. >>>>> ps D 0 14018 1 0x00000004 >>>>> >>>>> >>>> (snip) >>>> >>>> >>>>> Zapping pages is the most time consuming part, according to the >>>>> suggestion from Michal Hock [1], zapping pages can be done with holding >>>>> read mmap_sem, like what MADV_DONTNEED does. Then re-acquire write >>>>> mmap_sem to manipulate vmas. >>>>> >>>> Does munmap() == MADV_DONTNEED + munmap() ? >>> Not exactly the same. So, I basically copied the page zapping used by munmap instead of calling MADV_DONTNEED. >>> >>>> For example, what happens with userfaultfd in this case? Can you get an >>>> extra #PF, which would be visible to userspace, before the munmap is >>>> finished? >>>> >>> userfaultfd is handled by regular munmap path. So, no change to userfaultfd part. >> Right. I see it now. >> >>>> In addition, would it be ok for the user to potentially get a zeroed page in >>>> the time window after the MADV_DONTNEED finished removing a PTE and before >>>> the munmap() is done? >>>> >>> This should be undefined behavior according to Michal. This has been discussed in https://lwn.net/Articles/753269/. >> Thanks for the reference. >> >> Reading the man page I see: "All pages containing a part of the indicated >> range are unmapped, and subsequent references to these pages will generate >> SIGSEGV.” > Yes, this is true but I guess what Yang Shi meant was that an userspace > access racing with munmap is not well defined. You never know whether > you get your data, #PTF or SEGV because it depends on timing. The user > visible change might be that you lose content and get zero page instead > if you hit the race window while we are unmapping which was not possible > before. But whouldn't such an access pattern be buggy anyway? You need > some form of external synchronization AFAICS. > > But maybe some userspace depends on "getting right data or get SEGV" > semantic. If we have to preserve that then we can come up with a VM_DEAD > flag set before we tear it down and force the SEGV on the #PF path. > Something similar we already do for MMF_UNSTABLE. By looking this deeper, we may not be able to cover all the unmapping range for VM_DEAD, for example, if the start addr is in the middle of a vma. We can't set VM_DEAD to that vma since that would trigger SIGSEGV for still mapped area. splitting can't be done with read mmap_sem held, so maybe just set VM_DEAD to non-overlapped vmas. Access to overlapped vmas (first and last) will still have undefined behavior. Thanks, Yang
On Mon, Jun 25, 2018 at 05:06:23PM -0700, Yang Shi wrote: > By looking this deeper, we may not be able to cover all the unmapping range > for VM_DEAD, for example, if the start addr is in the middle of a vma. We > can't set VM_DEAD to that vma since that would trigger SIGSEGV for still > mapped area. > > splitting can't be done with read mmap_sem held, so maybe just set VM_DEAD > to non-overlapped vmas. Access to overlapped vmas (first and last) will > still have undefined behavior. Acquire mmap_sem for writing, split, mark VM_DEAD, drop mmap_sem. Acquire mmap_sem for reading, madv_free drop mmap_sem. Acquire mmap_sem for writing, free everything left, drop mmap_sem. ? Sure, you acquire the lock 3 times, but both write instances should be 'short', and I suppose you can do a demote between 1 and 2 if you care.
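Peter's three-acquisition scheme, written out as pseudo-code with the optional demote done via downgrade_write(). Only the mmap_sem primitives are real API; the function name and the other helpers are placeholders:

static void munmap_three_phase(struct mm_struct *mm, unsigned long start,
			       unsigned long end)
{
	/* 1: write lock - split the boundary VMAs and mark the range dead. */
	down_write(&mm->mmap_sem);
	split_vmas(mm, start, end);		/* placeholder */
	mark_vmas_dead(mm, start, end);		/* sets the (hypothetical) VM_DEAD */
	downgrade_write(&mm->mmap_sem);		/* demote write -> read without dropping */

	/* 2: read lock - zap the page tables, MADV_DONTNEED style. */
	zap_vma_ranges(mm, start, end);		/* placeholder */
	up_read(&mm->mmap_sem);

	/* 3: write lock - detach and free whatever is left. */
	down_write(&mm->mmap_sem);
	free_dead_vmas(mm, start, end);		/* placeholder */
	up_write(&mm->mmap_sem);
}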
On 6/26/18 12:43 AM, Peter Zijlstra wrote:
> On Mon, Jun 25, 2018 at 05:06:23PM -0700, Yang Shi wrote:
>> By looking this deeper, we may not be able to cover all the unmapping range
>> for VM_DEAD, for example, if the start addr is in the middle of a vma. We
>> can't set VM_DEAD to that vma since that would trigger SIGSEGV for still
>> mapped area.
>>
>> splitting can't be done with read mmap_sem held, so maybe just set VM_DEAD
>> to non-overlapped vmas. Access to overlapped vmas (first and last) will
>> still have undefined behavior.
> Acquire mmap_sem for writing, split, mark VM_DEAD, drop mmap_sem. Acquire
> mmap_sem for reading, madv_free drop mmap_sem. Acquire mmap_sem for
> writing, free everything left, drop mmap_sem.
>
> ?
>
> Sure, you acquire the lock 3 times, but both write instances should be
> 'short', and I suppose you can do a demote between 1 and 2 if you care.

Thanks, Peter. Yes, after looking at the code and trying two different
approaches, this approach looks like the most straightforward one.
Splitting the vmas up-front can save a lot of pain later. Holding the write
mmap_sem for this job before zapping the mappings sounds worth the cost
(a very short write critical section).

And VM_DEAD can be set exclusively under the write mmap_sem, without racing
with page faults, which gives us consistent behavior for the race between a
#PF and munmap. We also don't need to care about overlapped vmas since they
have been split beforehand.

Yang
On Tue 26-06-18 18:03:34, Yang Shi wrote: > > > On 6/26/18 12:43 AM, Peter Zijlstra wrote: > > On Mon, Jun 25, 2018 at 05:06:23PM -0700, Yang Shi wrote: > > > By looking this deeper, we may not be able to cover all the unmapping range > > > for VM_DEAD, for example, if the start addr is in the middle of a vma. We > > > can't set VM_DEAD to that vma since that would trigger SIGSEGV for still > > > mapped area. > > > > > > splitting can't be done with read mmap_sem held, so maybe just set VM_DEAD > > > to non-overlapped vmas. Access to overlapped vmas (first and last) will > > > still have undefined behavior. > > Acquire mmap_sem for writing, split, mark VM_DEAD, drop mmap_sem. Acquire > > mmap_sem for reading, madv_free drop mmap_sem. Acquire mmap_sem for > > writing, free everything left, drop mmap_sem. > > > > ? > > > > Sure, you acquire the lock 3 times, but both write instances should be > > 'short', and I suppose you can do a demote between 1 and 2 if you care. > > Thanks, Peter. Yes, by looking the code and trying two different approaches, > it looks this approach is the most straight-forward one. Yes, you just have to be careful about the max vma count limit.
On 6/27/18 12:24 AM, Michal Hocko wrote:
> On Tue 26-06-18 18:03:34, Yang Shi wrote:
>> On 6/26/18 12:43 AM, Peter Zijlstra wrote:
>>> On Mon, Jun 25, 2018 at 05:06:23PM -0700, Yang Shi wrote:
>>>> (snip)
>>> Acquire mmap_sem for writing, split, mark VM_DEAD, drop mmap_sem. Acquire
>>> mmap_sem for reading, madv_free drop mmap_sem. Acquire mmap_sem for
>>> writing, free everything left, drop mmap_sem.
>>>
>>> ?
>>>
>>> Sure, you acquire the lock 3 times, but both write instances should be
>>> 'short', and I suppose you can do a demote between 1 and 2 if you care.
>> Thanks, Peter. Yes, by looking the code and trying two different approaches,
>> it looks this approach is the most straight-forward one.
> Yes, you just have to be careful about the max vma count limit.

Yes, we should just need to copy what do_munmap() does, as below:

if (end < vma->vm_end && mm->map_count >= sysctl_max_map_count)
        return -ENOMEM;

If the max map count limit has been reached, it will return failure before
zapping any mappings.

Thanks,
Yang
On Wed 27-06-18 10:23:39, Yang Shi wrote: > > > On 6/27/18 12:24 AM, Michal Hocko wrote: > > On Tue 26-06-18 18:03:34, Yang Shi wrote: > > > > > > On 6/26/18 12:43 AM, Peter Zijlstra wrote: > > > > On Mon, Jun 25, 2018 at 05:06:23PM -0700, Yang Shi wrote: > > > > > By looking this deeper, we may not be able to cover all the unmapping range > > > > > for VM_DEAD, for example, if the start addr is in the middle of a vma. We > > > > > can't set VM_DEAD to that vma since that would trigger SIGSEGV for still > > > > > mapped area. > > > > > > > > > > splitting can't be done with read mmap_sem held, so maybe just set VM_DEAD > > > > > to non-overlapped vmas. Access to overlapped vmas (first and last) will > > > > > still have undefined behavior. > > > > Acquire mmap_sem for writing, split, mark VM_DEAD, drop mmap_sem. Acquire > > > > mmap_sem for reading, madv_free drop mmap_sem. Acquire mmap_sem for > > > > writing, free everything left, drop mmap_sem. > > > > > > > > ? > > > > > > > > Sure, you acquire the lock 3 times, but both write instances should be > > > > 'short', and I suppose you can do a demote between 1 and 2 if you care. > > > Thanks, Peter. Yes, by looking the code and trying two different approaches, > > > it looks this approach is the most straight-forward one. > > Yes, you just have to be careful about the max vma count limit. > > Yes, we should just need copy what do_munmap does as below: > > if (end < vma->vm_end && mm->map_count >= sysctl_max_map_count) > return -ENOMEM; > > If the mas map count limit has been reached, it will return failure before > zapping mappings. Yeah, but as soon as you drop the lock and retake it, somebody might have changed the adddress space and we might get inconsistency. So I am wondering whether we really need upgrade_read (to promote read to write lock) and do the down_write split & set up VM_DEAD downgrade_write unmap upgrade_read zap ptes up_write looks terrible, no question about that, but we won't drop the mmap sem at any time.
On 6/28/18 4:51 AM, Michal Hocko wrote: > On Wed 27-06-18 10:23:39, Yang Shi wrote: >> >> On 6/27/18 12:24 AM, Michal Hocko wrote: >>> On Tue 26-06-18 18:03:34, Yang Shi wrote: >>>> On 6/26/18 12:43 AM, Peter Zijlstra wrote: >>>>> On Mon, Jun 25, 2018 at 05:06:23PM -0700, Yang Shi wrote: >>>>>> By looking this deeper, we may not be able to cover all the unmapping range >>>>>> for VM_DEAD, for example, if the start addr is in the middle of a vma. We >>>>>> can't set VM_DEAD to that vma since that would trigger SIGSEGV for still >>>>>> mapped area. >>>>>> >>>>>> splitting can't be done with read mmap_sem held, so maybe just set VM_DEAD >>>>>> to non-overlapped vmas. Access to overlapped vmas (first and last) will >>>>>> still have undefined behavior. >>>>> Acquire mmap_sem for writing, split, mark VM_DEAD, drop mmap_sem. Acquire >>>>> mmap_sem for reading, madv_free drop mmap_sem. Acquire mmap_sem for >>>>> writing, free everything left, drop mmap_sem. >>>>> >>>>> ? >>>>> >>>>> Sure, you acquire the lock 3 times, but both write instances should be >>>>> 'short', and I suppose you can do a demote between 1 and 2 if you care. >>>> Thanks, Peter. Yes, by looking the code and trying two different approaches, >>>> it looks this approach is the most straight-forward one. >>> Yes, you just have to be careful about the max vma count limit. >> Yes, we should just need copy what do_munmap does as below: >> >> if (end < vma->vm_end && mm->map_count >= sysctl_max_map_count) >> return -ENOMEM; >> >> If the mas map count limit has been reached, it will return failure before >> zapping mappings. > Yeah, but as soon as you drop the lock and retake it, somebody might > have changed the adddress space and we might get inconsistency. > > So I am wondering whether we really need upgrade_read (to promote read > to write lock) and do the > down_write > split & set up VM_DEAD > downgrade_write > unmap > upgrade_read > zap ptes > up_write I'm supposed address space changing just can be done by mmap, mremap, mprotect. If so, we may utilize the new VM_DEAD flag. If the VM_DEAD flag is set for the vma, just return failure since it is being unmapped. Does it sounds reasonable? Thanks, Yang > > looks terrible, no question about that, but we won't drop the mmap sem > at any time.
On 6/28/18 12:10 PM, Yang Shi wrote: > > > On 6/28/18 4:51 AM, Michal Hocko wrote: >> On Wed 27-06-18 10:23:39, Yang Shi wrote: >>> >>> On 6/27/18 12:24 AM, Michal Hocko wrote: >>>> On Tue 26-06-18 18:03:34, Yang Shi wrote: >>>>> On 6/26/18 12:43 AM, Peter Zijlstra wrote: >>>>>> On Mon, Jun 25, 2018 at 05:06:23PM -0700, Yang Shi wrote: >>>>>>> By looking this deeper, we may not be able to cover all the >>>>>>> unmapping range >>>>>>> for VM_DEAD, for example, if the start addr is in the middle of >>>>>>> a vma. We >>>>>>> can't set VM_DEAD to that vma since that would trigger SIGSEGV >>>>>>> for still >>>>>>> mapped area. >>>>>>> >>>>>>> splitting can't be done with read mmap_sem held, so maybe just >>>>>>> set VM_DEAD >>>>>>> to non-overlapped vmas. Access to overlapped vmas (first and >>>>>>> last) will >>>>>>> still have undefined behavior. >>>>>> Acquire mmap_sem for writing, split, mark VM_DEAD, drop mmap_sem. >>>>>> Acquire >>>>>> mmap_sem for reading, madv_free drop mmap_sem. Acquire mmap_sem for >>>>>> writing, free everything left, drop mmap_sem. >>>>>> >>>>>> ? >>>>>> >>>>>> Sure, you acquire the lock 3 times, but both write instances >>>>>> should be >>>>>> 'short', and I suppose you can do a demote between 1 and 2 if you >>>>>> care. >>>>> Thanks, Peter. Yes, by looking the code and trying two different >>>>> approaches, >>>>> it looks this approach is the most straight-forward one. >>>> Yes, you just have to be careful about the max vma count limit. >>> Yes, we should just need copy what do_munmap does as below: >>> >>> if (end < vma->vm_end && mm->map_count >= sysctl_max_map_count) >>> return -ENOMEM; >>> >>> If the mas map count limit has been reached, it will return failure >>> before >>> zapping mappings. >> Yeah, but as soon as you drop the lock and retake it, somebody might >> have changed the adddress space and we might get inconsistency. >> >> So I am wondering whether we really need upgrade_read (to promote read >> to write lock) and do the >> down_write >> split & set up VM_DEAD >> downgrade_write >> unmap >> upgrade_read >> zap ptes >> up_write Promoting to write lock may be a trouble. There might be other users in the critical section with read lock, we have to wait them to finish. > > I'm supposed address space changing just can be done by mmap, mremap, > mprotect. If so, we may utilize the new VM_DEAD flag. If the VM_DEAD > flag is set for the vma, just return failure since it is being unmapped. > > Does it sounds reasonable? It looks we just need care about MAP_FIXED (mmap) and MREMAP_FIXED (mremap), right? How about letting them return -EBUSY or -EAGAIN to notify the application? This changes the behavior a little bit, MAP_FIXED and mremap may fail if they fail the race with munmap (if the mapping is larger than 1GB). I'm not sure if any multi-threaded application uses MAP_FIXED and MREMAP_FIXED very heavily which may run into the race condition. I guess it should be rare to meet all the conditions to trigger the race. The programmer should be very cautious about MAP_FIXED.MREMAP_FIXED since they may corrupt its own address space as the man page noted. Thanks, Yang > > Thanks, > Yang > >> >> looks terrible, no question about that, but we won't drop the mmap sem >> at any time. >
On Thu 28-06-18 12:10:10, Yang Shi wrote: > > > On 6/28/18 4:51 AM, Michal Hocko wrote: > > On Wed 27-06-18 10:23:39, Yang Shi wrote: > > > > > > On 6/27/18 12:24 AM, Michal Hocko wrote: > > > > On Tue 26-06-18 18:03:34, Yang Shi wrote: > > > > > On 6/26/18 12:43 AM, Peter Zijlstra wrote: > > > > > > On Mon, Jun 25, 2018 at 05:06:23PM -0700, Yang Shi wrote: > > > > > > > By looking this deeper, we may not be able to cover all the unmapping range > > > > > > > for VM_DEAD, for example, if the start addr is in the middle of a vma. We > > > > > > > can't set VM_DEAD to that vma since that would trigger SIGSEGV for still > > > > > > > mapped area. > > > > > > > > > > > > > > splitting can't be done with read mmap_sem held, so maybe just set VM_DEAD > > > > > > > to non-overlapped vmas. Access to overlapped vmas (first and last) will > > > > > > > still have undefined behavior. > > > > > > Acquire mmap_sem for writing, split, mark VM_DEAD, drop mmap_sem. Acquire > > > > > > mmap_sem for reading, madv_free drop mmap_sem. Acquire mmap_sem for > > > > > > writing, free everything left, drop mmap_sem. > > > > > > > > > > > > ? > > > > > > > > > > > > Sure, you acquire the lock 3 times, but both write instances should be > > > > > > 'short', and I suppose you can do a demote between 1 and 2 if you care. > > > > > Thanks, Peter. Yes, by looking the code and trying two different approaches, > > > > > it looks this approach is the most straight-forward one. > > > > Yes, you just have to be careful about the max vma count limit. > > > Yes, we should just need copy what do_munmap does as below: > > > > > > if (end < vma->vm_end && mm->map_count >= sysctl_max_map_count) > > > return -ENOMEM; > > > > > > If the mas map count limit has been reached, it will return failure before > > > zapping mappings. > > Yeah, but as soon as you drop the lock and retake it, somebody might > > have changed the adddress space and we might get inconsistency. > > > > So I am wondering whether we really need upgrade_read (to promote read > > to write lock) and do the > > down_write > > split & set up VM_DEAD > > downgrade_write > > unmap > > upgrade_read > > zap ptes > > up_write > > I'm supposed address space changing just can be done by mmap, mremap, > mprotect. If so, we may utilize the new VM_DEAD flag. If the VM_DEAD flag is > set for the vma, just return failure since it is being unmapped. I am sorry I do not follow. How does VM_DEAD flag helps for a completely unrelated vmas? Or maybe it would be better to post the code to see what you mean exactly.
On Thu 28-06-18 17:59:25, Yang Shi wrote: > > > On 6/28/18 12:10 PM, Yang Shi wrote: > > > > > > On 6/28/18 4:51 AM, Michal Hocko wrote: > > > On Wed 27-06-18 10:23:39, Yang Shi wrote: > > > > > > > > On 6/27/18 12:24 AM, Michal Hocko wrote: > > > > > On Tue 26-06-18 18:03:34, Yang Shi wrote: > > > > > > On 6/26/18 12:43 AM, Peter Zijlstra wrote: > > > > > > > On Mon, Jun 25, 2018 at 05:06:23PM -0700, Yang Shi wrote: > > > > > > > > By looking this deeper, we may not be able to > > > > > > > > cover all the unmapping range > > > > > > > > for VM_DEAD, for example, if the start addr is > > > > > > > > in the middle of a vma. We > > > > > > > > can't set VM_DEAD to that vma since that would > > > > > > > > trigger SIGSEGV for still > > > > > > > > mapped area. > > > > > > > > > > > > > > > > splitting can't be done with read mmap_sem held, > > > > > > > > so maybe just set VM_DEAD > > > > > > > > to non-overlapped vmas. Access to overlapped > > > > > > > > vmas (first and last) will > > > > > > > > still have undefined behavior. > > > > > > > Acquire mmap_sem for writing, split, mark VM_DEAD, > > > > > > > drop mmap_sem. Acquire > > > > > > > mmap_sem for reading, madv_free drop mmap_sem. Acquire mmap_sem for > > > > > > > writing, free everything left, drop mmap_sem. > > > > > > > > > > > > > > ? > > > > > > > > > > > > > > Sure, you acquire the lock 3 times, but both write > > > > > > > instances should be > > > > > > > 'short', and I suppose you can do a demote between 1 > > > > > > > and 2 if you care. > > > > > > Thanks, Peter. Yes, by looking the code and trying two > > > > > > different approaches, > > > > > > it looks this approach is the most straight-forward one. > > > > > Yes, you just have to be careful about the max vma count limit. > > > > Yes, we should just need copy what do_munmap does as below: > > > > > > > > if (end < vma->vm_end && mm->map_count >= sysctl_max_map_count) > > > > return -ENOMEM; > > > > > > > > If the mas map count limit has been reached, it will return > > > > failure before > > > > zapping mappings. > > > Yeah, but as soon as you drop the lock and retake it, somebody might > > > have changed the adddress space and we might get inconsistency. > > > > > > So I am wondering whether we really need upgrade_read (to promote read > > > to write lock) and do the > > > down_write > > > split & set up VM_DEAD > > > downgrade_write > > > unmap > > > upgrade_read > > > zap ptes > > > up_write > > Promoting to write lock may be a trouble. There might be other users in the > critical section with read lock, we have to wait them to finish. Yes. Is that a problem though? > > I'm supposed address space changing just can be done by mmap, mremap, > > mprotect. If so, we may utilize the new VM_DEAD flag. If the VM_DEAD > > flag is set for the vma, just return failure since it is being unmapped. > > > > Does it sounds reasonable? > > It looks we just need care about MAP_FIXED (mmap) and MREMAP_FIXED (mremap), > right? > > How about letting them return -EBUSY or -EAGAIN to notify the application? Well, non of those is documented to return EBUSY and EAGAIN already has a meaning for locked memory. > This changes the behavior a little bit, MAP_FIXED and mremap may fail if > they fail the race with munmap (if the mapping is larger than 1GB). I'm not > sure if any multi-threaded application uses MAP_FIXED and MREMAP_FIXED very > heavily which may run into the race condition. I guess it should be rare to > meet all the conditions to trigger the race. 
>
> The programmer should be very cautious about MAP_FIXED.MREMAP_FIXED since
> they may corrupt its own address space as the man page noted.

Well, I suspect you are overcomplicating this a bit. This should be a really
straightforward thing - well, except for VM_DEAD, which is quite tricky
already. We should rather not spread this trickiness outside of the #PF
path. And I would even try hard to start that part simple to see whether it
actually matters. Relying on races between threads without any locking is
quite questionable already. Nobody has pointed to a sane usecase so far.
On 6/29/18 4:34 AM, Michal Hocko wrote: > On Thu 28-06-18 12:10:10, Yang Shi wrote: >> >> On 6/28/18 4:51 AM, Michal Hocko wrote: >>> On Wed 27-06-18 10:23:39, Yang Shi wrote: >>>> On 6/27/18 12:24 AM, Michal Hocko wrote: >>>>> On Tue 26-06-18 18:03:34, Yang Shi wrote: >>>>>> On 6/26/18 12:43 AM, Peter Zijlstra wrote: >>>>>>> On Mon, Jun 25, 2018 at 05:06:23PM -0700, Yang Shi wrote: >>>>>>>> By looking this deeper, we may not be able to cover all the unmapping range >>>>>>>> for VM_DEAD, for example, if the start addr is in the middle of a vma. We >>>>>>>> can't set VM_DEAD to that vma since that would trigger SIGSEGV for still >>>>>>>> mapped area. >>>>>>>> >>>>>>>> splitting can't be done with read mmap_sem held, so maybe just set VM_DEAD >>>>>>>> to non-overlapped vmas. Access to overlapped vmas (first and last) will >>>>>>>> still have undefined behavior. >>>>>>> Acquire mmap_sem for writing, split, mark VM_DEAD, drop mmap_sem. Acquire >>>>>>> mmap_sem for reading, madv_free drop mmap_sem. Acquire mmap_sem for >>>>>>> writing, free everything left, drop mmap_sem. >>>>>>> >>>>>>> ? >>>>>>> >>>>>>> Sure, you acquire the lock 3 times, but both write instances should be >>>>>>> 'short', and I suppose you can do a demote between 1 and 2 if you care. >>>>>> Thanks, Peter. Yes, by looking the code and trying two different approaches, >>>>>> it looks this approach is the most straight-forward one. >>>>> Yes, you just have to be careful about the max vma count limit. >>>> Yes, we should just need copy what do_munmap does as below: >>>> >>>> if (end < vma->vm_end && mm->map_count >= sysctl_max_map_count) >>>> return -ENOMEM; >>>> >>>> If the mas map count limit has been reached, it will return failure before >>>> zapping mappings. >>> Yeah, but as soon as you drop the lock and retake it, somebody might >>> have changed the adddress space and we might get inconsistency. >>> >>> So I am wondering whether we really need upgrade_read (to promote read >>> to write lock) and do the >>> down_write >>> split & set up VM_DEAD >>> downgrade_write >>> unmap >>> upgrade_read >>> zap ptes >>> up_write >> I'm supposed address space changing just can be done by mmap, mremap, >> mprotect. If so, we may utilize the new VM_DEAD flag. If the VM_DEAD flag is >> set for the vma, just return failure since it is being unmapped. > I am sorry I do not follow. How does VM_DEAD flag helps for a completely > unrelated vmas? Or maybe it would be better to post the code to see what > you mean exactly. I mean we just care about the vmas which have been found/split by munmap, right? We already set VM_DEAD to them. Even though those other vmas are changed by somebody else, it would not cause any inconsistency to this munmap call.
On 6/29/18 4:39 AM, Michal Hocko wrote: > On Thu 28-06-18 17:59:25, Yang Shi wrote: >> >> On 6/28/18 12:10 PM, Yang Shi wrote: >>> >>> On 6/28/18 4:51 AM, Michal Hocko wrote: >>>> On Wed 27-06-18 10:23:39, Yang Shi wrote: >>>>> On 6/27/18 12:24 AM, Michal Hocko wrote: >>>>>> On Tue 26-06-18 18:03:34, Yang Shi wrote: >>>>>>> On 6/26/18 12:43 AM, Peter Zijlstra wrote: >>>>>>>> On Mon, Jun 25, 2018 at 05:06:23PM -0700, Yang Shi wrote: >>>>>>>>> By looking this deeper, we may not be able to >>>>>>>>> cover all the unmapping range >>>>>>>>> for VM_DEAD, for example, if the start addr is >>>>>>>>> in the middle of a vma. We >>>>>>>>> can't set VM_DEAD to that vma since that would >>>>>>>>> trigger SIGSEGV for still >>>>>>>>> mapped area. >>>>>>>>> >>>>>>>>> splitting can't be done with read mmap_sem held, >>>>>>>>> so maybe just set VM_DEAD >>>>>>>>> to non-overlapped vmas. Access to overlapped >>>>>>>>> vmas (first and last) will >>>>>>>>> still have undefined behavior. >>>>>>>> Acquire mmap_sem for writing, split, mark VM_DEAD, >>>>>>>> drop mmap_sem. Acquire >>>>>>>> mmap_sem for reading, madv_free drop mmap_sem. Acquire mmap_sem for >>>>>>>> writing, free everything left, drop mmap_sem. >>>>>>>> >>>>>>>> ? >>>>>>>> >>>>>>>> Sure, you acquire the lock 3 times, but both write >>>>>>>> instances should be >>>>>>>> 'short', and I suppose you can do a demote between 1 >>>>>>>> and 2 if you care. >>>>>>> Thanks, Peter. Yes, by looking the code and trying two >>>>>>> different approaches, >>>>>>> it looks this approach is the most straight-forward one. >>>>>> Yes, you just have to be careful about the max vma count limit. >>>>> Yes, we should just need copy what do_munmap does as below: >>>>> >>>>> if (end < vma->vm_end && mm->map_count >= sysctl_max_map_count) >>>>> return -ENOMEM; >>>>> >>>>> If the mas map count limit has been reached, it will return >>>>> failure before >>>>> zapping mappings. >>>> Yeah, but as soon as you drop the lock and retake it, somebody might >>>> have changed the adddress space and we might get inconsistency. >>>> >>>> So I am wondering whether we really need upgrade_read (to promote read >>>> to write lock) and do the >>>> down_write >>>> split & set up VM_DEAD >>>> downgrade_write >>>> unmap >>>> upgrade_read >>>> zap ptes >>>> up_write >> Promoting to write lock may be a trouble. There might be other users in the >> critical section with read lock, we have to wait them to finish. > Yes. Is that a problem though? Not a problem, but just not sure how complicated it would be. Considering all the lock debug/lockdep stuff. And, the behavior smells like rcu. > >>> I'm supposed address space changing just can be done by mmap, mremap, >>> mprotect. If so, we may utilize the new VM_DEAD flag. If the VM_DEAD >>> flag is set for the vma, just return failure since it is being unmapped. >>> >>> Does it sounds reasonable? >> It looks we just need care about MAP_FIXED (mmap) and MREMAP_FIXED (mremap), >> right? >> >> How about letting them return -EBUSY or -EAGAIN to notify the application? > Well, non of those is documented to return EBUSY and EAGAIN already has > a meaning for locked memory. > >> This changes the behavior a little bit, MAP_FIXED and mremap may fail if >> they fail the race with munmap (if the mapping is larger than 1GB). I'm not >> sure if any multi-threaded application uses MAP_FIXED and MREMAP_FIXED very >> heavily which may run into the race condition. I guess it should be rare to >> meet all the conditions to trigger the race. 
>> >> The programmer should be very cautious about MAP_FIXED.MREMAP_FIXED since >> they may corrupt its own address space as the man page noted. > Well, I suspect you are overcomplicating this a bit. This should be > really straightforward thing - well except for VM_DEAD which is quite > tricky already. We should rather not spread this trickyness outside of > the #PF path. And I would even try hard to start that part simple to see > whether it actually matters. Relying on races between threads without > any locking is quite questionable already. Nobody has pointed to a sane > usecase so far. I agree to keep it as simple as possible then see if it matters or not. So, in v3 I will just touch the page fault path.
diff --git a/mm/mmap.c b/mm/mmap.c index fc41c05..e84f80c 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -2686,6 +2686,141 @@ int split_vma(struct mm_struct *mm, struct vm_area_struct *vma, return __split_vma(mm, vma, addr, new_below); } +/* Consider PUD size or 1GB mapping as large mapping */ +#ifdef HPAGE_PUD_SIZE +#define LARGE_MAP_THRESH HPAGE_PUD_SIZE +#else +#define LARGE_MAP_THRESH (1 * 1024 * 1024 * 1024) +#endif + +/* Unmap large mapping early with acquiring read mmap_sem */ +static int do_munmap_zap_early(struct mm_struct *mm, unsigned long start, + size_t len, struct list_head *uf) +{ + unsigned long end = 0; + struct vm_area_struct *vma = NULL, *prev, *last, *tmp; + bool success = false; + int ret = 0; + + if ((offset_in_page(start)) || start > TASK_SIZE || len > TASK_SIZE - start) + return -EINVAL; + + len = (PAGE_ALIGN(len)); + if (len == 0) + return -EINVAL; + + /* Just deal with uf in regular path */ + if (unlikely(uf)) + goto regular_path; + + if (len >= LARGE_MAP_THRESH) { + down_read(&mm->mmap_sem); + vma = find_vma(mm, start); + if (!vma) { + up_read(&mm->mmap_sem); + return 0; + } + + prev = vma->vm_prev; + + end = start + len; + if (vma->vm_start > end) { + up_read(&mm->mmap_sem); + return 0; + } + + if (start > vma->vm_start) { + int error; + + if (end < vma->vm_end && + mm->map_count > sysctl_max_map_count) { + up_read(&mm->mmap_sem); + return -ENOMEM; + } + + error = __split_vma(mm, vma, start, 0); + if (error) { + up_read(&mm->mmap_sem); + return error; + } + prev = vma; + } + + last = find_vma(mm, end); + if (last && end > last->vm_start) { + int error = __split_vma(mm, last, end, 1); + + if (error) { + up_read(&mm->mmap_sem); + return error; + } + } + vma = prev ? prev->vm_next : mm->mmap; + + /* + * Unmapping vmas, which has VM_LOCKED|VM_HUGETLB|VM_PFNMAP + * flag set or has uprobes set, need acquire write map_sem, + * so skip them in early zap. Just deal with such mapping in + * regular path. + * Borrow can_madv_dontneed_vma() to check the conditions. + */ + tmp = vma; + while (tmp && tmp->vm_start < end) { + if (!can_madv_dontneed_vma(tmp) || + vma_has_uprobes(tmp, start, end)) + goto sem_drop; + tmp = tmp->vm_next; + } + + unmap_region(mm, vma, prev, start, end); + /* indicates early zap is success */ + success = true; + +sem_drop: + up_read(&mm->mmap_sem); + } + +regular_path: + /* hold write mmap_sem for vma manipulation or regular path */ + if (down_write_killable(&mm->mmap_sem)) + return -EINTR; + if (success) { + /* vmas have been zapped, here just deal with loose end */ + detach_vmas_to_be_unmapped(mm, vma, prev, end); + arch_unmap(mm, vma, start, end); + remove_vma_list(mm, vma); + } else { + /* vma is VM_LOCKED|VM_HUGETLB|VM_PFNMAP or has uprobe */ + if (vma) { + if (unlikely(uf)) { + int ret = userfaultfd_unmap_prep(vma, start, + end, uf); + if (ret) + goto out; + } + if (mm->locked_vm) { + tmp = vma; + while (tmp && tmp->vm_start < end) { + if (tmp->vm_flags & VM_LOCKED) { + mm->locked_vm -= vma_pages(tmp); + munlock_vma_pages_all(tmp); + } + tmp = tmp->vm_next; + } + } + detach_vmas_to_be_unmapped(mm, vma, prev, end); + unmap_region(mm, vma, prev, start, end); + remove_vma_list(mm, vma); + } else + /* When mapping size < LARGE_MAP_THRESH */ + ret = do_munmap(mm, start, len, uf); + } + +out: + up_write(&mm->mmap_sem); + return ret; +} + /* Munmap is split into 2 main parts -- this part which finds * what needs doing, and the areas themselves, which do the * work. This now handles partial unmappings. 
@@ -2792,6 +2927,17 @@ int do_munmap(struct mm_struct *mm, unsigned long start, size_t len, return 0; } +static int vm_munmap_zap_early(unsigned long start, size_t len) +{ + int ret; + struct mm_struct *mm = current->mm; + LIST_HEAD(uf); + + ret = do_munmap_zap_early(mm, start, len, &uf); + userfaultfd_unmap_complete(mm, &uf); + return ret; +} + int vm_munmap(unsigned long start, size_t len) { int ret; @@ -2811,7 +2957,7 @@ int vm_munmap(unsigned long start, size_t len) SYSCALL_DEFINE2(munmap, unsigned long, addr, size_t, len) { profile_munmap(addr); - return vm_munmap(addr, len); + return vm_munmap_zap_early(addr, len); }
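For context, the early-zap path above is only taken for munmap() calls of at least LARGE_MAP_THRESH (PUD size / 1GB). A trivial userspace exerciser might look like the following; it is purely illustrative and not part of the patch:

#include <stdio.h>
#include <sys/mman.h>

int main(void)
{
	size_t len = 2UL << 30;	/* 2GB, large enough to take the early-zap path */
	char *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
		       MAP_PRIVATE | MAP_ANONYMOUS | MAP_NORESERVE, -1, 0);

	if (p == MAP_FAILED) {
		perror("mmap");
		return 1;
	}

	/* Touch one byte every 2MB so there are page tables to tear down. */
	for (size_t off = 0; off < len; off += 2UL << 20)
		p[off] = 1;

	if (munmap(p, len)) {	/* exercises the modified munmap() path */
		perror("munmap");
		return 1;
	}
	return 0;
}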