[5/5] mm: Don't hold heap lock in alloc_heap_pages() longer than necessary

Message ID 1504026557-11365-6-git-send-email-boris.ostrovsky@oracle.com (mailing list archive)
State New, archived

Commit Message

Boris Ostrovsky Aug. 29, 2017, 5:09 p.m. UTC
Once pages are removed from the heap we don't need to hold the heap
lock. It is especially useful to drop it for an unscrubbed buddy since
we will be scrubbing it.

Signed-off-by: Boris Ostrovsky <boris.ostrovsky@oracle.com>
Reviewed-by: Jan Beulich <jbeulich@suse.com>
---
Changes in v2:
* Moved spin_unlock() after d->last_alloc_node assignment

 xen/common/page_alloc.c | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)
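
In outline, the patch narrows the locked region to the free-list manipulation and the shared counters, letting the potentially slow scrub run unlocked. A condensed restatement of the resulting code path (full diff below; the dirty-page test is paraphrased, and error checks and TLB-flush bookkeeping are elided):

    spin_lock(&heap_lock);
    /* ... find a suitable buddy and take it off the free lists ... */
    if ( d != NULL )
        d->last_alloc_node = node;
    spin_unlock(&heap_lock);   /* the pages are no longer reachable via the heap */

    for ( i = 0; i < (1 << order); i++ )
    {
        if ( pg[i].count_info & PGC_need_scrub )
        {
            if ( !(memflags & MEMF_no_scrub) )
                scrub_one_page(&pg[i]);   /* the slow part, now done unlocked */
            dirty_cnt++;
        }
        pg[i].count_info = PGC_state_inuse;
    }

    spin_lock(&heap_lock);     /* retaken briefly to fix up a shared counter */
    node_need_scrub[node] -= dirty_cnt;
    spin_unlock(&heap_lock);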

Comments

Wei Liu Aug. 30, 2017, 8:43 a.m. UTC | #1
On Tue, Aug 29, 2017 at 01:09:17PM -0400, Boris Ostrovsky wrote:
> Once pages are removed from the heap we don't need to hold the heap
> lock. It is especially useful to drop it for an unscrubbed buddy since
> we will be scrubbing it.
> 
> Signed-off-by: Boris Ostrovsky <boris.ostrovsky@oracle.com>
> Reviewed-by: Jan Beulich <jbeulich@suse.com>

Reviewed-by: Wei Liu <wei.liu2@citrix.com>
Andrew Cooper Aug. 30, 2017, 9:27 a.m. UTC | #2
On 29/08/17 18:09, Boris Ostrovsky wrote:
> Once pages are removed from the heap we don't need to hold the heap
> lock. It is especially useful to drop it for an unscrubbed buddy since
> we will be scrubbing it.
>
> Signed-off-by: Boris Ostrovsky <boris.ostrovsky@oracle.com>
> Reviewed-by: Jan Beulich <jbeulich@suse.com>
> ---
> Changes in v2:
> * Moved spin_unlock() after d->last_alloc_node assignment
>
>  xen/common/page_alloc.c | 7 ++++++-
>  1 file changed, 6 insertions(+), 1 deletion(-)
>
> diff --git a/xen/common/page_alloc.c b/xen/common/page_alloc.c
> index 6c08983..8df92aa 100644
> --- a/xen/common/page_alloc.c
> +++ b/xen/common/page_alloc.c
> @@ -860,6 +860,7 @@ static struct page_info *alloc_heap_pages(
>      struct page_info *pg;
>      bool need_tlbflush = false;
>      uint32_t tlbflush_timestamp = 0;
> +    unsigned int dirty_cnt = 0;
>  
>      /* Make sure there are enough bits in memflags for nodeID. */
>      BUILD_BUG_ON((_MEMF_bits - _MEMF_node) < (8 * sizeof(nodeid_t)));
> @@ -948,6 +949,8 @@ static struct page_info *alloc_heap_pages(
>      if ( d != NULL )
>          d->last_alloc_node = node;
>  
> +    spin_unlock(&heap_lock);
> +
>      for ( i = 0; i < (1 << order); i++ )
>      {
>          /* Reference count must continuously be zero for free pages. */
> @@ -957,7 +960,7 @@ static struct page_info *alloc_heap_pages(
>          {
>              if ( !(memflags & MEMF_no_scrub) )
>                  scrub_one_page(&pg[i]);
> -            node_need_scrub[node]--;
> +            dirty_cnt++;
>          }
>  
>          pg[i].count_info = PGC_state_inuse;
> @@ -979,6 +982,8 @@ static struct page_info *alloc_heap_pages(
>              check_one_page(&pg[i]);
>      }
>  
> +    spin_lock(&heap_lock);
> +    node_need_scrub[node] -= dirty_cnt;
>      spin_unlock(&heap_lock);
>  
>      if ( need_tlbflush )

This patch has been applied to staging, but it's got problems.  The
following crash is rather trivial to provoke:

~Andrew

(d19) Test result: SUCCESS
(XEN) ----[ Xen-4.10-unstable  x86_64  debug=y   Tainted:    H ]----
(XEN) CPU:    5
(XEN) RIP:    e008:[<ffff82d0802252fc>] page_alloc.c#free_heap_pages+0x786/0x7a1
(XEN) RFLAGS: 0000000000010286   CONTEXT: hypervisor (d0v1)
(XEN) rax: 0000000000001b50   rbx: ffff82e00dd164a0   rcx: ffff82ffffffffe0
(XEN) rdx: ffff82ffffffffe0   rsi: ffff82d08056f600   rdi: 00000000ffffffff
(XEN) rbp: ffff83083751fda8   rsp: ffff83083751fd48   r8:  00000000000001b5
(XEN) r9:  0000000000000017   r10: 0000000000000017   r11: 0000000000000246
(XEN) r12: 0000000000000000   r13: 000000000000019e   r14: 0000000000000000
(XEN) r15: 0000000000000000   cr0: 0000000080050033   cr4: 00000000001526e0
(XEN) cr3: 0000000772d3f000   cr2: ffff82ffffffffe4
(XEN) ds: 0000   es: 0000   fs: 0000   gs: 0000   ss: e010   cs: e008
(XEN) Xen code around <ffff82d0802252fc> (page_alloc.c#free_heap_pages+0x786/0x7a1):
(XEN)  24 89 01 e9 91 fd ff ff <89> 7a 04 8b 03 89 01 e9 4d ff ff ff 48 83 c4 38
(XEN) Xen stack trace from rsp=ffff83083751fd48:
(XEN)    0000000000000001 0000001700000001 0000000000000000 0000000000000017
(XEN)    ffff82e00dd16480 0000000000000000 ffff82e00dd115e0 0000000000000000
(XEN)    ffff82e00dd16480 0000000000000000 ffff8306ec55f000 ffffea0002049040
(XEN)    ffff83083751fdf8 ffff82d080226785 ffff82d08023afa5 0000000000000203
(XEN)    ffff8306ec55f000 ffff8306ec55f330 ffff8306ec55f4d8 ffff8306ec55faa8
(XEN)    ffff8306ec55f000 ffffea0002049040 ffff83083751fe18 ffff82d0802f1e04
(XEN)    ffff8306ec55f000 ffff8306ec55f000 ffff83083751fe48 ffff82d0802e0f95
(XEN)    ffff8306ec55f000 00000000ffffffff ffff8306ec55faa8 ffff8306ec55f000
(XEN)    ffff83083751fe68 ffff82d080271bca ffff8306ec55faa8 ffff8300abdd5000
(XEN)    ffff83083751fe98 ffff82d080207d74 ffff830837516040 0000000000000000
(XEN)    0000000000000000 ffff83083751ffff ffff83083751fec8 ffff82d080229a3c
(XEN)    ffff82d080572d80 ffff82d080573000 ffff82d080572d80 ffffffffffffffff
(XEN)    ffff83083751fef8 ffff82d08023a68a ffff8300abfa7000 ffff88010490ede8
(XEN)    00000007b7595025 ffff8800848ba9a0 ffff83083751ff08 ffff82d08023a6df
(XEN)    00007cf7c8ae00c7 ffff82d08035f3a1 ffffea0002049040 ffff8800848ba9a0
(XEN)    00000007b7595025 ffff88010490ede8 ffff88008491bd78 0000000000000de8
(XEN)    0000000000000246 deadbeefdeadf00d 0000000000000000 0000000000000000
(XEN)    0000000000000000 ffffffff8100102a deadbeefdeadf00d deadbeefdeadf00d
(XEN)    deadbeefdeadf00d 0000010000000000 ffffffff8100102a 000000000000e033
(XEN)    0000000000000246 ffff88008491bd28 000000000000e02b c2c2c2c2c2c2beef
(XEN) Xen call trace:
(XEN)    [<ffff82d0802252fc>] page_alloc.c#free_heap_pages+0x786/0x7a1
(XEN)    [<ffff82d080226785>] free_domheap_pages+0x312/0x37c
(XEN)    [<ffff82d0802f1e04>] stdvga_deinit+0x30/0x46
(XEN)    [<ffff82d0802e0f95>] hvm_domain_destroy+0x60/0x116
(XEN)    [<ffff82d080271bca>] arch_domain_destroy+0x1a/0x8f
(XEN)    [<ffff82d080207d74>] domain.c#complete_domain_destroy+0x6f/0x182
(XEN)    [<ffff82d080229a3c>] rcupdate.c#rcu_process_callbacks+0x141/0x1a2
(XEN)    [<ffff82d08023a68a>] softirq.c#__do_softirq+0x7f/0x8a
(XEN)    [<ffff82d08023a6df>] do_softirq+0x13/0x15
(XEN)    [<ffff82d08035f3a1>] x86_64/entry.S#process_softirqs+0x21/0x30
(XEN)
(XEN) Pagetable walk from ffff82ffffffffe4:
(XEN)  L4[0x105] = 00000000abe5b063 ffffffffffffffff
(XEN)  L3[0x1ff] = 0000000000000000 ffffffffffffffff
(XEN)
(XEN) ****************************************
(XEN) Panic on CPU 5:
(XEN) FATAL PAGE FAULT
(XEN) [error_code=0002]
(XEN) Faulting linear address: ffff82ffffffffe4
(XEN) ****************************************
(XEN)
(XEN) Reboot in five seconds...
(XEN) Executing kexec image on cpu5
(XEN) Shot down all CPUs
I'm in purgatory
Jan Beulich Aug. 30, 2017, 10:22 a.m. UTC | #3
>>> On 30.08.17 at 11:27, <andrew.cooper3@citrix.com> wrote:
> On 29/08/17 18:09, Boris Ostrovsky wrote:
>> Once pages are removed from the heap we don't need to hold the heap
>> lock. It is especially useful to drop it for an unscrubbed buddy since
>> we will be scrubbing it.
>>
>> Signed-off-by: Boris Ostrovsky <boris.ostrovsky@oracle.com>
>> Reviewed-by: Jan Beulich <jbeulich@suse.com>
>> ---
>> Changes in v2:
>> * Moved spin_unlock() after d->last_alloc_node assignment
>>
>>  xen/common/page_alloc.c | 7 ++++++-
>>  1 file changed, 6 insertions(+), 1 deletion(-)
>>
>> diff --git a/xen/common/page_alloc.c b/xen/common/page_alloc.c
>> index 6c08983..8df92aa 100644
>> --- a/xen/common/page_alloc.c
>> +++ b/xen/common/page_alloc.c
>> @@ -860,6 +860,7 @@ static struct page_info *alloc_heap_pages(
>>      struct page_info *pg;
>>      bool need_tlbflush = false;
>>      uint32_t tlbflush_timestamp = 0;
>> +    unsigned int dirty_cnt = 0;
>>  
>>      /* Make sure there are enough bits in memflags for nodeID. */
>>      BUILD_BUG_ON((_MEMF_bits - _MEMF_node) < (8 * sizeof(nodeid_t)));
>> @@ -948,6 +949,8 @@ static struct page_info *alloc_heap_pages(
>>      if ( d != NULL )
>>          d->last_alloc_node = node;
>>  
>> +    spin_unlock(&heap_lock);
>> +
>>      for ( i = 0; i < (1 << order); i++ )
>>      {
>>          /* Reference count must continuously be zero for free pages. */
>> @@ -957,7 +960,7 @@ static struct page_info *alloc_heap_pages(
>>          {
>>              if ( !(memflags & MEMF_no_scrub) )
>>                  scrub_one_page(&pg[i]);
>> -            node_need_scrub[node]--;
>> +            dirty_cnt++;
>>          }
>>  
>>          pg[i].count_info = PGC_state_inuse;
>> @@ -979,6 +982,8 @@ static struct page_info *alloc_heap_pages(
>>              check_one_page(&pg[i]);
>>      }
>>  
>> +    spin_lock(&heap_lock);
>> +    node_need_scrub[node] -= dirty_cnt;
>>      spin_unlock(&heap_lock);
>>  
>>      if ( need_tlbflush )
> 
> This patch has been applied to staging, but it's got problems.  The
> following crash is rather trivial to provoke:
> 
> ~Andrew
> 
> (d19) Test result: SUCCESS
> (XEN) ----[ Xen-4.10-unstable  x86_64  debug=y   Tainted:    H ]----
> (XEN) CPU:    5
> (XEN) RIP:    e008:[<ffff82d0802252fc>] page_alloc.c#free_heap_pages+0x786/0x7a1
>...
> (XEN) Pagetable walk from ffff82ffffffffe4:
> (XEN)  L4[0x105] = 00000000abe5b063 ffffffffffffffff
> (XEN)  L3[0x1ff] = 0000000000000000 ffffffffffffffff

Some negative offset into somewhere, it seems. Upon second
look I think the patch is simply wrong in its current shape:
free_heap_pages() looks for page_state_is(..., free) when
trying to merge chunks, while alloc_heap_pages() now sets
PGC_state_inuse outside of the locked area. I'll revert it right
away.

Jan
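
To picture the race Jan describes, consider one CPU allocating while another frees a neighbouring buddy (a simplified sketch; the actual merge probes live in free_heap_pages()):

    /*
     *   CPU A: alloc_heap_pages()            CPU B: free_heap_pages()
     *   -------------------------            ------------------------
     *   spin_lock(&heap_lock);
     *   take buddy off the free lists;
     *   pg[i].count_info still reads
     *   PGC_state_free
     *   spin_unlock(&heap_lock);
     *                                        spin_lock(&heap_lock);
     *                                        merge loop: page_state_is(.., free)
     *                                        is still true for A's pages, so B
     *                                        absorbs them into a larger buddy
     *                                        spin_unlock(&heap_lock);
     *   pg[i].count_info = PGC_state_inuse;  too late: the pages are back on a
     *                                        free list while A hands them out
     */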
Boris Ostrovsky Aug. 30, 2017, 12:59 p.m. UTC | #4
>> This patch has been applied to staging, but it's got problems.  The
>> following crash is rather trivial to provoke:
>>
>> ~Andrew
>>
>> (d19) Test result: SUCCESS
>> (XEN) ----[ Xen-4.10-unstable  x86_64  debug=y   Tainted:    H ]----
>> (XEN) CPU:    5
>> (XEN) RIP:    e008:[<ffff82d0802252fc>] page_alloc.c#free_heap_pages+0x786/0x7a1
>> ...
>> (XEN) Pagetable walk from ffff82ffffffffe4:
>> (XEN)  L4[0x105] = 00000000abe5b063 ffffffffffffffff
>> (XEN)  L3[0x1ff] = 0000000000000000 ffffffffffffffff
> Some negative offset into somewhere, it seems. Upon second
> look I think the patch is simply wrong in its current shape:
> free_heap_pages() looks for page_state_is(..., free) when
> trying to merge chunks, while alloc_heap_pages() now sets
> PGC_state_inuse outside of the locked area. I'll revert it right
> away.

Yes, so we do need to update page state under heap lock. I'll then move
scrubbing (and checking) only to outside the lock.

I am curious though, what was the test to trigger this? I ran about 100
parallel reboots under memory pressure and never hit this.


-boris
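
One way to do what Boris describes — keep the state transition under the lock and push only the scrub out — is to preserve the dirty marker across the state change. An illustrative sketch only, not necessarily the follow-up patch as actually posted:

    spin_lock(&heap_lock);
    /* ... buddy taken off the free lists ... */
    for ( i = 0; i < (1 << order); i++ )
    {
        if ( pg[i].count_info & PGC_need_scrub )
            dirty_cnt++;
        /* The state flips while the lock is held, so free_heap_pages()
         * can never observe a stale PGC_state_free; the scrub flag is
         * carried over so the unlocked loop below knows what to clean. */
        pg[i].count_info = PGC_state_inuse |
                           (pg[i].count_info & PGC_need_scrub);
    }
    node_need_scrub[node] -= dirty_cnt;
    spin_unlock(&heap_lock);

    /* Scrubbing, the slow part, now runs without the lock. */
    for ( i = 0; i < (1 << order); i++ )
        if ( test_and_clear_bit(_PGC_need_scrub, &pg[i].count_info) &&
             !(memflags & MEMF_no_scrub) )
            scrub_one_page(&pg[i]);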
Wei Liu Aug. 30, 2017, 1:01 p.m. UTC | #5
On Wed, Aug 30, 2017 at 08:59:34AM -0400, Boris Ostrovsky wrote:
> 
> >> This patch has been applied to staging, but it's got problems.  The
> >> following crash is rather trivial to provoke:
> >>
> >> ~Andrew
> >>
> >> (d19) Test result: SUCCESS
> >> (XEN) ----[ Xen-4.10-unstable  x86_64  debug=y   Tainted:    H ]----
> >> (XEN) CPU:    5
> >> (XEN) RIP:    e008:[<ffff82d0802252fc>] page_alloc.c#free_heap_pages+0x786/0x7a1
> >> ...
> >> (XEN) Pagetable walk from ffff82ffffffffe4:
> >> (XEN)  L4[0x105] = 00000000abe5b063 ffffffffffffffff
> >> (XEN)  L3[0x1ff] = 0000000000000000 ffffffffffffffff
> > Some negative offset into somewhere, it seems. Upon second
> > look I think the patch is simply wrong in its current shape:
> > free_heap_pages() looks for page_state_is(..., free) when
> > trying to merge chunks, while alloc_heap_pages() now sets
> > PGC_state_inuse outside of the locked area. I'll revert it right
> > away.
> 
> Yes, so we do need to update page state under heap lock. I'll then move
> scrubbing (and checking) only to outside the lock.
> 
> I am curious though, what was the test to trigger this? I ran about 100
> parallel reboots under memory pressure and never hit this.
> 

Appears to be one of the tests in xtf.git
Andrew Cooper Aug. 30, 2017, 1:06 p.m. UTC | #6
On 30/08/17 13:59, Boris Ostrovsky wrote:
>>> This patch has been applied to staging, but it's got problems.  The
>>> following crash is rather trivial to provoke:
>>>
>>> ~Andrew
>>>
>>> (d19) Test result: SUCCESS
>>> (XEN) ----[ Xen-4.10-unstable  x86_64  debug=y   Tainted:    H ]----
>>> (XEN) CPU:    5
>>> (XEN) RIP:    e008:[<ffff82d0802252fc>] page_alloc.c#free_heap_pages+0x786/0x7a1
>>> ...
>>> (XEN) Pagetable walk from ffff82ffffffffe4:
>>> (XEN)  L4[0x105] = 00000000abe5b063 ffffffffffffffff
>>> (XEN)  L3[0x1ff] = 0000000000000000 ffffffffffffffff
>> Some negative offset into somewhere, it seems. Upon second
>> look I think the patch is simply wrong in its current shape:
>> free_heap_pages() looks for page_state_is(..., free) when
>> trying to merge chunks, while alloc_heap_pages() now sets
>> PGC_state_inuse outside of the locked area. I'll revert it right
>> away.
> Yes, so we do need to update page state under heap lock. I'll then move
> scrubbing (and checking) only to outside the lock.
>
> I am curious though, what was the test to trigger this? I ran about 100
> parallel reboots under memory pressure and never hit this.

# git clone git://xenbits.xen.org/xtf.git
# cd xtf
# make -j4 -s
# ./xtf-runner -qa

Purposefully, ./xtf-runner doesn't synchronously wait for VMs to be
fully destroyed before starting the next test.  (There is an ~800ms
added delay to synchronously destroy HVM guests, over PV, which I expect
is down to an interaction with qemu.  I got sufficiently annoyed that I
coded around the issue.)

As a result, destruction of one domain will be happening while
construction of the next one is happening.

~Andrew
Jan Beulich Aug. 30, 2017, 1:30 p.m. UTC | #7
>>> On 30.08.17 at 14:59, <boris.ostrovsky@oracle.com> wrote:

>>> This patch has been applied to staging, but it's got problems.  The
>>> following crash is rather trivial to provoke:
>>>
>>> ~Andrew
>>>
>>> (d19) Test result: SUCCESS
>>> (XEN) ----[ Xen-4.10-unstable  x86_64  debug=y   Tainted:    H ]----
>>> (XEN) CPU:    5
>>> (XEN) RIP:    e008:[<ffff82d0802252fc>] page_alloc.c#free_heap_pages+0x786/0x7a1
>>> ...
>>> (XEN) Pagetable walk from ffff82ffffffffe4:
>>> (XEN)  L4[0x105] = 00000000abe5b063 ffffffffffffffff
>>> (XEN)  L3[0x1ff] = 0000000000000000 ffffffffffffffff
>> Some negative offset into somewhere, it seems. Upon second
>> look I think the patch is simply wrong in its current shape:
>> free_heap_pages() looks for page_state_is(..., free) when
>> trying to merge chunks, while alloc_heap_pages() now sets
>> PGC_state_inuse outside of the locked area. I'll revert it right
>> away.
> 
> Yes, so we do need to update page state under heap lock. I'll then move
> scrubbing (and checking) only to outside the lock.

Actually I think you only need to set the first 4k page's state
with the lock still held.

Jan
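
The observation here being that free_heap_pages()'s merge probes only ever examine the head page of a candidate buddy, so on this reading it suffices to make that one page unmergeable before the lock is dropped. Roughly (illustrative only):

    spin_lock(&heap_lock);
    /* ... buddy taken off the free lists ... */
    pg[0].count_info = PGC_state_inuse;   /* the page a merging free would probe */
    spin_unlock(&heap_lock);
    /* scrubbing and the remaining pages' state updates can happen unlocked */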
Boris Ostrovsky Aug. 30, 2017, 1:59 p.m. UTC | #8
On 08/30/2017 09:06 AM, Andrew Cooper wrote:
> On 30/08/17 13:59, Boris Ostrovsky wrote:
>>>> This patch has been applied to staging, but it's got problems.  The
>>>> following crash is rather trivial to provoke:
>>>>
>>>> ~Andrew
>>>>
>>>> (d19) Test result: SUCCESS
>>>> (XEN) ----[ Xen-4.10-unstable  x86_64  debug=y   Tainted:    H ]----
>>>> (XEN) CPU:    5
>>>> (XEN) RIP:    e008:[<ffff82d0802252fc>] page_alloc.c#free_heap_pages+0x786/0x7a1
>>>> ...
>>>> (XEN) Pagetable walk from ffff82ffffffffe4:
>>>> (XEN)  L4[0x105] = 00000000abe5b063 ffffffffffffffff
>>>> (XEN)  L3[0x1ff] = 0000000000000000 ffffffffffffffff
>>> Some negative offset into somewhere, it seems. Upon second
>>> look I think the patch is simply wrong in its current shape:
>>> free_heap_pages() looks for page_state_is(..., free) when
>>> trying to merge chunks, while alloc_heap_pages() now sets
>>> PGC_state_inuse outside of the locked area. I'll revert it right
>>> away.
>> Yes, so we do need to update page state under heap lock. I'll then move
>> scrubbing (and checking) only to outside the lock.
>>
>> I am curious though, what was the test to trigger this? I ran about 100
>> parallel reboots under memory pressure and never hit this.
> # git clone git://xenbits.xen.org/xtf.git
> # cd xtf
> # make -j4 -s
> # ./xtf-runner -qa
>
> Purposefully, ./xtf-runner doesn't synchronously wait for VMs to be
> fully destroyed before starting the next test.  (There is an ~800ms
> added delay to synchronously destroy HVM guests, over PV, which I expect
> is down to an interaction with qemu.  I got sufficiently annoyed that I
> coded around the issue.)
>
> As a result, destruction of one domain will be happening while
> construction of the next one is happening.

I was also doing overlapped destruction/construction but at random (so
overlaps didn't happen all the time).

xtf-runner indeed tripped this panic fairly quickly.

-boris

Patch

diff --git a/xen/common/page_alloc.c b/xen/common/page_alloc.c
index 6c08983..8df92aa 100644
--- a/xen/common/page_alloc.c
+++ b/xen/common/page_alloc.c
@@ -860,6 +860,7 @@  static struct page_info *alloc_heap_pages(
     struct page_info *pg;
     bool need_tlbflush = false;
     uint32_t tlbflush_timestamp = 0;
+    unsigned int dirty_cnt = 0;
 
     /* Make sure there are enough bits in memflags for nodeID. */
     BUILD_BUG_ON((_MEMF_bits - _MEMF_node) < (8 * sizeof(nodeid_t)));
@@ -948,6 +949,8 @@  static struct page_info *alloc_heap_pages(
     if ( d != NULL )
         d->last_alloc_node = node;
 
+    spin_unlock(&heap_lock);
+
     for ( i = 0; i < (1 << order); i++ )
     {
         /* Reference count must continuously be zero for free pages. */
@@ -957,7 +960,7 @@  static struct page_info *alloc_heap_pages(
         {
             if ( !(memflags & MEMF_no_scrub) )
                 scrub_one_page(&pg[i]);
-            node_need_scrub[node]--;
+            dirty_cnt++;
         }
 
         pg[i].count_info = PGC_state_inuse;
@@ -979,6 +982,8 @@  static struct page_info *alloc_heap_pages(
             check_one_page(&pg[i]);
     }
 
+    spin_lock(&heap_lock);
+    node_need_scrub[node] -= dirty_cnt;
     spin_unlock(&heap_lock);
 
     if ( need_tlbflush )