[v5,1/8] mm: Place unscrubbed pages at the end of pagelist

Message ID 1498157830-21845-2-git-send-email-boris.ostrovsky@oracle.com (mailing list archive)
State New, archived

Commit Message

Boris Ostrovsky June 22, 2017, 6:57 p.m. UTC
... so that it's easy to find pages that need to be scrubbed (those pages are
now marked with the _PGC_need_scrub bit).

We keep track of the first unscrubbed page in a buddy using the first_dirty
field. For now it can take only two values: 0 (the whole buddy needs scrubbing)
or INVALID_DIRTY_IDX (the buddy does not need to be scrubbed). Subsequent
patches will allow scrubbing to be interrupted, so first_dirty may then take
any value.
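
In short, a free chunk that may contain dirty pages goes to the tail of its
free list while a clean one goes to the head (simplified from
page_list_add_scrub() in the patch below):

    if ( pg->u.free.first_dirty != INVALID_DIRTY_IDX )
        page_list_add_tail(pg, &heap(node, zone, order));  /* needs scrubbing */
    else
        page_list_add(pg, &heap(node, zone, order));       /* already clean */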

Signed-off-by: Boris Ostrovsky <boris.ostrovsky@oracle.com>
---
Changes in v5:
* Be more careful when returning the unused portion of a buddy to the heap in
  alloc_heap_pages(), and don't set first_dirty if we know that the sub-buddy
  is clean
* In reserve_offlined_page(), don't try to find dirty pages in sub-buddies if we
  can figure out that there are none.
* Drop unnecessary setting of first_dirty in free_heap_pages()
* Switch to using bitfields in page_info.u.free

I kept node_need_scrub[] as a global array and not a "per-node". I think splitting
it should be part of making heap_lock a per-node lock, together with increasing
scrub concurrency by having more than one CPU scrub a node.



 xen/common/page_alloc.c  | 190 +++++++++++++++++++++++++++++++++++++++--------
 xen/include/asm-arm/mm.h |  18 ++++-
 xen/include/asm-x86/mm.h |  17 ++++-
 3 files changed, 190 insertions(+), 35 deletions(-)

Comments

Jan Beulich June 27, 2017, 5:06 p.m. UTC | #1
>>> Boris Ostrovsky <boris.ostrovsky@oracle.com> 06/22/17 8:55 PM >>>
> I kept node_need_scrub[] as a global array and not a "per-node". I think splitting
> it should be part of making heap_lock a per-node lock, together with increasing
> scrub concurrency by having more than one CPU scrub a node.

Agreed - I hadn't meant my earlier comment to necessarily affect this series.

> @@ -798,11 +814,26 @@ static struct page_info *alloc_heap_pages(
>      return NULL;
>  
>   found: 
> +
> +    if ( pg->u.free.first_dirty != INVALID_DIRTY_IDX )
> +        first_dirty_pg = pg + pg->u.free.first_dirty;
> +
>      /* We may have to halve the chunk a number of times. */
>      while ( j != order )
>      {
> -        PFN_ORDER(pg) = --j;
> -        page_list_add_tail(pg, &heap(node, zone, j));
> +        unsigned int first_dirty;
> +
> +        if ( first_dirty_pg && ((pg + (1 << j)) > first_dirty_pg) )

Despite the various examples of doing it this way, please at least use 1u.

> +        {
> +            if ( pg < first_dirty_pg )
> +                first_dirty = (first_dirty_pg - pg) / sizeof(*pg);

Pointer subtraction already includes the involved division. Otoh I wonder
if you couldn't get away without pointer comparison/subtraction here
altogether.
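
For reference, subtracting two pointers of the same type already yields a
count of elements, not bytes, so the extra division over-divides
(illustration only, not Xen code):

    struct page_info arr[8];
    struct page_info *pg = &arr[0], *first_dirty_pg = &arr[3];
    unsigned int first_dirty = first_dirty_pg - pg;  /* 3, not 3 / sizeof(*pg) */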

> @@ -849,13 +880,22 @@ static int reserve_offlined_page(struct page_info *head)
>  {
>      unsigned int node = phys_to_nid(page_to_maddr(head));
>      int zone = page_to_zone(head), i, head_order = PFN_ORDER(head), count = 0;
> -    struct page_info *cur_head;
> +    struct page_info *cur_head, *first_dirty_pg = NULL;
>      int cur_order;
>  
>      ASSERT(spin_is_locked(&heap_lock));
>  
>      cur_head = head;
>  
> +    /*
> +     * We may break the buddy so let's mark the head as clean. Then, when
> +     * merging chunks back into the heap, we will see whether the chunk has
> +     * unscrubbed pages and set its first_dirty properly.
> +     */
> +    if (head->u.free.first_dirty != INVALID_DIRTY_IDX)

Coding style.

> @@ -892,8 +934,25 @@ static int reserve_offlined_page(struct page_info *head)
>              {
>              merge:
>                  /* We don't consider merging outside the head_order. */
> -                page_list_add_tail(cur_head, &heap(node, zone, cur_order));
> -                PFN_ORDER(cur_head) = cur_order;
> +
> +                /* See if any of the pages indeed need scrubbing. */
> +                if ( first_dirty_pg && (cur_head + (1 << cur_order) > first_dirty_pg) )
> +                {
> +                    if ( cur_head < first_dirty_pg )
> +                        i = (first_dirty_pg - cur_head) / sizeof(*cur_head);
> +                    else
> +                        i = 0;
> +
> +                    for ( ; i < (1 << cur_order); i++ )
> +                        if ( test_bit(_PGC_need_scrub,
> +                                      &cur_head[i].count_info) )
> +                        {
> +                            first_dirty = i;
> +                            break;
> +                        }

Perhaps worth having ASSERT(first_dirty != INVALID_DIRTY_IDX) here? Or are
there cases where ->u.free.first_dirty of a page may be wrong?

> @@ -977,35 +1090,53 @@ static void free_heap_pages(
>  
>          if ( (page_to_mfn(pg) & mask) )
>          {
> +            struct page_info *predecessor = pg - mask;
> +
>              /* Merge with predecessor block? */
> -            if ( !mfn_valid(_mfn(page_to_mfn(pg-mask))) ||
> -                 !page_state_is(pg-mask, free) ||
> -                 (PFN_ORDER(pg-mask) != order) ||
> -                 (phys_to_nid(page_to_maddr(pg-mask)) != node) )
> +            if ( !mfn_valid(_mfn(page_to_mfn(predecessor))) ||
> +                 !page_state_is(predecessor, free) ||
> +                 (PFN_ORDER(predecessor) != order) ||
> +                 (phys_to_nid(page_to_maddr(predecessor)) != node) )
>                  break;
> -            pg -= mask;
> -            page_list_del(pg, &heap(node, zone, order));
> +
> +            page_list_del(predecessor, &heap(node, zone, order));
> +
> +            if ( predecessor->u.free.first_dirty != INVALID_DIRTY_IDX )
> +                need_scrub = true;

I'm afraid I continue to be confused by this: Why does need_scrub depend on
the state of pages not being the subject of the current free operation? I
realize that at this point in the series the path can't be taken yet, but
won't later patches need to rip it out or change it anyway, in which case it
would be better to introduce the then correct check (if any) only there?

> --- a/xen/include/asm-x86/mm.h
> +++ b/xen/include/asm-x86/mm.h
> @@ -88,7 +88,15 @@ struct page_info
>          /* Page is on a free list: ((count_info & PGC_count_mask) == 0). */
>          struct {
>              /* Do TLBs need flushing for safety before next page use? */
> -            bool_t need_tlbflush;
> +            unsigned long need_tlbflush:1;
> +
> +            /*
> +             * Index of the first *possibly* unscrubbed page in the buddy.
> +             * One more than maximum possible order (MAX_ORDER+1) to

Why +1 here and hence ...

> +             * accommodate INVALID_DIRTY_IDX.
> +             */
> +#define INVALID_DIRTY_IDX (-1UL & (((1UL<<MAX_ORDER) + 2) - 1))
> +            unsigned long first_dirty:MAX_ORDER + 2;

... why +2 instead of +1? And isn't the expression INVALID_DIRTY_IDX wrongly
parenthesized (apart from lacking blanks around the shift operator)? I'd
expect you want a value with MAX_ORDER+1 set bits, i.e.
(1UL << (MAX_ORDER + 1)) - 1. ANDing with -1UL seems quite pointless too.

Jan
Boris Ostrovsky July 23, 2017, 2 a.m. UTC | #2
On 06/27/2017 01:06 PM, Jan Beulich wrote:
>>>> Boris Ostrovsky <boris.ostrovsky@oracle.com> 06/22/17 8:55 PM >>>
>> I kept node_need_scrub[] as a global array and not a "per-node". I think splitting
>> it should be part of making heap_lock a per-node lock, together with increasing
>> scrub concurrency by having more than one CPU scrub a node.
> 
> Agreed - I hadn't meant my earlier comment to necessarily affect this series.
> 
>> @@ -798,11 +814,26 @@ static struct page_info *alloc_heap_pages(
>>       return NULL;
>>   
>>    found:
>> +
>> +    if ( pg->u.free.first_dirty != INVALID_DIRTY_IDX )
>> +        first_dirty_pg = pg + pg->u.free.first_dirty;
>> +
>>       /* We may have to halve the chunk a number of times. */
>>       while ( j != order )
>>       {
>> -        PFN_ORDER(pg) = --j;
>> -        page_list_add_tail(pg, &heap(node, zone, j));
>> +        unsigned int first_dirty;
>> +
>> +        if ( first_dirty_pg && ((pg + (1 << j)) > first_dirty_pg) )
> 
> Despite the various examples of doing it this way, please at least use 1u.
> 
>> +        {
>> +            if ( pg < first_dirty_pg )
>> +                first_dirty = (first_dirty_pg - pg) / sizeof(*pg);
> 
> Pointer subtraction already includes the involved division. 


Yes, this was a mistake.

> Otoh I wonder
> if you couldn't get away without pointer comparison/subtraction here
> altogether.


Without comparison I can only assume that first_dirty is zero (i.e. the 
whole buddy is potentially dirty). Is there something else I could do?


> 
>> @@ -849,13 +880,22 @@ static int reserve_offlined_page(struct page_info *head)
>>   {
>>       unsigned int node = phys_to_nid(page_to_maddr(head));
>>       int zone = page_to_zone(head), i, head_order = PFN_ORDER(head), count = 0;
>> -    struct page_info *cur_head;
>> +    struct page_info *cur_head, *first_dirty_pg = NULL;
>>       int cur_order;
>>   
>>       ASSERT(spin_is_locked(&heap_lock));
>>   
>>       cur_head = head;
>>   
>> +    /*
>> +     * We may break the buddy so let's mark the head as clean. Then, when
>> +     * merging chunks back into the heap, we will see whether the chunk has
>> +     * unscrubbed pages and set its first_dirty properly.
>> +     */
>> +    if (head->u.free.first_dirty != INVALID_DIRTY_IDX)
> 
> Coding style.
> 
>> @@ -892,8 +934,25 @@ static int reserve_offlined_page(struct page_info *head)
>>               {
>>               merge:
>>                   /* We don't consider merging outside the head_order. */
>> -                page_list_add_tail(cur_head, &heap(node, zone, cur_order));
>> -                PFN_ORDER(cur_head) = cur_order;
>> +
>> +                /* See if any of the pages indeed need scrubbing. */
>> +                if ( first_dirty_pg && (cur_head + (1 << cur_order) > first_dirty_pg) )
>> +                {
>> +                    if ( cur_head < first_dirty_pg )
>> +                        i = (first_dirty_pg - cur_head) / sizeof(*cur_head);

I assume the same comment as above applies here.

>> +                    else
>> +                        i = 0;
>> +
>> +                    for ( ; i < (1 << cur_order); i++ )
>> +                        if ( test_bit(_PGC_need_scrub,
>> +                                      &cur_head[i].count_info) )
>> +                        {
>> +                            first_dirty = i;
>> +                            break;
>> +                        }
> 
> Perhaps worth having ASSERT(first_dirty != INVALID_DIRTY_IDX) here? Or are
> there cases where ->u.free.first_dirty of a page may be wrong?


When we merge in free_heap_pages we don't clear first_dirty of the 
successor buddy (at some point I did have this done but you questioned 
whether it was needed and I dropped it).


> 
>> @@ -977,35 +1090,53 @@ static void free_heap_pages(
>>   
>>           if ( (page_to_mfn(pg) & mask) )
>>           {
>> +            struct page_info *predecessor = pg - mask;
>> +
>>               /* Merge with predecessor block? */
>> -            if ( !mfn_valid(_mfn(page_to_mfn(pg-mask))) ||
>> -                 !page_state_is(pg-mask, free) ||
>> -                 (PFN_ORDER(pg-mask) != order) ||
>> -                 (phys_to_nid(page_to_maddr(pg-mask)) != node) )
>> +            if ( !mfn_valid(_mfn(page_to_mfn(predecessor))) ||
>> +                 !page_state_is(predecessor, free) ||
>> +                 (PFN_ORDER(predecessor) != order) ||
>> +                 (phys_to_nid(page_to_maddr(predecessor)) != node) )
>>                   break;
>> -            pg -= mask;
>> -            page_list_del(pg, &heap(node, zone, order));
>> +
>> +            page_list_del(predecessor, &heap(node, zone, order));
>> +
>> +            if ( predecessor->u.free.first_dirty != INVALID_DIRTY_IDX )
>> +                need_scrub = true;
> 
> I'm afraid I continue to be confused by this: Why does need_scrub depend on
> the state of pages not being the subject of the current free operation? I
> realize that at this point in the series the path can't be taken yet, but
> won't later patches need to rip it out or change it anyway, in which case it
> would be better to introduce the then correct check (if any) only there?


Right, at this point we indeed will never have the 'if' evaluate to true
since the heap is always clean. And when we switch to scrubbing from idle,
need_scrub is never looked at.

I suspect this check will not be needed in the intermediate patches either.

> 
>> --- a/xen/include/asm-x86/mm.h
>> +++ b/xen/include/asm-x86/mm.h
>> @@ -88,7 +88,15 @@ struct page_info
>>           /* Page is on a free list: ((count_info & PGC_count_mask) == 0). */
>>           struct {
>>               /* Do TLBs need flushing for safety before next page use? */
>> -            bool_t need_tlbflush;
>> +            unsigned long need_tlbflush:1;
>> +
>> +            /*
>> +             * Index of the first *possibly* unscrubbed page in the buddy.
>> +             * One more than maximum possible order (MAX_ORDER+1) to
> 
> Why +1 here and hence ...

Don't we have MAX_ORDER+1 orders?

> 
>> +             * accommodate INVALID_DIRTY_IDX.
>> +             */
>> +#define INVALID_DIRTY_IDX (-1UL & (((1UL<<MAX_ORDER) + 2) - 1))
>> +            unsigned long first_dirty:MAX_ORDER + 2;
> 
> ... why +2 instead of +1? And isn't the expression INVALID_DIRTY_IDX wrongly
> parenthesized (apart from lacking blanks around the shift operator)? I'd
> expect you want a value with MAX_ORDER+1 set bits, i.e.
> (1UL << (MAX_ORDER + 1)) - 1. ANDing with -1UL seems quite pointless too.

Yes to parentheses and AND. Should be (1UL << (MAX_ORDER + 2)) - 1

-boris
Jan Beulich July 31, 2017, 2:45 p.m. UTC | #3
>>> Boris Ostrovsky <boris.ostrovsky@oracle.com> 07/23/17 4:01 AM >>>
>On 06/27/2017 01:06 PM, Jan Beulich wrote:
>>>>> Boris Ostrovsky <boris.ostrovsky@oracle.com> 06/22/17 8:55 PM >>>
>>> +        {
>>> +            if ( pg < first_dirty_pg )
>>> +                first_dirty = (first_dirty_pg - pg) / sizeof(*pg);
>> 
>> Pointer subtraction already includes the involved division. 
>
>
>Yes, this was a mistake.
>
>> Otoh I wonder
>> if you couldn't get away without pointer comparison/subtraction here
>> altogether.
>
>
>Without comparison I can only assume that first_dirty is zero (i.e. the 
>whole buddy is potentially dirty). Is there something else I could do?

I was thinking of tracking indexes instead of pointers. But maybe that
would more hamper readability of the overall result than help it.
 
>>> @@ -892,8 +934,25 @@ static int reserve_offlined_page(struct page_info *head)
>>>               {
>>>               merge:
>>>                   /* We don't consider merging outside the head_order. */
>>> -                page_list_add_tail(cur_head, &heap(node, zone, cur_order));
>>> -                PFN_ORDER(cur_head) = cur_order;
>>> +
>>> +                /* See if any of the pages indeed need scrubbing. */
>>> +                if ( first_dirty_pg && (cur_head + (1 << cur_order) > first_dirty_pg) )
>>> +                {
>>> +                    if ( cur_head < first_dirty_pg )
>>> +                        i = (first_dirty_pg - cur_head) / sizeof(*cur_head);
>
>I assume the same comment as above applies here.

Of course. I usually avoid repeating the same comment, except maybe
when reviewing patches from first-time contributors.

>>> +                    else
>>> +                        i = 0;
>>> +
>>> +                    for ( ; i < (1 << cur_order); i++ )
>>> +                        if ( test_bit(_PGC_need_scrub,
>>> +                                      &cur_head[i].count_info) )
>>> +                        {
>>> +                            first_dirty = i;
>>> +                            break;
>>> +                        }
>> 
>> Perhaps worth having ASSERT(first_dirty != INVALID_DIRTY_IDX) here? Or are
>> there cases where ->u.free.first_dirty of a page may be wrong?
>
>
>When we merge in free_heap_pages we don't clear first_dirty of the 
>successor buddy (at some point I did have this done but you questioned 
>whether it was needed and I dropped it).

Hmm, this indeed answers my question, but doesn't help (me) understand
whether the suggested ASSERT() could be wrong.

>>> --- a/xen/include/asm-x86/mm.h
>>> +++ b/xen/include/asm-x86/mm.h
>>> @@ -88,7 +88,15 @@ struct page_info
>>>           /* Page is on a free list: ((count_info & PGC_count_mask) == 0). */
>>>           struct {
>>>               /* Do TLBs need flushing for safety before next page use? */
>>> -            bool_t need_tlbflush;
>>> +            unsigned long need_tlbflush:1;
>>> +
>>> +            /*
>>> +             * Index of the first *possibly* unscrubbed page in the buddy.
>>> +             * One more than maximum possible order (MAX_ORDER+1) to
>> 
>> Why +1 here and hence ...
>
>Don't we have MAX_ORDER+1 orders?

So here there might be a simple misunderstanding: I understand the
parenthesized MAX_ORDER+1 to represent "maximum possible
order", i.e. excluding the "one more than", not the least because of
the ...

>> +             * accommodate INVALID_DIRTY_IDX.
>> +             */
>> +#define INVALID_DIRTY_IDX (-1UL & (((1UL<<MAX_ORDER) + 2) - 1))
>> +            unsigned long first_dirty:MAX_ORDER + 2;

+2 here.

>> ... why +2 instead of +1? And isn't the expression INVALID_DIRTY_IDX wrongly
>> parenthesized (apart from lacking blanks around the shift operator)? I'd
>> expect you want a value with MAX_ORDER+1 set bits, i.e.
>> (1UL << (MAX_ORDER + 1)) - 1. ANDing with -1UL seems quite pointless too.
>
>Yes to parentheses and AND. Should be (1UL << (MAX_ORDER + 2)) - 1

I.e. I would still expect it to be (1UL << (MAX_ORDER + 1)) - 1
here.

Jan
Boris Ostrovsky July 31, 2017, 4:03 p.m. UTC | #4
On 07/31/2017 10:45 AM, Jan Beulich wrote:
>>>> Boris Ostrovsky <boris.ostrovsky@oracle.com> 07/23/17 4:01 AM >>>
>> On 06/27/2017 01:06 PM, Jan Beulich wrote:
>>>>>> Boris Ostrovsky <boris.ostrovsky@oracle.com> 06/22/17 8:55 PM >>>
>>>> +        {
>>>> +            if ( pg < first_dirty_pg )
>>>> +                first_dirty = (first_dirty_pg - pg) / sizeof(*pg);
>>> Pointer subtraction already includes the involved division. 
>>
>> Yes, this was a mistake.
>>
>>> Otoh I wonder
>>> if you couldn't get away without pointer comparison/subtraction here
>>> altogether.
>>
>> Without comparison I can only assume that first_dirty is zero (i.e. the 
>> whole buddy is potentially dirty). Is there something else I could do?
> I was thinking of tracking indexes instead of pointers. But maybe that
> would more hamper readability of the overall result than help it.

I'll try to see how it looks.
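
A minimal sketch of index-based tracking in the splitting loop of
alloc_heap_pages() (illustration only, not the final code; helper and field
names as in the patch, with first_dirty seeded from the found buddy's
u.free.first_dirty):

    while ( j != order )
    {
        unsigned int lower_dirty = INVALID_DIRTY_IDX;

        --j;

        /* The lower half goes back to the heap; is the first dirty page in it? */
        if ( first_dirty != INVALID_DIRTY_IDX && first_dirty < (1u << j) )
            lower_dirty = first_dirty;

        page_list_add_scrub(pg, node, zone, j, lower_dirty);
        pg += 1u << j;

        /* Make first_dirty relative to the new pg (the remaining upper half). */
        if ( first_dirty != INVALID_DIRTY_IDX )
            first_dirty = first_dirty >= (1u << j) ? first_dirty - (1u << j)
                                                   : 0;
    }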

>  
>>>> +                    else
>>>> +                        i = 0;
>>>> +
>>>> +                    for ( ; i < (1 << cur_order); i++ )
>>>> +                        if ( test_bit(_PGC_need_scrub,
>>>> +                                      &cur_head[i].count_info) )
>>>> +                        {
>>>> +                            first_dirty = i;
>>>> +                            break;
>>>> +                        }
>>> Perhaps worth having ASSERT(first_dirty != INVALID_DIRTY_IDX) here? Or are
>>> there cases where ->u.free.first_dirty of a page may be wrong?
>>
>> When we merge in free_heap_pages we don't clear first_dirty of the 
>> successor buddy (at some point I did have this done but you questioned 
>> whether it was needed and I dropped it).
> Hmm, this indeed answers my question, but doesn't help (me) understanding
> whether the suggested ASSERT() could be wrong.

Oh, I see what you were asking --- ASSERT() *after* the loop, to make
sure we indeed found the first dirty page. Yes, I will add it.
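
For illustration, with the scan loop from the patch, the agreed check would
sit right after it (sketch only):

    for ( ; i < (1U << cur_order); i++ )
        if ( test_bit(_PGC_need_scrub, &cur_head[i].count_info) )
        {
            first_dirty = i;
            break;
        }

    ASSERT(first_dirty != INVALID_DIRTY_IDX);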

>
>>>> --- a/xen/include/asm-x86/mm.h
>>>> +++ b/xen/include/asm-x86/mm.h
>>>> @@ -88,7 +88,15 @@ struct page_info
>>>>           /* Page is on a free list: ((count_info & PGC_count_mask) == 0). */
>>>>           struct {
>>>>               /* Do TLBs need flushing for safety before next page use? */
>>>> -            bool_t need_tlbflush;
>>>> +            unsigned long need_tlbflush:1;
>>>> +
>>>> +            /*
>>>> +             * Index of the first *possibly* unscrubbed page in the buddy.
>>>> +             * One more than maximum possible order (MAX_ORDER+1) to
>>> Why +1 here and hence ...
>> Don't we have MAX_ORDER+1 orders?
> So here there might be a simple misunderstanding: I understand the
> parenthesized MAX_ORDER+1 to represent "maximum possible
> order", i.e. excluding the "one more than", not the least because of
> the ...
>
>>> +             * accommodate INVALID_DIRTY_IDX.
>>> +             */
>>> +#define INVALID_DIRTY_IDX (-1UL & (((1UL<<MAX_ORDER) + 2) - 1))
>>> +            unsigned long first_dirty:MAX_ORDER + 2;
> +2 here.
>
>>> ... why +2 instead of +1? And isn't the expression INVALID_DIRTY_IDX wrongly
>>> parenthesized (apart from lacking blanks around the shift operator)? I'd
>>> expect you want a value with MAX_ORDER+1 set bits, i.e.
>>> (1UL << (MAX_ORDER + 1)) - 1. ANDing with -1UL seems quite pointless too.
>> Yes to parentheses and AND. Should be (1UL << (MAX_ORDER + 2)) - 1
> I.e. I would still expect it to be (1UL << (MAX_ORDER + 1)) - 1
> here.


Sorry, I still don't get it.

Say, MAX_ORDER is 1. Since this implies that indexes 0, 1, 2 and 3 are
all valid (because we can have up to 2^(MAX_ORDER+1) pages), don't we
need 3 bits to indicate an invalid index?

-boris
Jan Beulich Aug. 2, 2017, 9:24 a.m. UTC | #5
>>> Boris Ostrovsky <boris.ostrovsky@oracle.com> 07/31/17 6:03 PM >>>
On 07/31/2017 10:45 AM, Jan Beulich wrote:
>>>> Boris Ostrovsky <boris.ostrovsky@oracle.com> 07/23/17 4:01 AM >>>
>> On 06/27/2017 01:06 PM, Jan Beulich wrote:
>>>>>> Boris Ostrovsky <boris.ostrovsky@oracle.com> 06/22/17 8:55 PM >>>
>>>>> --- a/xen/include/asm-x86/mm.h
>>>>> +++ b/xen/include/asm-x86/mm.h
>>>>> @@ -88,7 +88,15 @@ struct page_info
>>>>>           /* Page is on a free list: ((count_info & PGC_count_mask) == 0). */
>>>>>           struct {
>>>>>               /* Do TLBs need flushing for safety before next page use? */
>>>>> -            bool_t need_tlbflush;
>>>>> +            unsigned long need_tlbflush:1;
>>>>> +
>>>>> +            /*
>>>>> +             * Index of the first *possibly* unscrubbed page in the buddy.
>>>>> +             * One more than maximum possible order (MAX_ORDER+1) to
>>>> Why +1 here and hence ...
>>> Don't we have MAX_ORDER+1 orders?
>> So here there might be a simple misunderstanding: I understand the
>> parenthesized MAX_ORDER+1 to represent "maximum possible
>> order", i.e. excluding the "one more than", not the least because of
>> the ...
>>
>>>> +             * accommodate INVALID_DIRTY_IDX.
>>>> +             */
>>>> +#define INVALID_DIRTY_IDX (-1UL & (((1UL<<MAX_ORDER) + 2) - 1))
>>>> +            unsigned long first_dirty:MAX_ORDER + 2;
>> +2 here.
>>
>>>> ... why +2 instead of +1? And isn't the expression INVALID_DIRTY_IDX wrongly
>>>> parenthesized (apart from lacking blanks around the shift operator)? I'd
>>>> expect you want a value with MAX_ORDER+1 set bits, i.e.
>>>> (1UL << (MAX_ORDER + 1)) - 1. ANDing with -1UL seems quite pointless too.
>>> Yes to parentheses and AND. Should be (1UL << (MAX_ORDER + 2)) - 1
>> I.e. I would still expect it to be (1UL << (MAX_ORDER + 1)) - 1
>> here.
>
>
>Sorry, I still don't get it.
>
>Say, MAX_ORDER is 1. Since this implies that indexes 0, 1, 2 and 3 are
>all valid (because we can have up to 2^(MAX_ORDER+1) pages), don't we
>need 3 bits to indicate an invalid index?

Why 0, 1, 2, and 3? When MAX_ORDER is 1, we only have a single bit, i.e.
valid values 0 and 1 (plus one more for the invalid indicator), i.e. need 2 bits
for representation of all used values.
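
Spelled out for MAX_ORDER == 1 (just restating the above):

    largest buddy:            1U << MAX_ORDER              = 2 pages
    valid first_dirty values: 0 .. (1U << MAX_ORDER) - 1   = 0, 1
    invalid marker:           one extra encoding
    field width needed:       MAX_ORDER + 1                = 2 bits
    INVALID_DIRTY_IDX:        (1UL << (MAX_ORDER + 1)) - 1 = 3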

Jan
Boris Ostrovsky Aug. 2, 2017, 3:31 p.m. UTC | #6
On 08/02/2017 05:24 AM, Jan Beulich wrote:
>>>> Boris Ostrovsky <boris.ostrovsky@oracle.com> 07/31/17 6:03 PM >>>
> On 07/31/2017 10:45 AM, Jan Beulich wrote:
>>>>> Boris Ostrovsky <boris.ostrovsky@oracle.com> 07/23/17 4:01 AM >>>
>>> On 06/27/2017 01:06 PM, Jan Beulich wrote:
>>>>>>> Boris Ostrovsky <boris.ostrovsky@oracle.com> 06/22/17 8:55 PM >>>
>>>>>> --- a/xen/include/asm-x86/mm.h
>>>>>> +++ b/xen/include/asm-x86/mm.h
>>>>>> @@ -88,7 +88,15 @@ struct page_info
>>>>>>           /* Page is on a free list: ((count_info & PGC_count_mask) == 0). */
>>>>>>           struct {
>>>>>>               /* Do TLBs need flushing for safety before next page use? */
>>>>>> -            bool_t need_tlbflush;
>>>>>> +            unsigned long need_tlbflush:1;
>>>>>> +
>>>>>> +            /*
>>>>>> +             * Index of the first *possibly* unscrubbed page in the buddy.
>>>>>> +             * One more than maximum possible order (MAX_ORDER+1) to
>>>>> Why +1 here and hence ...
>>>> Don't we have MAX_ORDER+1 orders?
>>> So here there might be a simple misunderstanding: I understand the
>>> parenthesized MAX_ORDER+1 to represent "maximum possible
>>> order", i.e. excluding the "one more than", not the least because of
>>> the ...
>>>
>>>>> +             * accommodate INVALID_DIRTY_IDX.
>>>>> +             */
>>>>> +#define INVALID_DIRTY_IDX (-1UL & (((1UL<<MAX_ORDER) + 2) - 1))
>>>>> +            unsigned long first_dirty:MAX_ORDER + 2;
>>> +2 here.
>>>
>>>>> ... why +2 instead of +1? And isn't the expression INVALID_DIRTY_IDX wrongly
>>>>> parenthesized (apart from lacking blanks around the shift operator)? I'd
>>>>> expect you want a value with MAX_ORDER+1 set bits, i.e.
>>>>> (1UL << (MAX_ORDER + 1)) - 1. ANDing with -1UL seems quite pointless too.
>>>> Yes to parentheses and AND. Should be (1UL << (MAX_ORDER + 2)) - 1
>>> I.e. I would still expect it to be (1UL << (MAX_ORDER + 1)) - 1
>>> here.
>>
>> Sorry, I still don't get it.
>>
>> Say, MAX_ORDER is 1. Since this implies that indexes 0, 1, 2 and 3 are
>> all valid (because we can have up to 2^(MAX_ORDER+1) pages), don't we
>> need 3 bits to indicate an invalid index?
> Why 0, 1, 2, and 3? 

Of course, it's 0 and 1 only. MAX_ORDER+1 completely threw me off.

-boris

> When MAX_ORDER is 1, we only have a single bit, i.e.
> valid values 0 and 1 (plus one more for the invalid indicator), i.e. need 2 bits
> for representation of all used values.
>
> Jan
>
Patch

diff --git a/xen/common/page_alloc.c b/xen/common/page_alloc.c
index 8bcef6a..570d1f7 100644
--- a/xen/common/page_alloc.c
+++ b/xen/common/page_alloc.c
@@ -383,6 +383,8 @@  typedef struct page_list_head heap_by_zone_and_order_t[NR_ZONES][MAX_ORDER+1];
 static heap_by_zone_and_order_t *_heap[MAX_NUMNODES];
 #define heap(node, zone, order) ((*_heap[node])[zone][order])
 
+static unsigned long node_need_scrub[MAX_NUMNODES];
+
 static unsigned long *avail[MAX_NUMNODES];
 static long total_avail_pages;
 
@@ -678,6 +680,20 @@  static void check_low_mem_virq(void)
     }
 }
 
+/* Pages that need a scrub are added to tail, otherwise to head. */
+static void page_list_add_scrub(struct page_info *pg, unsigned int node,
+                                unsigned int zone, unsigned int order,
+                                unsigned int first_dirty)
+{
+    PFN_ORDER(pg) = order;
+    pg->u.free.first_dirty = first_dirty;
+
+    if ( first_dirty != INVALID_DIRTY_IDX )
+        page_list_add_tail(pg, &heap(node, zone, order));
+    else
+        page_list_add(pg, &heap(node, zone, order));
+}
+
 /* Allocate 2^@order contiguous pages. */
 static struct page_info *alloc_heap_pages(
     unsigned int zone_lo, unsigned int zone_hi,
@@ -687,7 +703,7 @@  static struct page_info *alloc_heap_pages(
     unsigned int i, j, zone = 0, nodemask_retry = 0;
     nodeid_t first_node, node = MEMF_get_node(memflags), req_node = node;
     unsigned long request = 1UL << order;
-    struct page_info *pg;
+    struct page_info *pg, *first_dirty_pg = NULL;
     nodemask_t nodemask = (d != NULL ) ? d->node_affinity : node_online_map;
     bool_t need_tlbflush = 0;
     uint32_t tlbflush_timestamp = 0;
@@ -798,11 +814,26 @@  static struct page_info *alloc_heap_pages(
     return NULL;
 
  found: 
+
+    if ( pg->u.free.first_dirty != INVALID_DIRTY_IDX )
+        first_dirty_pg = pg + pg->u.free.first_dirty;
+
     /* We may have to halve the chunk a number of times. */
     while ( j != order )
     {
-        PFN_ORDER(pg) = --j;
-        page_list_add_tail(pg, &heap(node, zone, j));
+        unsigned int first_dirty;
+
+        if ( first_dirty_pg && ((pg + (1 << j)) > first_dirty_pg) )
+        {
+            if ( pg < first_dirty_pg )
+                first_dirty = (first_dirty_pg - pg) / sizeof(*pg);
+            else
+                first_dirty = 0;
+        }
+        else
+            first_dirty = INVALID_DIRTY_IDX;
+
+        page_list_add_scrub(pg, node, zone, --j, first_dirty);
         pg += 1 << j;
     }
 
@@ -849,13 +880,22 @@  static int reserve_offlined_page(struct page_info *head)
 {
     unsigned int node = phys_to_nid(page_to_maddr(head));
     int zone = page_to_zone(head), i, head_order = PFN_ORDER(head), count = 0;
-    struct page_info *cur_head;
+    struct page_info *cur_head, *first_dirty_pg = NULL;
     int cur_order;
 
     ASSERT(spin_is_locked(&heap_lock));
 
     cur_head = head;
 
+    /*
+     * We may break the buddy so let's mark the head as clean. Then, when
+     * merging chunks back into the heap, we will see whether the chunk has
+     * unscrubbed pages and set its first_dirty properly.
+     */
+    if (head->u.free.first_dirty != INVALID_DIRTY_IDX)
+        first_dirty_pg = head + head->u.free.first_dirty;
+    head->u.free.first_dirty = INVALID_DIRTY_IDX;
+
     page_list_del(head, &heap(node, zone, head_order));
 
     while ( cur_head < (head + (1 << head_order)) )
@@ -873,6 +913,8 @@  static int reserve_offlined_page(struct page_info *head)
 
         while ( cur_order < head_order )
         {
+            unsigned int first_dirty = INVALID_DIRTY_IDX;
+
             next_order = cur_order + 1;
 
             if ( (cur_head + (1 << next_order)) >= (head + ( 1 << head_order)) )
@@ -892,8 +934,25 @@  static int reserve_offlined_page(struct page_info *head)
             {
             merge:
                 /* We don't consider merging outside the head_order. */
-                page_list_add_tail(cur_head, &heap(node, zone, cur_order));
-                PFN_ORDER(cur_head) = cur_order;
+
+                /* See if any of the pages indeed need scrubbing. */
+                if ( first_dirty_pg && (cur_head + (1 << cur_order) > first_dirty_pg) )
+                {
+                    if ( cur_head < first_dirty_pg )
+                        i = (first_dirty_pg - cur_head) / sizeof(*cur_head);
+                    else
+                        i = 0;
+
+                    for ( ; i < (1 << cur_order); i++ )
+                        if ( test_bit(_PGC_need_scrub,
+                                      &cur_head[i].count_info) )
+                        {
+                            first_dirty = i;
+                            break;
+                        }
+                }
+                page_list_add_scrub(cur_head, node, zone,
+                                    cur_order, first_dirty);
                 cur_head += (1 << cur_order);
                 break;
             }
@@ -919,9 +978,53 @@  static int reserve_offlined_page(struct page_info *head)
     return count;
 }
 
+static void scrub_free_pages(unsigned int node)
+{
+    struct page_info *pg;
+    unsigned int zone;
+
+    ASSERT(spin_is_locked(&heap_lock));
+
+    if ( !node_need_scrub[node] )
+        return;
+
+    for ( zone = 0; zone < NR_ZONES; zone++ )
+    {
+        unsigned int order = MAX_ORDER;
+
+        do {
+            while ( !page_list_empty(&heap(node, zone, order)) )
+            {
+                unsigned int i;
+
+                /* Unscrubbed pages are always at the end of the list. */
+                pg = page_list_last(&heap(node, zone, order));
+                if ( pg->u.free.first_dirty == INVALID_DIRTY_IDX )
+                    break;
+
+                for ( i = pg->u.free.first_dirty; i < (1U << order); i++)
+                {
+                    if ( test_bit(_PGC_need_scrub, &pg[i].count_info) )
+                    {
+                        scrub_one_page(&pg[i]);
+                        pg[i].count_info &= ~PGC_need_scrub;
+                        node_need_scrub[node]--;
+                    }
+                }
+
+                page_list_del(pg, &heap(node, zone, order));
+                page_list_add_scrub(pg, node, zone, order, INVALID_DIRTY_IDX);
+
+                if ( node_need_scrub[node] == 0 )
+                    return;
+            }
+        } while ( order-- != 0 );
+    }
+}
+
 /* Free 2^@order set of pages. */
 static void free_heap_pages(
-    struct page_info *pg, unsigned int order)
+    struct page_info *pg, unsigned int order, bool need_scrub)
 {
     unsigned long mask, mfn = page_to_mfn(pg);
     unsigned int i, node = phys_to_nid(page_to_maddr(pg)), tainted = 0;
@@ -961,10 +1064,20 @@  static void free_heap_pages(
         /* This page is not a guest frame any more. */
         page_set_owner(&pg[i], NULL); /* set_gpfn_from_mfn snoops pg owner */
         set_gpfn_from_mfn(mfn + i, INVALID_M2P_ENTRY);
+
+        if ( need_scrub )
+            pg[i].count_info |= PGC_need_scrub;
     }
 
     avail[node][zone] += 1 << order;
     total_avail_pages += 1 << order;
+    if ( need_scrub )
+    {
+        node_need_scrub[node] += 1 << order;
+        pg->u.free.first_dirty = 0;
+    }
+    else
+        pg->u.free.first_dirty = INVALID_DIRTY_IDX;
 
     if ( tmem_enabled() )
         midsize_alloc_zone_pages = max(
@@ -977,35 +1090,53 @@  static void free_heap_pages(
 
         if ( (page_to_mfn(pg) & mask) )
         {
+            struct page_info *predecessor = pg - mask;
+
             /* Merge with predecessor block? */
-            if ( !mfn_valid(_mfn(page_to_mfn(pg-mask))) ||
-                 !page_state_is(pg-mask, free) ||
-                 (PFN_ORDER(pg-mask) != order) ||
-                 (phys_to_nid(page_to_maddr(pg-mask)) != node) )
+            if ( !mfn_valid(_mfn(page_to_mfn(predecessor))) ||
+                 !page_state_is(predecessor, free) ||
+                 (PFN_ORDER(predecessor) != order) ||
+                 (phys_to_nid(page_to_maddr(predecessor)) != node) )
                 break;
-            pg -= mask;
-            page_list_del(pg, &heap(node, zone, order));
+
+            page_list_del(predecessor, &heap(node, zone, order));
+
+            if ( predecessor->u.free.first_dirty != INVALID_DIRTY_IDX )
+                need_scrub = true;
+                /* ... and keep predecessor's first_dirty. */
+            else if ( pg->u.free.first_dirty != INVALID_DIRTY_IDX )
+                predecessor->u.free.first_dirty = (1U << order) +
+                                                  pg->u.free.first_dirty;
+
+            pg = predecessor;
         }
         else
         {
+            struct page_info *successor = pg + mask;
+
             /* Merge with successor block? */
-            if ( !mfn_valid(_mfn(page_to_mfn(pg+mask))) ||
-                 !page_state_is(pg+mask, free) ||
-                 (PFN_ORDER(pg+mask) != order) ||
-                 (phys_to_nid(page_to_maddr(pg+mask)) != node) )
+            if ( !mfn_valid(_mfn(page_to_mfn(successor))) ||
+                 !page_state_is(successor, free) ||
+                 (PFN_ORDER(successor) != order) ||
+                 (phys_to_nid(page_to_maddr(successor)) != node) )
                 break;
-            page_list_del(pg + mask, &heap(node, zone, order));
+            page_list_del(successor, &heap(node, zone, order));
+
+            if ( successor->u.free.first_dirty != INVALID_DIRTY_IDX )
+                need_scrub = true;
         }
 
         order++;
     }
 
-    PFN_ORDER(pg) = order;
-    page_list_add_tail(pg, &heap(node, zone, order));
+    page_list_add_scrub(pg, node, zone, order, pg->u.free.first_dirty);
 
     if ( tainted )
         reserve_offlined_page(pg);
 
+    if ( need_scrub )
+        scrub_free_pages(node);
+
     spin_unlock(&heap_lock);
 }
 
@@ -1226,7 +1357,7 @@  unsigned int online_page(unsigned long mfn, uint32_t *status)
     spin_unlock(&heap_lock);
 
     if ( (y & PGC_state) == PGC_state_offlined )
-        free_heap_pages(pg, 0);
+        free_heap_pages(pg, 0, false);
 
     return ret;
 }
@@ -1295,7 +1426,7 @@  static void init_heap_pages(
             nr_pages -= n;
         }
 
-        free_heap_pages(pg+i, 0);
+        free_heap_pages(pg + i, 0, false);
     }
 }
 
@@ -1622,7 +1753,7 @@  void free_xenheap_pages(void *v, unsigned int order)
 
     memguard_guard_range(v, 1 << (order + PAGE_SHIFT));
 
-    free_heap_pages(virt_to_page(v), order);
+    free_heap_pages(virt_to_page(v), order, false);
 }
 
 #else
@@ -1676,12 +1807,9 @@  void free_xenheap_pages(void *v, unsigned int order)
     pg = virt_to_page(v);
 
     for ( i = 0; i < (1u << order); i++ )
-    {
-        scrub_one_page(&pg[i]);
         pg[i].count_info &= ~PGC_xen_heap;
-    }
 
-    free_heap_pages(pg, order);
+    free_heap_pages(pg, order, true);
 }
 
 #endif
@@ -1790,7 +1918,7 @@  struct page_info *alloc_domheap_pages(
     if ( d && !(memflags & MEMF_no_owner) &&
          assign_pages(d, pg, order, memflags) )
     {
-        free_heap_pages(pg, order);
+        free_heap_pages(pg, order, false);
         return NULL;
     }
     
@@ -1858,11 +1986,7 @@  void free_domheap_pages(struct page_info *pg, unsigned int order)
             scrub = 1;
         }
 
-        if ( unlikely(scrub) )
-            for ( i = 0; i < (1 << order); i++ )
-                scrub_one_page(&pg[i]);
-
-        free_heap_pages(pg, order);
+        free_heap_pages(pg, order, scrub);
     }
 
     if ( drop_dom_ref )
diff --git a/xen/include/asm-arm/mm.h b/xen/include/asm-arm/mm.h
index 13c673a..889a85e 100644
--- a/xen/include/asm-arm/mm.h
+++ b/xen/include/asm-arm/mm.h
@@ -44,7 +44,16 @@  struct page_info
         /* Page is on a free list: ((count_info & PGC_count_mask) == 0). */
         struct {
             /* Do TLBs need flushing for safety before next page use? */
-            bool_t need_tlbflush;
+            unsigned long need_tlbflush:1;
+
+            /*
+             * Index of the first *possibly* unscrubbed page in the buddy.
+             * One more than maximum possible order (MAX_ORDER+1) to
+             * accommodate INVALID_DIRTY_IDX.
+             */
+#define INVALID_DIRTY_IDX (-1UL & (((1UL<<MAX_ORDER) + 2) - 1))
+            unsigned long first_dirty:MAX_ORDER + 2;
+
         } free;
 
     } u;
@@ -107,6 +116,13 @@  struct page_info
 #define PGC_count_width   PG_shift(9)
 #define PGC_count_mask    ((1UL<<PGC_count_width)-1)
 
+/*
+ * Page needs to be scrubbed. Since this bit can only be set on a page that is
+ * free (i.e. in PGC_state_free) we can reuse PGC_allocated bit.
+ */
+#define _PGC_need_scrub   _PGC_allocated
+#define PGC_need_scrub    PGC_allocated
+
 extern unsigned long xenheap_mfn_start, xenheap_mfn_end;
 extern vaddr_t xenheap_virt_end;
 #ifdef CONFIG_ARM_64
diff --git a/xen/include/asm-x86/mm.h b/xen/include/asm-x86/mm.h
index d33599d..cd00bef 100644
--- a/xen/include/asm-x86/mm.h
+++ b/xen/include/asm-x86/mm.h
@@ -88,7 +88,15 @@  struct page_info
         /* Page is on a free list: ((count_info & PGC_count_mask) == 0). */
         struct {
             /* Do TLBs need flushing for safety before next page use? */
-            bool_t need_tlbflush;
+            unsigned long need_tlbflush:1;
+
+            /*
+             * Index of the first *possibly* unscrubbed page in the buddy.
+             * One more than maximum possible order (MAX_ORDER+1) to
+             * accommodate INVALID_DIRTY_IDX.
+             */
+#define INVALID_DIRTY_IDX (-1UL & (((1UL<<MAX_ORDER) + 2) - 1))
+            unsigned long first_dirty:MAX_ORDER + 2;
         } free;
 
     } u;
@@ -233,6 +241,13 @@  struct page_info
 #define PGC_count_width   PG_shift(9)
 #define PGC_count_mask    ((1UL<<PGC_count_width)-1)
 
+/*
+ * Page needs to be scrubbed. Since this bit can only be set on a page that is
+ * free (i.e. in PGC_state_free) we can reuse PGC_allocated bit.
+ */
+#define _PGC_need_scrub   _PGC_allocated
+#define PGC_need_scrub    PGC_allocated
+
 struct spage_info
 {
        unsigned long type_info;