
[v5,4/4] x86/ioreq server: Reset outstanding p2m_ioreq_server entries when an ioreq server unmaps.

Message ID 1468314129-28465-5-git-send-email-yu.c.zhang@linux.intel.com (mailing list archive)
State New, archived

Commit Message

Yu Zhang July 12, 2016, 9:02 a.m. UTC
This patch resets p2m_ioreq_server entries back to p2m_ram_rw after
an ioreq server has unmapped. The resync is done both asynchronously,
with the current p2m_change_entry_type_global() interface, and
synchronously, by iterating the p2m table. The synchronous reset is
necessary because we need to guarantee that the p2m table is clean
before another ioreq server is mapped. And since sweeping the p2m
table could be time consuming, it is done with hypercall continuation.
The asynchronous approach is also taken so that p2m_ioreq_server
entries can also be reset while the HVM guest is scheduled in between
hypercall continuations.

This patch also disallows live migration while there is any
outstanding p2m_ioreq_server entry left. The core reason is that our
current implementation of p2m_change_entry_type_global() cannot tell
the state of p2m_ioreq_server entries (it cannot decide whether an
entry is to be emulated or to be resynced).

Signed-off-by: Yu Zhang <yu.c.zhang@linux.intel.com>
---
Cc: Paul Durrant <paul.durrant@citrix.com>
Cc: Jan Beulich <jbeulich@suse.com>
Cc: Andrew Cooper <andrew.cooper3@citrix.com>
Cc: George Dunlap <george.dunlap@eu.citrix.com>
Cc: Jun Nakajima <jun.nakajima@intel.com>
Cc: Kevin Tian <kevin.tian@intel.com>
---
 xen/arch/x86/hvm/hvm.c    | 52 ++++++++++++++++++++++++++++++++++++++++++++---
 xen/arch/x86/mm/hap/hap.c |  9 ++++++++
 xen/arch/x86/mm/p2m-ept.c |  6 +++++-
 xen/arch/x86/mm/p2m-pt.c  | 10 +++++++--
 xen/arch/x86/mm/p2m.c     |  3 +++
 xen/include/asm-x86/p2m.h |  5 ++++-
 6 files changed, 78 insertions(+), 7 deletions(-)

Comments

Jan Beulich Aug. 8, 2016, 4:29 p.m. UTC | #1
>>> On 12.07.16 at 11:02, <yu.c.zhang@linux.intel.com> wrote:
> @@ -5512,6 +5513,12 @@ static int hvmop_set_mem_type(
>          if ( rc )
>              goto out;
>  
> +        if ( t == p2m_ram_rw && memtype[a.hvmmem_type] == p2m_ioreq_server )
> +            p2m->ioreq.entry_count++;
> +
> +        if ( t == p2m_ioreq_server && memtype[a.hvmmem_type] == p2m_ram_rw )
> +            p2m->ioreq.entry_count--;
> +

These (and others below) happen, afaict, outside of any lock, which
can't be right.

> @@ -5530,11 +5537,13 @@ static int hvmop_set_mem_type(
>  }
>  
>  static int hvmop_map_mem_type_to_ioreq_server(
> -    XEN_GUEST_HANDLE_PARAM(xen_hvm_map_mem_type_to_ioreq_server_t) uop)
> +    XEN_GUEST_HANDLE_PARAM(xen_hvm_map_mem_type_to_ioreq_server_t) uop,
> +    unsigned long *iter)
>  {
>      xen_hvm_map_mem_type_to_ioreq_server_t op;
>      struct domain *d;
>      int rc;
> +    unsigned long gfn = *iter;
>  
>      if ( copy_from_guest(&op, uop, 1) )
>          return -EFAULT;
> @@ -5559,7 +5568,42 @@ static int hvmop_map_mem_type_to_ioreq_server(
>      if ( rc != 0 )
>          goto out;
>  
> -    rc = hvm_map_mem_type_to_ioreq_server(d, op.id, op.type, op.flags);
> +    if ( gfn == 0 || op.flags != 0 )
> +        rc = hvm_map_mem_type_to_ioreq_server(d, op.id, op.type, op.flags);

Couldn't you omit the right side of the || in the if()?

> +    /*
> +     * Iterate p2m table when an ioreq server unmaps from p2m_ioreq_server,
> +     * and reset the remaining p2m_ioreq_server entries back to p2m_ram_rw.
> +     */
> +    if ( op.flags == 0 && rc == 0 )
> +    {
> +        struct p2m_domain *p2m = p2m_get_hostp2m(d);
> +
> +        while ( gfn <= p2m->max_mapped_pfn )
> +        {
> +            p2m_type_t t;
> +
> +            if ( p2m->ioreq.entry_count == 0 )
> +                break;

Perhaps better to be moved up into the while() expression?

> +            get_gfn_unshare(d, gfn, &t);
> +
> +            if ( (t == p2m_ioreq_server) &&
> +                 !(p2m_change_type_one(d, gfn, t, p2m_ram_rw)) )

Stray parentheses.

> +                p2m->ioreq.entry_count--;
> +
> +            put_gfn(d, gfn);
> +
> +            /* Check for continuation if it's not the last iteration. */
> +            if ( ++gfn <= p2m->max_mapped_pfn &&
> +                 !(gfn & HVMOP_op_mask) &&
> +                 hypercall_preempt_check() )
> +            {
> +                rc = -ERESTART;
> +                goto out;

I think you need to write gfn back to *iter.

Jan
Yu Zhang Aug. 9, 2016, 7:39 a.m. UTC | #2
On 8/9/2016 12:29 AM, Jan Beulich wrote:
>>>> On 12.07.16 at 11:02, <yu.c.zhang@linux.intel.com> wrote:
>> @@ -5512,6 +5513,12 @@ static int hvmop_set_mem_type(
>>           if ( rc )
>>               goto out;
>>   
>> +        if ( t == p2m_ram_rw && memtype[a.hvmmem_type] == p2m_ioreq_server )
>> +            p2m->ioreq.entry_count++;
>> +
>> +        if ( t == p2m_ioreq_server && memtype[a.hvmmem_type] == p2m_ram_rw )
>> +            p2m->ioreq.entry_count--;
>> +
> These (and others below) happen, afaict, outside of any lock, which
> can't be right.

How about we make this entry_count an atomic_t and use atomic_inc/dec
instead?

>
>> @@ -5530,11 +5537,13 @@ static int hvmop_set_mem_type(
>>   }
>>   
>>   static int hvmop_map_mem_type_to_ioreq_server(
>> -    XEN_GUEST_HANDLE_PARAM(xen_hvm_map_mem_type_to_ioreq_server_t) uop)
>> +    XEN_GUEST_HANDLE_PARAM(xen_hvm_map_mem_type_to_ioreq_server_t) uop,
>> +    unsigned long *iter)
>>   {
>>       xen_hvm_map_mem_type_to_ioreq_server_t op;
>>       struct domain *d;
>>       int rc;
>> +    unsigned long gfn = *iter;
>>   
>>       if ( copy_from_guest(&op, uop, 1) )
>>           return -EFAULT;
>> @@ -5559,7 +5568,42 @@ static int hvmop_map_mem_type_to_ioreq_server(
>>       if ( rc != 0 )
>>           goto out;
>>   
>> -    rc = hvm_map_mem_type_to_ioreq_server(d, op.id, op.type, op.flags);
>> +    if ( gfn == 0 || op.flags != 0 )
>> +        rc = hvm_map_mem_type_to_ioreq_server(d, op.id, op.type, op.flags);
> Couldn't you omit the right side of the || in the if()?

Oh right, it is redundant. Thanks! :-)

>
>> +    /*
>> +     * Iterate p2m table when an ioreq server unmaps from p2m_ioreq_server,
>> +     * and reset the remaining p2m_ioreq_server entries back to p2m_ram_rw.
>> +     */
>> +    if ( op.flags == 0 && rc == 0 )
>> +    {
>> +        struct p2m_domain *p2m = p2m_get_hostp2m(d);
>> +
>> +        while ( gfn <= p2m->max_mapped_pfn )
>> +        {
>> +            p2m_type_t t;
>> +
>> +            if ( p2m->ioreq.entry_count == 0 )
>> +                break;
> Perhaps better to be moved up into the while() expression?

OK. Another thing: maybe we can use _atomic_read() to get the value of
entry_count, if the counter is defined as atomic_t?

>> +            get_gfn_unshare(d, gfn, &t);
>> +
>> +            if ( (t == p2m_ioreq_server) &&
>> +                 !(p2m_change_type_one(d, gfn, t, p2m_ram_rw)) )
> Stray parentheses.

Got it. Thanks!

>> +                p2m->ioreq.entry_count--;
>> +
>> +            put_gfn(d, gfn);
>> +
>> +            /* Check for continuation if it's not the last iteration. */
>> +            if ( ++gfn <= p2m->max_mapped_pfn &&
>> +                 !(gfn & HVMOP_op_mask) &&
>> +                 hypercall_preempt_check() )
>> +            {
>> +                rc = -ERESTART;
>> +                goto out;
> I think you need to write gfn back to *iter.

Yes, thanks for pointing out.

Yu
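
For illustration, a minimal sketch of the sweep loop with the remarks above
folded in - the entry_count check moved into the while() condition, the
stray parentheses dropped, and gfn written back to *iter before returning
-ERESTART. This is only a sketch of the fixes being discussed, not the v6
code:

        while ( gfn <= p2m->max_mapped_pfn && p2m->ioreq.entry_count )
        {
            p2m_type_t t;

            get_gfn_unshare(d, gfn, &t);

            if ( t == p2m_ioreq_server &&
                 !p2m_change_type_one(d, gfn, t, p2m_ram_rw) )
                p2m->ioreq.entry_count--;

            put_gfn(d, gfn);

            /* Check for continuation if it's not the last iteration. */
            if ( ++gfn <= p2m->max_mapped_pfn &&
                 !(gfn & HVMOP_op_mask) &&
                 hypercall_preempt_check() )
            {
                *iter = gfn;      /* record progress for the continuation */
                rc = -ERESTART;
                goto out;
            }
        }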
Jan Beulich Aug. 9, 2016, 8:13 a.m. UTC | #3
>>> On 09.08.16 at 09:39, <yu.c.zhang@linux.intel.com> wrote:

> 
> On 8/9/2016 12:29 AM, Jan Beulich wrote:
>>>>> On 12.07.16 at 11:02, <yu.c.zhang@linux.intel.com> wrote:
>>> @@ -5512,6 +5513,12 @@ static int hvmop_set_mem_type(
>>>           if ( rc )
>>>               goto out;
>>>   
>>> +        if ( t == p2m_ram_rw && memtype[a.hvmmem_type] == p2m_ioreq_server )
>>> +            p2m->ioreq.entry_count++;
>>> +
>>> +        if ( t == p2m_ioreq_server && memtype[a.hvmmem_type] == p2m_ram_rw )
>>> +            p2m->ioreq.entry_count--;
>>> +
>> These (and others below) happen, afaict, outside of any lock, which
>> can't be right.
> 
> How about we make this entry_count as atomic_t and use atomic_inc/dec 
> instead?

That's certainly one of the options, but beware of overflow.

>>> +    /*
>>> +     * Iterate p2m table when an ioreq server unmaps from p2m_ioreq_server,
>>> +     * and reset the remaining p2m_ioreq_server entries back to p2m_ram_rw.
>>> +     */
>>> +    if ( op.flags == 0 && rc == 0 )
>>> +    {
>>> +        struct p2m_domain *p2m = p2m_get_hostp2m(d);
>>> +
>>> +        while ( gfn <= p2m->max_mapped_pfn )
>>> +        {
>>> +            p2m_type_t t;
>>> +
>>> +            if ( p2m->ioreq.entry_count == 0 )
>>> +                break;
>> Perhaps better to be moved up into the while() expression?
> 
> OK. Another thing is maybe we can use _atomic_read() to get value of 
> entry_count if
> the counter is defined as atomic_t?

Well, that's an orthogonal adjustment which solely depends on the
type of the field.

Jan
Yu Zhang Aug. 9, 2016, 9:25 a.m. UTC | #4
On 8/9/2016 4:13 PM, Jan Beulich wrote:
>>>> On 09.08.16 at 09:39, <yu.c.zhang@linux.intel.com> wrote:
>> On 8/9/2016 12:29 AM, Jan Beulich wrote:
>>>>>> On 12.07.16 at 11:02, <yu.c.zhang@linux.intel.com> wrote:
>>>> @@ -5512,6 +5513,12 @@ static int hvmop_set_mem_type(
>>>>            if ( rc )
>>>>                goto out;
>>>>    
>>>> +        if ( t == p2m_ram_rw && memtype[a.hvmmem_type] == p2m_ioreq_server )
>>>> +            p2m->ioreq.entry_count++;
>>>> +
>>>> +        if ( t == p2m_ioreq_server && memtype[a.hvmmem_type] == p2m_ram_rw )
>>>> +            p2m->ioreq.entry_count--;
>>>> +
>>> These (and others below) happen, afaict, outside of any lock, which
>>> can't be right.
>> How about we make this entry_count as atomic_t and use atomic_inc/dec
>> instead?
> That's certainly one of the options, but beware of overflow.

Oh, thanks for the reminder, Jan. I just found that atomic_t is defined
as "typedef struct { int counter; } atomic_t;", which does have overflow
issues if entry_count is supposed to be a uint32 or uint64.

Now I have another thought: the entry_count is referenced in 3
different scenarios:

1> the hvmop handlers - hvmop_set_mem_type() and
hvmop_map_mem_type_to_ioreq_server(), which are triggered by the device
model and will not be concurrent. And hvmop_set_mem_type() will permit
the mem type change only when an ioreq server has already been mapped
to this type.

2> the misconfiguration handlers - resolve_misconfig() and do_recalc(),
which are triggered by the HVM guest's vm exits, yet this will only
happen after the ioreq server has already been unmapped. This means the
accesses to the entry_count will not be concurrent with the above hvmop
handlers.

3> routine hap_enable_log_dirty() - this can be triggered by the tool
stack at any time, but it only reads the entry_count, so a
read_atomic() would work.

Do you think this analysis is reasonable?
If so, we may only use read_atomic() to get the value of entry_count in
hap_enable_log_dirty() and keep the other places as they are.

Of course, some explanatory comments at entry_count's definition would
be necessary. :-)

Yu
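
As a rough sketch of the read-only use in scenario 3 - assuming the check
stays as in this patch's hap_enable_log_dirty() hunk and merely gains an
atomic read via Xen's read_atomic() accessor:

    /*
     * Refuse to turn on global log-dirty mode if there are outstanding
     * p2m_ioreq_server entries; read_atomic() keeps the unlocked read
     * from being torn, though it does not order it against writers.
     */
    if ( log_global && read_atomic(&p2m->ioreq.entry_count) > 0 )
        return -EBUSY;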
Jan Beulich Aug. 9, 2016, 9:45 a.m. UTC | #5
>>> On 09.08.16 at 11:25, <yu.c.zhang@linux.intel.com> wrote:

> 
> On 8/9/2016 4:13 PM, Jan Beulich wrote:
>>>>> On 09.08.16 at 09:39, <yu.c.zhang@linux.intel.com> wrote:
>>> On 8/9/2016 12:29 AM, Jan Beulich wrote:
>>>>>>> On 12.07.16 at 11:02, <yu.c.zhang@linux.intel.com> wrote:
>>>>> @@ -5512,6 +5513,12 @@ static int hvmop_set_mem_type(
>>>>>            if ( rc )
>>>>>                goto out;
>>>>>    
>>>>> +        if ( t == p2m_ram_rw && memtype[a.hvmmem_type] == p2m_ioreq_server )
>>>>> +            p2m->ioreq.entry_count++;
>>>>> +
>>>>> +        if ( t == p2m_ioreq_server && memtype[a.hvmmem_type] == p2m_ram_rw )
>>>>> +            p2m->ioreq.entry_count--;
>>>>> +
>>>> These (and others below) happen, afaict, outside of any lock, which
>>>> can't be right.
>>> How about we make this entry_count as atomic_t and use atomic_inc/dec
>>> instead?
>> That's certainly one of the options, but beware of overflow.
> 
> Oh, thanks for your remind, Jan. I just found that atomic_t is defined 
> as  "typedef struct { int counter; } atomic_t;"
> which do have overflow issues if entry_count is supposed to be a uint32 
> or uint64.
> 
> Now I have another thought: the entry_count is referenced in 3 
> different  scenarios:
> 1> the hvmop handlers -  hvmop_set_mem_type() and 
> hvmop_map_mem_type_to_ioreq_server(),
> which  are triggered by device model, and will not be concurrent. And 
> hvmop_set_mem_type() will
> permit the mem type change only when an ioreq server has already been 
> mapped to this type.

You shouldn't rely on a well-behaved dm, and that's already
leaving aside the question of whether there couldn't even be
well-behaved use cases of parallel invocations of this op.

> 2> the misconfiguration handlers - resolve_misconfig() or do_recalc(),  
> which are triggered by HVM's
> vm exit, yet this will only happen after the ioreq server has already 
> been unmapped. This means the
> accesses to the entry_count will not be concurrent with the above hvmop 
> handlers.

This case may be fine, but not for (just) the named reason:
Multiple misconfig invocations can happen at the same time, but
presumably your addition is inside the p2m-locked region (you'd
have to double check that).

> 3> routine hap_enable_log_dirty() - this can be triggered by tool stack 
> at any time, but only by read
> access to this entry_count, so a read_atomic() would work.

A read access may be fine, but only if the value can't get incremented
in a racy way.

Jan
Yu Zhang Aug. 9, 2016, 10:21 a.m. UTC | #6
On 8/9/2016 5:45 PM, Jan Beulich wrote:
>>>> On 09.08.16 at 11:25, <yu.c.zhang@linux.intel.com> wrote:
>> On 8/9/2016 4:13 PM, Jan Beulich wrote:
>>>>>> On 09.08.16 at 09:39, <yu.c.zhang@linux.intel.com> wrote:
>>>> On 8/9/2016 12:29 AM, Jan Beulich wrote:
>>>>>>>> On 12.07.16 at 11:02, <yu.c.zhang@linux.intel.com> wrote:
>>>>>> @@ -5512,6 +5513,12 @@ static int hvmop_set_mem_type(
>>>>>>             if ( rc )
>>>>>>                 goto out;
>>>>>>     
>>>>>> +        if ( t == p2m_ram_rw && memtype[a.hvmmem_type] == p2m_ioreq_server )
>>>>>> +            p2m->ioreq.entry_count++;
>>>>>> +
>>>>>> +        if ( t == p2m_ioreq_server && memtype[a.hvmmem_type] == p2m_ram_rw )
>>>>>> +            p2m->ioreq.entry_count--;
>>>>>> +
>>>>> These (and others below) happen, afaict, outside of any lock, which
>>>>> can't be right.
>>>> How about we make this entry_count as atomic_t and use atomic_inc/dec
>>>> instead?
>>> That's certainly one of the options, but beware of overflow.
>> Oh, thanks for your remind, Jan. I just found that atomic_t is defined
>> as  "typedef struct { int counter; } atomic_t;"
>> which do have overflow issues if entry_count is supposed to be a uint32
>> or uint64.
>>
>> Now I have another thought: the entry_count is referenced in 3
>> different  scenarios:
>> 1> the hvmop handlers -  hvmop_set_mem_type() and
>> hvmop_map_mem_type_to_ioreq_server(),
>> which  are triggered by device model, and will not be concurrent. And
>> hvmop_set_mem_type() will
>> permit the mem type change only when an ioreq server has already been
>> mapped to this type.
> You shouldn't rely on a well behaved dm, and that's already
> leaving aside the question whether there couldn't even be well
> behaved use cases of parallel invocations of this op.

Oh. This is a good point. Thanks. :-)

>> 2> the misconfiguration handlers - resolve_misconfig() or do_recalc(),
>> which are triggered by HVM's
>> vm exit, yet this will only happen after the ioreq server has already
>> been unmapped. This means the
>> accesses to the entry_count will not be concurrent with the above hvmop
>> handlers.
> This case may be fine, but not for (just) the named reason:
> Multiple misconfig invocations can happen at the same time, but
> presumably your addition is inside the p2m-locked region (you'd
> have to double check that).

Yes, both are wrapped in the p2m-locked region. So this gives me
another thought: if we put the increment/decrement of entry_count
inside p2m_change_type_one(), which does p2m_lock/p2m_unlock
internally, we could avoid introducing another spinlock, and also avoid
the overflow concern that came with using atomic_t.

[snip]

Thanks
Yu
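
A hypothetical sketch of that idea - the counting done inside
p2m_change_type_one(), i.e. within its gfn-locked (p2m-locked) region, so
that neither a new spinlock nor atomic_t is needed. The body is abbreviated;
only the counting lines are new, and their exact placement is an assumption:

    int p2m_change_type_one(struct domain *d, unsigned long gfn,
                            p2m_type_t ot, p2m_type_t nt)
    {
        struct p2m_domain *p2m = p2m_get_hostp2m(d);
        int rc;

        gfn_lock(p2m, gfn, 0);

        /* ... existing entry lookup and p2m->set_entry() call ... */
        rc = 0; /* stands in for the existing return value */

        /* New: track outstanding p2m_ioreq_server entries under the lock. */
        if ( !rc )
        {
            if ( nt == p2m_ioreq_server && ot != p2m_ioreq_server )
                p2m->ioreq.entry_count++;
            else if ( ot == p2m_ioreq_server && nt != p2m_ioreq_server )
                p2m->ioreq.entry_count--;
        }

        gfn_unlock(p2m, gfn, 0);

        return rc;
    }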
George Dunlap Aug. 16, 2016, 1:35 p.m. UTC | #7
On 12/07/16 10:02, Yu Zhang wrote:
> This patch resets p2m_ioreq_server entries back to p2m_ram_rw,
> after an ioreq server has unmapped. The resync is done both
> asynchronously with the current p2m_change_entry_type_global()
> interface, and synchronously by iterating the p2m table. The
> synchronous resetting is necessary because we need to guarantee
> the p2m table is clean before another ioreq server is mapped.
> And since the sweeping of p2m table could be time consuming,
> it is done with hypercall continuation. Asynchronous approach
> is also taken so that p2m_ioreq_server entries can also be reset
> when the HVM is scheduled between hypercall continuations.
> 
> This patch also disallows live migration, when there's still any
> outstanding p2m_ioreq_server entry left. The core reason is our
> current implementation of p2m_change_entry_type_global() can not
> tell the state of p2m_ioreq_server entries(can not decide if an
> entry is to be emulated or to be resynced).
> 
> Signed-off-by: Yu Zhang <yu.c.zhang@linux.intel.com>

Thanks for doing this Yu Zhang!  A couple of comments.

> ---
> Cc: Paul Durrant <paul.durrant@citrix.com>
> Cc: Jan Beulich <jbeulich@suse.com>
> Cc: Andrew Cooper <andrew.cooper3@citrix.com>
> Cc: George Dunlap <george.dunlap@eu.citrix.com>
> Cc: Jun Nakajima <jun.nakajima@intel.com>
> Cc: Kevin Tian <kevin.tian@intel.com>
> ---
>  xen/arch/x86/hvm/hvm.c    | 52 ++++++++++++++++++++++++++++++++++++++++++++---
>  xen/arch/x86/mm/hap/hap.c |  9 ++++++++
>  xen/arch/x86/mm/p2m-ept.c |  6 +++++-
>  xen/arch/x86/mm/p2m-pt.c  | 10 +++++++--
>  xen/arch/x86/mm/p2m.c     |  3 +++
>  xen/include/asm-x86/p2m.h |  5 ++++-
>  6 files changed, 78 insertions(+), 7 deletions(-)
> 
> diff --git a/xen/arch/x86/hvm/hvm.c b/xen/arch/x86/hvm/hvm.c
> index 4d98cc6..e57c8b9 100644
> --- a/xen/arch/x86/hvm/hvm.c
> +++ b/xen/arch/x86/hvm/hvm.c
> @@ -5485,6 +5485,7 @@ static int hvmop_set_mem_type(
>      {
>          unsigned long pfn = a.first_pfn + start_iter;
>          p2m_type_t t;
> +        struct p2m_domain *p2m = p2m_get_hostp2m(d);
>  
>          get_gfn_unshare(d, pfn, &t);
>          if ( p2m_is_paging(t) )
> @@ -5512,6 +5513,12 @@ static int hvmop_set_mem_type(
>          if ( rc )
>              goto out;
>  
> +        if ( t == p2m_ram_rw && memtype[a.hvmmem_type] == p2m_ioreq_server )
> +            p2m->ioreq.entry_count++;
> +
> +        if ( t == p2m_ioreq_server && memtype[a.hvmmem_type] == p2m_ram_rw )
> +            p2m->ioreq.entry_count--;

Changing these here might make sense if they were *only* changed in the
hvm code; but as you also have to modify this value in the p2m code (in
resolve_misconfig), I think it makes sense to try to do all the counting
in the p2m code.  That would take care of any locking issues as well.

Logically the most sensible place to do it would be
atomic_write_ept_entry(); that would make it basically impossible to
miss a case where we change to of from p2m_ioreq_server.

On the other hand, it would mean adding code to a core path for all p2m
updates.

> +
>          /* Check for continuation if it's not the last interation */
>          if ( a.nr > ++start_iter && !(start_iter & HVMOP_op_mask) &&
>               hypercall_preempt_check() )
> @@ -5530,11 +5537,13 @@ static int hvmop_set_mem_type(
>  }
>  
>  static int hvmop_map_mem_type_to_ioreq_server(
> -    XEN_GUEST_HANDLE_PARAM(xen_hvm_map_mem_type_to_ioreq_server_t) uop)
> +    XEN_GUEST_HANDLE_PARAM(xen_hvm_map_mem_type_to_ioreq_server_t) uop,
> +    unsigned long *iter)
>  {
>      xen_hvm_map_mem_type_to_ioreq_server_t op;
>      struct domain *d;
>      int rc;
> +    unsigned long gfn = *iter;
>  
>      if ( copy_from_guest(&op, uop, 1) )
>          return -EFAULT;
> @@ -5559,7 +5568,42 @@ static int hvmop_map_mem_type_to_ioreq_server(
>      if ( rc != 0 )
>          goto out;
>  
> -    rc = hvm_map_mem_type_to_ioreq_server(d, op.id, op.type, op.flags);
> +    if ( gfn == 0 || op.flags != 0 )
> +        rc = hvm_map_mem_type_to_ioreq_server(d, op.id, op.type, op.flags);
> +
> +    /*
> +     * Iterate p2m table when an ioreq server unmaps from p2m_ioreq_server,
> +     * and reset the remaining p2m_ioreq_server entries back to p2m_ram_rw.
> +     */
> +    if ( op.flags == 0 && rc == 0 )
> +    {
> +        struct p2m_domain *p2m = p2m_get_hostp2m(d);
> +
> +        while ( gfn <= p2m->max_mapped_pfn )
> +        {
> +            p2m_type_t t;
> +
> +            if ( p2m->ioreq.entry_count == 0 )
> +                break;

Any reason not to make this part of the while() condition?

> +
> +            get_gfn_unshare(d, gfn, &t);

This will completely unshare all pages in a domain below the last
dangling p2m_ioreq_server page.  I don't think unsharing is necessary at
all; if it's shared, it certainly won't be of type p2m_ioreq_server.

Actually -- it seems like ept_get_entry() really should be calling
resolve_misconfig(), the same way that ept_set_entry() does.  In that
case, simply going through and reading all the p2m entries would suffice
to finish off the type change.

Although really, it seems like having a "p2m_finish_type_change()"
function which looked for misconfigured entries and reset them would be
a step closer to the right direction, in that it could be re-used in
other situations where the type change may not have finished.

Jan / Yu Zhang, thoughts?

 -George
Jan Beulich Aug. 16, 2016, 1:54 p.m. UTC | #8
>>> On 16.08.16 at 15:35, <george.dunlap@citrix.com> wrote:
> Although really, it seems like having a "p2m_finish_type_change()"
> function which looked for misconfigured entries and reset them would be
> a step closer to the right direction, in that it could be re-used in
> other situations where the type change may not have finished.

That's a good idea imo.

Jan
Yu Zhang Aug. 30, 2016, 12:17 p.m. UTC | #9
On 8/16/2016 9:35 PM, George Dunlap wrote:
> On 12/07/16 10:02, Yu Zhang wrote:
>> This patch resets p2m_ioreq_server entries back to p2m_ram_rw,
>> after an ioreq server has unmapped. The resync is done both
>> asynchronously with the current p2m_change_entry_type_global()
>> interface, and synchronously by iterating the p2m table. The
>> synchronous resetting is necessary because we need to guarantee
>> the p2m table is clean before another ioreq server is mapped.
>> And since the sweeping of p2m table could be time consuming,
>> it is done with hypercall continuation. Asynchronous approach
>> is also taken so that p2m_ioreq_server entries can also be reset
>> when the HVM is scheduled between hypercall continuations.
>>
>> This patch also disallows live migration, when there's still any
>> outstanding p2m_ioreq_server entry left. The core reason is our
>> current implementation of p2m_change_entry_type_global() can not
>> tell the state of p2m_ioreq_server entries(can not decide if an
>> entry is to be emulated or to be resynced).
>>
>> Signed-off-by: Yu Zhang <yu.c.zhang@linux.intel.com>
> Thanks for doing this Yu Zhang!  A couple of comments.

Thanks a lot for your advice, George.
And sorry for the delayed reply (I had been on annual leave).

>> ---
>> Cc: Paul Durrant <paul.durrant@citrix.com>
>> Cc: Jan Beulich <jbeulich@suse.com>
>> Cc: Andrew Cooper <andrew.cooper3@citrix.com>
>> Cc: George Dunlap <george.dunlap@eu.citrix.com>
>> Cc: Jun Nakajima <jun.nakajima@intel.com>
>> Cc: Kevin Tian <kevin.tian@intel.com>
>> ---
>>   xen/arch/x86/hvm/hvm.c    | 52 ++++++++++++++++++++++++++++++++++++++++++++---
>>   xen/arch/x86/mm/hap/hap.c |  9 ++++++++
>>   xen/arch/x86/mm/p2m-ept.c |  6 +++++-
>>   xen/arch/x86/mm/p2m-pt.c  | 10 +++++++--
>>   xen/arch/x86/mm/p2m.c     |  3 +++
>>   xen/include/asm-x86/p2m.h |  5 ++++-
>>   6 files changed, 78 insertions(+), 7 deletions(-)
>>
>> diff --git a/xen/arch/x86/hvm/hvm.c b/xen/arch/x86/hvm/hvm.c
>> index 4d98cc6..e57c8b9 100644
>> --- a/xen/arch/x86/hvm/hvm.c
>> +++ b/xen/arch/x86/hvm/hvm.c
>> @@ -5485,6 +5485,7 @@ static int hvmop_set_mem_type(
>>       {
>>           unsigned long pfn = a.first_pfn + start_iter;
>>           p2m_type_t t;
>> +        struct p2m_domain *p2m = p2m_get_hostp2m(d);
>>   
>>           get_gfn_unshare(d, pfn, &t);
>>           if ( p2m_is_paging(t) )
>> @@ -5512,6 +5513,12 @@ static int hvmop_set_mem_type(
>>           if ( rc )
>>               goto out;
>>   
>> +        if ( t == p2m_ram_rw && memtype[a.hvmmem_type] == p2m_ioreq_server )
>> +            p2m->ioreq.entry_count++;
>> +
>> +        if ( t == p2m_ioreq_server && memtype[a.hvmmem_type] == p2m_ram_rw )
>> +            p2m->ioreq.entry_count--;
> Changing these here might make sense if they were *only* changed in the
> hvm code; but as you also have to modify this value in the p2m code (in
> resolve_misconfig), I think it makes sense to try to do all the counting
> in the p2m code.  That would take care of any locking issues as well.
>
> Logically the most sensible place to do it would be
> atomic_write_ept_entry(); that would make it basically impossible to
> miss a case where we change to of from p2m_ioreq_server.
>
> On the other hand, it would mean adding code to a core path for all p2m
> updates.

Well, atomic_write_ept_entry() is a rather core path, and it is
EPT-only. Although p2m_ioreq_server is not accepted in the shadow page
table code, I'd still like to support it on AMD SVM - which is also
consistent with the p2m_change_entry_type_global() functionality. :)

I am considering p2m_change_type_one(), which has gfn_lock/unlock()
inside. And since, previously, p2m type changes to/from
p2m_ioreq_server are only triggered by the hvmop that leads to
p2m_change_type_one(), I believe counting the entries there would make
sense.

Besides, resolve_misconfig()/do_recalc() still need to decrease the
entry_count for p2m_ioreq_server, but since both routines have the p2m
locked by their callers, that is also OK.

Other places that read entry_count can be protected by
read_atomic(&p2m->ioreq.entry_count).

Do you think this is acceptable?

>> +
>>           /* Check for continuation if it's not the last interation */
>>           if ( a.nr > ++start_iter && !(start_iter & HVMOP_op_mask) &&
>>                hypercall_preempt_check() )
>> @@ -5530,11 +5537,13 @@ static int hvmop_set_mem_type(
>>   }
>>   
>>   static int hvmop_map_mem_type_to_ioreq_server(
>> -    XEN_GUEST_HANDLE_PARAM(xen_hvm_map_mem_type_to_ioreq_server_t) uop)
>> +    XEN_GUEST_HANDLE_PARAM(xen_hvm_map_mem_type_to_ioreq_server_t) uop,
>> +    unsigned long *iter)
>>   {
>>       xen_hvm_map_mem_type_to_ioreq_server_t op;
>>       struct domain *d;
>>       int rc;
>> +    unsigned long gfn = *iter;
>>   
>>       if ( copy_from_guest(&op, uop, 1) )
>>           return -EFAULT;
>> @@ -5559,7 +5568,42 @@ static int hvmop_map_mem_type_to_ioreq_server(
>>       if ( rc != 0 )
>>           goto out;
>>   
>> -    rc = hvm_map_mem_type_to_ioreq_server(d, op.id, op.type, op.flags);
>> +    if ( gfn == 0 || op.flags != 0 )
>> +        rc = hvm_map_mem_type_to_ioreq_server(d, op.id, op.type, op.flags);
>> +
>> +    /*
>> +     * Iterate p2m table when an ioreq server unmaps from p2m_ioreq_server,
>> +     * and reset the remaining p2m_ioreq_server entries back to p2m_ram_rw.
>> +     */
>> +    if ( op.flags == 0 && rc == 0 )
>> +    {
>> +        struct p2m_domain *p2m = p2m_get_hostp2m(d);
>> +
>> +        while ( gfn <= p2m->max_mapped_pfn )
>> +        {
>> +            p2m_type_t t;
>> +
>> +            if ( p2m->ioreq.entry_count == 0 )
>> +                break;
> Any reason not to make this part of the while() condition?
>
>> +
>> +            get_gfn_unshare(d, gfn, &t);
> This will completely unshare all pages in a domain below the last
> dangling p2m_ioreq_server page.  I don't think unsharing is necessary at
> all; if it's shared, it certainly won't be of type p2m_ioreq_server.
>
> Actually -- it seems like ept_get_entry() really should be calling
> resolve_misconfig(), the same way that ept_set_entry() does.  In that
> case, simply going through and reading all the p2m entries would suffice
> to finish off the type change.

Well, I have doubts about this - resolve_misconfig() is supposed to
change the p2m table, and should probably be protected by the p2m lock.
But ept_get_entry() may be triggered by routines like
get_gfn_query_unlocked().

Besides, although calling resolve_misconfig() would speed up the p2m
type sweeping, it also adds cost each time we peek at the p2m table.

>
> Although really, it seems like having a "p2m_finish_type_change()"
> function which looked for misconfigured entries and reset them would be
> a step closer to the right direction, in that it could be re-used in
> other situations where the type change may not have finished.

Thanks. This sounds like a good idea. I'd like to define the interface
of this routine like this:

  void p2m_finish_type_change(struct domain *d,
                              unsigned long start, unsigned long end,
                              p2m_type_t ot, p2m_type_t nt)

so it can be triggered by hvmop_map_mem_type_to_ioreq_server() with the
hypercall continuation method. Do you think this is OK?

In fact, I have worked out a new version of this patch according to my
understanding; if you have no further disagreement on the above issues,
I'll send out the new patch soon, after some testing in a XenGT
environment. :)

Thanks
Yu
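
As a rough sketch (not the v6 code), the proposed interface could be
implemented along these lines, reusing p2m_change_type_one() per GFN and
leaving preemption to the caller; the loop details here are an assumption:

    void p2m_finish_type_change(struct domain *d,
                                unsigned long start, unsigned long end,
                                p2m_type_t ot, p2m_type_t nt)
    {
        struct p2m_domain *p2m = p2m_get_hostp2m(d);
        unsigned long gfn;

        p2m_lock(p2m);

        end = min(end, p2m->max_mapped_pfn + 1);
        for ( gfn = start; gfn < end; gfn++ )
        {
            p2m_type_t t;

            get_gfn_query_unlocked(d, gfn, &t);
            if ( t == ot )
                p2m_change_type_one(d, gfn, t, nt);
        }

        p2m_unlock(p2m);
    }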
Yu Zhang Sept. 2, 2016, 11 a.m. UTC | #10
On 8/30/2016 8:17 PM, Yu Zhang wrote:
>
>
> On 8/16/2016 9:35 PM, George Dunlap wrote:
>> On 12/07/16 10:02, Yu Zhang wrote:
>>> This patch resets p2m_ioreq_server entries back to p2m_ram_rw,
>>> after an ioreq server has unmapped. The resync is done both
>>> asynchronously with the current p2m_change_entry_type_global()
>>> interface, and synchronously by iterating the p2m table. The
>>> synchronous resetting is necessary because we need to guarantee
>>> the p2m table is clean before another ioreq server is mapped.
>>> And since the sweeping of p2m table could be time consuming,
>>> it is done with hypercall continuation. Asynchronous approach
>>> is also taken so that p2m_ioreq_server entries can also be reset
>>> when the HVM is scheduled between hypercall continuations.
>>>
>>> This patch also disallows live migration, when there's still any
>>> outstanding p2m_ioreq_server entry left. The core reason is our
>>> current implementation of p2m_change_entry_type_global() can not
>>> tell the state of p2m_ioreq_server entries(can not decide if an
>>> entry is to be emulated or to be resynced).
>>>
>>> Signed-off-by: Yu Zhang <yu.c.zhang@linux.intel.com>
>> Thanks for doing this Yu Zhang!  A couple of comments.
>
> Thanks a lot for your advice, George.
> And sorry for the delayed reply(had been in annual leave previously).
>
>>> ---
>>> Cc: Paul Durrant <paul.durrant@citrix.com>
>>> Cc: Jan Beulich <jbeulich@suse.com>
>>> Cc: Andrew Cooper <andrew.cooper3@citrix.com>
>>> Cc: George Dunlap <george.dunlap@eu.citrix.com>
>>> Cc: Jun Nakajima <jun.nakajima@intel.com>
>>> Cc: Kevin Tian <kevin.tian@intel.com>
>>> ---
>>>   xen/arch/x86/hvm/hvm.c    | 52 
>>> ++++++++++++++++++++++++++++++++++++++++++++---
>>>   xen/arch/x86/mm/hap/hap.c |  9 ++++++++
>>>   xen/arch/x86/mm/p2m-ept.c |  6 +++++-
>>>   xen/arch/x86/mm/p2m-pt.c  | 10 +++++++--
>>>   xen/arch/x86/mm/p2m.c     |  3 +++
>>>   xen/include/asm-x86/p2m.h |  5 ++++-
>>>   6 files changed, 78 insertions(+), 7 deletions(-)
>>>
>>> diff --git a/xen/arch/x86/hvm/hvm.c b/xen/arch/x86/hvm/hvm.c
>>> index 4d98cc6..e57c8b9 100644
>>> --- a/xen/arch/x86/hvm/hvm.c
>>> +++ b/xen/arch/x86/hvm/hvm.c
>>> @@ -5485,6 +5485,7 @@ static int hvmop_set_mem_type(
>>>       {
>>>           unsigned long pfn = a.first_pfn + start_iter;
>>>           p2m_type_t t;
>>> +        struct p2m_domain *p2m = p2m_get_hostp2m(d);
>>>             get_gfn_unshare(d, pfn, &t);
>>>           if ( p2m_is_paging(t) )
>>> @@ -5512,6 +5513,12 @@ static int hvmop_set_mem_type(
>>>           if ( rc )
>>>               goto out;
>>>   +        if ( t == p2m_ram_rw && memtype[a.hvmmem_type] == 
>>> p2m_ioreq_server )
>>> +            p2m->ioreq.entry_count++;
>>> +
>>> +        if ( t == p2m_ioreq_server && memtype[a.hvmmem_type] == 
>>> p2m_ram_rw )
>>> +            p2m->ioreq.entry_count--;
>> Changing these here might make sense if they were *only* changed in the
>> hvm code; but as you also have to modify this value in the p2m code (in
>> resolve_misconfig), I think it makes sense to try to do all the counting
>> in the p2m code.  That would take care of any locking issues as well.
>>
>> Logically the most sensible place to do it would be
>> atomic_write_ept_entry(); that would make it basically impossible to
>> miss a case where we change to of from p2m_ioreq_server.
>>
>> On the other hand, it would mean adding code to a core path for all p2m
>> updates.
>
> Well, the atomic_write_ept_entry() is a rather core path, and is only 
> for ept. Although p2m_ioreq_server is
> not accepted in shadow page table code, I'd still like to support it 
> in AMD SVM - also consistent with the
> p2m_change_entry_type_global() functionality. :)
>
> I am considering p2m_change_type_one(), which has gfn_lock/unlock() 
> inside. And since previously the p2m
> type changes to/from p2m_ioreq_server are only triggered by hvmop 
> which leads to p2m_change_type_one(),
> I believe calculating the entry_count here would make sense.
>
> Besides, the resolve_misconfig()/do_recalc() still need to decrease 
> the entry_count for p2m_ioreq_server,
> but since both routines have p2m locked by their caller, so it's also OK.
>
> Other places that read entry_count can be protected by 
> read_atomic(&p2m->ioreq.entry_count).
>
> Do you think this acceptable?
>
>>> +
>>>           /* Check for continuation if it's not the last interation */
>>>           if ( a.nr > ++start_iter && !(start_iter & HVMOP_op_mask) &&
>>>                hypercall_preempt_check() )
>>> @@ -5530,11 +5537,13 @@ static int hvmop_set_mem_type(
>>>   }
>>>     static int hvmop_map_mem_type_to_ioreq_server(
>>> - XEN_GUEST_HANDLE_PARAM(xen_hvm_map_mem_type_to_ioreq_server_t) uop)
>>> + XEN_GUEST_HANDLE_PARAM(xen_hvm_map_mem_type_to_ioreq_server_t) uop,
>>> +    unsigned long *iter)
>>>   {
>>>       xen_hvm_map_mem_type_to_ioreq_server_t op;
>>>       struct domain *d;
>>>       int rc;
>>> +    unsigned long gfn = *iter;
>>>         if ( copy_from_guest(&op, uop, 1) )
>>>           return -EFAULT;
>>> @@ -5559,7 +5568,42 @@ static int hvmop_map_mem_type_to_ioreq_server(
>>>       if ( rc != 0 )
>>>           goto out;
>>>   -    rc = hvm_map_mem_type_to_ioreq_server(d, op.id, op.type, 
>>> op.flags);
>>> +    if ( gfn == 0 || op.flags != 0 )
>>> +        rc = hvm_map_mem_type_to_ioreq_server(d, op.id, op.type, 
>>> op.flags);
>>> +
>>> +    /*
>>> +     * Iterate p2m table when an ioreq server unmaps from 
>>> p2m_ioreq_server,
>>> +     * and reset the remaining p2m_ioreq_server entries back to 
>>> p2m_ram_rw.
>>> +     */
>>> +    if ( op.flags == 0 && rc == 0 )
>>> +    {
>>> +        struct p2m_domain *p2m = p2m_get_hostp2m(d);
>>> +
>>> +        while ( gfn <= p2m->max_mapped_pfn )
>>> +        {
>>> +            p2m_type_t t;
>>> +
>>> +            if ( p2m->ioreq.entry_count == 0 )
>>> +                break;
>> Any reason not to make this part of the while() condition?
>>
>>> +
>>> +            get_gfn_unshare(d, gfn, &t);
>> This will completely unshare all pages in a domain below the last
>> dangling p2m_ioreq_server page.  I don't think unsharing is necessary at
>> all; if it's shared, it certainly won't be of type p2m_ioreq_server.
>>
>> Actually -- it seems like ept_get_entry() really should be calling
>> resolve_misconfig(), the same way that ept_set_entry() does.  In that
>> case, simply going through and reading all the p2m entries would suffice
>> to finish off the type change.
>
> Well, I have doubts about this - resolve_misconfig() is supposed to 
> change p2m table,
> and should probably be protected by p2m lock. But ept_get_entry() may 
> be triggered
> by some routine like get_gfn_query_unlocked().
>
> Besides, although calling resolve_misconfig() will speed up the p2m 
> type sweeping,
> it also introduces more cost each time we peek the p2m table.
>
>>
>> Although really, it seems like having a "p2m_finish_type_change()"
>> function which looked for misconfigured entries and reset them would be
>> a step closer to the right direction, in that it could be re-used in
>> other situations where the type change may not have finished.
>
> Thanks. This sounds a good idea. And I'd like to define the interface 
> of this routine
> like this:
>  void p2m_finish_type_change(struct domain *d,
>                            unsigned long start, unsigned long end,
>                            p2m_type_t ot, p2m_type_t nt)
> So it can be triggered by hvmop_map_mem_type_to_ioreq_server() with 
> hypercall continuation
> method. Do you think this OK?
>
> In fact, I have worked out a new version of this patch according to my 
> understanding, and if you
> have no more disagreement on the above issues, I'll send out the new 
> patch soon after some test
> in XenGT environment. :)
>

Hi George, I just sent out the v6 patch set; you can have a look at
that version directly when you have time. Thanks! :)

Yu

Patch

diff --git a/xen/arch/x86/hvm/hvm.c b/xen/arch/x86/hvm/hvm.c
index 4d98cc6..e57c8b9 100644
--- a/xen/arch/x86/hvm/hvm.c
+++ b/xen/arch/x86/hvm/hvm.c
@@ -5485,6 +5485,7 @@  static int hvmop_set_mem_type(
     {
         unsigned long pfn = a.first_pfn + start_iter;
         p2m_type_t t;
+        struct p2m_domain *p2m = p2m_get_hostp2m(d);
 
         get_gfn_unshare(d, pfn, &t);
         if ( p2m_is_paging(t) )
@@ -5512,6 +5513,12 @@  static int hvmop_set_mem_type(
         if ( rc )
             goto out;
 
+        if ( t == p2m_ram_rw && memtype[a.hvmmem_type] == p2m_ioreq_server )
+            p2m->ioreq.entry_count++;
+
+        if ( t == p2m_ioreq_server && memtype[a.hvmmem_type] == p2m_ram_rw )
+            p2m->ioreq.entry_count--;
+
         /* Check for continuation if it's not the last interation */
         if ( a.nr > ++start_iter && !(start_iter & HVMOP_op_mask) &&
              hypercall_preempt_check() )
@@ -5530,11 +5537,13 @@  static int hvmop_set_mem_type(
 }
 
 static int hvmop_map_mem_type_to_ioreq_server(
-    XEN_GUEST_HANDLE_PARAM(xen_hvm_map_mem_type_to_ioreq_server_t) uop)
+    XEN_GUEST_HANDLE_PARAM(xen_hvm_map_mem_type_to_ioreq_server_t) uop,
+    unsigned long *iter)
 {
     xen_hvm_map_mem_type_to_ioreq_server_t op;
     struct domain *d;
     int rc;
+    unsigned long gfn = *iter;
 
     if ( copy_from_guest(&op, uop, 1) )
         return -EFAULT;
@@ -5559,7 +5568,42 @@  static int hvmop_map_mem_type_to_ioreq_server(
     if ( rc != 0 )
         goto out;
 
-    rc = hvm_map_mem_type_to_ioreq_server(d, op.id, op.type, op.flags);
+    if ( gfn == 0 || op.flags != 0 )
+        rc = hvm_map_mem_type_to_ioreq_server(d, op.id, op.type, op.flags);
+
+    /*
+     * Iterate p2m table when an ioreq server unmaps from p2m_ioreq_server,
+     * and reset the remaining p2m_ioreq_server entries back to p2m_ram_rw.
+     */
+    if ( op.flags == 0 && rc == 0 )
+    {
+        struct p2m_domain *p2m = p2m_get_hostp2m(d);
+
+        while ( gfn <= p2m->max_mapped_pfn )
+        {
+            p2m_type_t t;
+
+            if ( p2m->ioreq.entry_count == 0 )
+                break;
+
+            get_gfn_unshare(d, gfn, &t);
+
+            if ( (t == p2m_ioreq_server) &&
+                 !(p2m_change_type_one(d, gfn, t, p2m_ram_rw)) )
+                p2m->ioreq.entry_count--;
+
+            put_gfn(d, gfn);
+
+            /* Check for continuation if it's not the last iteration. */
+            if ( ++gfn <= p2m->max_mapped_pfn &&
+                 !(gfn & HVMOP_op_mask) &&
+                 hypercall_preempt_check() )
+            {
+                rc = -ERESTART;
+                goto out;
+            }
+        }
+    }
 
  out:
     rcu_unlock_domain(d);
@@ -5578,6 +5622,7 @@  long do_hvm_op(unsigned long op, XEN_GUEST_HANDLE_PARAM(void) arg)
         break;
     case HVMOP_modified_memory:
     case HVMOP_set_mem_type:
+    case HVMOP_map_mem_type_to_ioreq_server:
         mask = HVMOP_op_mask;
         break;
     }
@@ -5607,7 +5652,8 @@  long do_hvm_op(unsigned long op, XEN_GUEST_HANDLE_PARAM(void) arg)
 
     case HVMOP_map_mem_type_to_ioreq_server:
         rc = hvmop_map_mem_type_to_ioreq_server(
-            guest_handle_cast(arg, xen_hvm_map_mem_type_to_ioreq_server_t));
+            guest_handle_cast(arg, xen_hvm_map_mem_type_to_ioreq_server_t),
+            &start_iter);
         break;
 
     case HVMOP_set_ioreq_server_state:
diff --git a/xen/arch/x86/mm/hap/hap.c b/xen/arch/x86/mm/hap/hap.c
index 9c2cd49..0442b1d 100644
--- a/xen/arch/x86/mm/hap/hap.c
+++ b/xen/arch/x86/mm/hap/hap.c
@@ -190,6 +190,15 @@  out:
  */
 static int hap_enable_log_dirty(struct domain *d, bool_t log_global)
 {
+    struct p2m_domain *p2m = p2m_get_hostp2m(d);
+
+    /*
+    * Refuse to turn on global log-dirty mode if
+    * there's outstanding p2m_ioreq_server pages.
+    */
+    if ( log_global && p2m->ioreq.entry_count > 0 )
+        return -EBUSY;
+
     /* turn on PG_log_dirty bit in paging mode */
     paging_lock(d);
     d->arch.paging.mode |= PG_log_dirty;
diff --git a/xen/arch/x86/mm/p2m-ept.c b/xen/arch/x86/mm/p2m-ept.c
index 5f06d40..5d4d804 100644
--- a/xen/arch/x86/mm/p2m-ept.c
+++ b/xen/arch/x86/mm/p2m-ept.c
@@ -545,6 +545,9 @@  static int resolve_misconfig(struct p2m_domain *p2m, unsigned long gfn)
                     e.ipat = ipat;
                     if ( e.recalc && p2m_is_changeable(e.sa_p2mt) )
                     {
+                         if ( e.sa_p2mt == p2m_ioreq_server )
+                             p2m->ioreq.entry_count--;
+
                          e.sa_p2mt = p2m_is_logdirty_range(p2m, gfn + i, gfn + i)
                                      ? p2m_ram_logdirty : p2m_ram_rw;
                          ept_p2m_type_to_flags(p2m, &e, e.sa_p2mt, e.access);
@@ -965,7 +968,8 @@  static mfn_t ept_get_entry(struct p2m_domain *p2m,
     if ( is_epte_valid(ept_entry) )
     {
         if ( (recalc || ept_entry->recalc) &&
-             p2m_is_changeable(ept_entry->sa_p2mt) )
+             p2m_is_changeable(ept_entry->sa_p2mt) &&
+             (ept_entry->sa_p2mt != p2m_ioreq_server) )
             *t = p2m_is_logdirty_range(p2m, gfn, gfn) ? p2m_ram_logdirty
                                                       : p2m_ram_rw;
         else
diff --git a/xen/arch/x86/mm/p2m-pt.c b/xen/arch/x86/mm/p2m-pt.c
index 6209e7b..7bebfd1 100644
--- a/xen/arch/x86/mm/p2m-pt.c
+++ b/xen/arch/x86/mm/p2m-pt.c
@@ -439,11 +439,13 @@  static int do_recalc(struct p2m_domain *p2m, unsigned long gfn)
          needs_recalc(l1, *pent) )
     {
         l1_pgentry_t e = *pent;
+        p2m_type_t p2mt_old;
 
         if ( !valid_recalc(l1, e) )
             P2M_DEBUG("bogus recalc leaf at d%d:%lx:%u\n",
                       p2m->domain->domain_id, gfn, level);
-        if ( p2m_is_changeable(p2m_flags_to_type(l1e_get_flags(e))) )
+        p2mt_old = p2m_flags_to_type(l1e_get_flags(e));
+        if ( p2m_is_changeable(p2mt_old) )
         {
             unsigned long mask = ~0UL << (level * PAGETABLE_ORDER);
             p2m_type_t p2mt = p2m_is_logdirty_range(p2m, gfn & mask, gfn | ~mask)
@@ -463,6 +465,10 @@  static int do_recalc(struct p2m_domain *p2m, unsigned long gfn)
                      mfn &= ~(_PAGE_PSE_PAT >> PAGE_SHIFT);
                 flags |= _PAGE_PSE;
             }
+
+            if ( p2mt_old == p2m_ioreq_server )
+                p2m->ioreq.entry_count--;
+
             e = l1e_from_pfn(mfn, flags);
             p2m_add_iommu_flags(&e, level,
                                 (p2mt == p2m_ram_rw)
@@ -729,7 +735,7 @@  p2m_pt_set_entry(struct p2m_domain *p2m, unsigned long gfn, mfn_t mfn,
 static inline p2m_type_t recalc_type(bool_t recalc, p2m_type_t t,
                                      struct p2m_domain *p2m, unsigned long gfn)
 {
-    if ( !recalc || !p2m_is_changeable(t) )
+    if ( !recalc || !p2m_is_changeable(t) || (t == p2m_ioreq_server) )
         return t;
     return p2m_is_logdirty_range(p2m, gfn, gfn) ? p2m_ram_logdirty
                                                 : p2m_ram_rw;
diff --git a/xen/arch/x86/mm/p2m.c b/xen/arch/x86/mm/p2m.c
index 5567181..e1c3e31 100644
--- a/xen/arch/x86/mm/p2m.c
+++ b/xen/arch/x86/mm/p2m.c
@@ -313,6 +313,9 @@  int p2m_set_ioreq_server(struct domain *d,
 
         p2m->ioreq.server = NULL;
         p2m->ioreq.flags = 0;
+
+        if ( p2m->ioreq.entry_count > 0 )
+            p2m_change_entry_type_global(d, p2m_ioreq_server, p2m_ram_rw);
     }
     else
     {
diff --git a/xen/include/asm-x86/p2m.h b/xen/include/asm-x86/p2m.h
index 0950a91..8e5b4f5 100644
--- a/xen/include/asm-x86/p2m.h
+++ b/xen/include/asm-x86/p2m.h
@@ -120,7 +120,8 @@  typedef unsigned int p2m_query_t;
 
 /* Types that can be subject to bulk transitions. */
 #define P2M_CHANGEABLE_TYPES (p2m_to_mask(p2m_ram_rw) \
-                              | p2m_to_mask(p2m_ram_logdirty) )
+                              | p2m_to_mask(p2m_ram_logdirty) \
+                              | p2m_to_mask(p2m_ioreq_server))
 
 #define P2M_POD_TYPES (p2m_to_mask(p2m_populate_on_demand))
 
@@ -352,6 +353,8 @@  struct p2m_domain {
 
 #define P2M_IOREQ_HANDLE_WRITE_ACCESS XEN_HVMOP_IOREQ_MEM_ACCESS_WRITE
 #define P2M_IOREQ_HANDLE_READ_ACCESS  XEN_HVMOP_IOREQ_MEM_ACCESS_READ
+
+        unsigned int entry_count;
     } ioreq;
 };
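
For context, a device model would presumably drive this as sketched below
once the libxc plumbing from the earlier patches in this series is in
place. The wrapper name xc_hvm_map_mem_type_to_ioreq_server() and the
HVMMEM_ioreq_server constant are assumptions based on the rest of the
series, not on this patch:

    #include <xenctrl.h>

    static int claim_and_release(xc_interface *xch, domid_t dom,
                                 ioservid_t id)
    {
        /* Claim write emulation of p2m_ioreq_server pages (map). */
        int rc = xc_hvm_map_mem_type_to_ioreq_server(
                     xch, dom, id, HVMMEM_ioreq_server,
                     XEN_HVMOP_IOREQ_MEM_ACCESS_WRITE);

        if ( rc )
            return rc;

        /* ... emulate writes to those pages for a while ... */

        /*
         * Unmap (flags == 0).  With this patch the hypervisor also sweeps
         * the p2m and resets outstanding p2m_ioreq_server entries back to
         * p2m_ram_rw, using hypercall continuation if it gets preempted.
         */
        return xc_hvm_map_mem_type_to_ioreq_server(xch, dom, id,
                                                   HVMMEM_ioreq_server, 0);
    }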