
[3/3] Add HVMOP to map guest ram with p2m_ioreq_server to an ioreq server

Message ID 1458130960-30109-1-git-send-email-yu.c.zhang@linux.intel.com (mailing list archive)
State New, archived

Commit Message

Yu Zhang March 16, 2016, 12:22 p.m. UTC
A new HVMOP - HVMOP_map_mem_type_to_ioreq_server - is added to
let one ioreq server claim responsibility for the handling of
guest pages with p2m type p2m_ioreq_server. Users of this
HVMOP can specify, via a flags field, whether the ioreq server
is supposed to handle write accesses, read accesses, or both.
For now only one ioreq server is supported for this p2m type,
so once an ioreq server has claimed ownership, subsequent
calls to HVMOP_map_mem_type_to_ioreq_server will fail.

Note: this HVMOP does not change the p2m type of any guest ram
page until HVMOP_set_mem_type is invoked. So normally the
backend driver first claims ownership of guest ram pages with
the p2m_ioreq_server type, and then sets the memory type to
p2m_ioreq_server for the specified guest ram pages.
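
For illustration, a rough sketch of the intended call sequence from
a device model's point of view, using only the public structure this
patch adds (the hypercall plumbing and the domid/ioservid variables
are assumptions and are not shown here):

    /* Step 1: claim pages of type p2m_ioreq_server, asking for
     * write accesses to be forwarded to this ioreq server. */
    struct xen_hvm_map_mem_type_to_ioreq_server map = {
        .domid = domid,      /* target domain (assumed variable) */
        .id    = ioservid,   /* this server's id (assumed variable) */
        .type  = HVMMEM_ioreq_server,
        .flags = HVMOP_IOREQ_MEM_ACCESS_WRITE,
    };
    /* ... issue HVMOP_map_mem_type_to_ioreq_server with &map ... */

    /* Step 2: only then mark the relevant guest frames as
     * HVMMEM_ioreq_server via HVMOP_set_mem_type; accesses start
     * trapping once the p2m type of the pages is actually changed. */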

Signed-off-by: Paul Durrant <paul.durrant@citrix.com>
Signed-off-by: Yu Zhang <yu.c.zhang@linux.intel.com>
---
 xen/arch/x86/hvm/emulate.c       | 118 +++++++++++++++++++++++++++++++++++++--
 xen/arch/x86/hvm/hvm.c           |  92 +++++++++++++++++++++++++++++-
 xen/arch/x86/mm/hap/nested_hap.c |   2 +-
 xen/arch/x86/mm/p2m-ept.c        |   9 ++-
 xen/arch/x86/mm/p2m-pt.c         |  25 ++++++---
 xen/arch/x86/mm/p2m.c            |  79 ++++++++++++++++++++++++++
 xen/arch/x86/mm/shadow/multi.c   |  26 ++++++++-
 xen/include/asm-x86/p2m.h        |  19 ++++++-
 xen/include/public/hvm/hvm_op.h  |  24 ++++++++
 9 files changed, 374 insertions(+), 20 deletions(-)

Comments

George Dunlap March 22, 2016, 5:26 p.m. UTC | #1
On 16/03/16 12:22, Yu Zhang wrote:
> A new HVMOP - HVMOP_map_mem_type_to_ioreq_server, is added to
> let one ioreq server claim its responsibility for the handling
> of guest pages with p2m type p2m_ioreq_server. Users of this
> HVMOP can specify whether the p2m_ioreq_server is supposed to
> handle write accesses or read ones in a flag. By now, we only
> support one ioreq server for this p2m type, so once an ioreq
> server has claimed its ownership, following calls of the
> HVMOP_map_mem_type_to_ioreq_server will fail.
> 
> Note: this HVMOP does not change the p2m type of any guest ram
> page, until the HVMOP_set_mem_type is triggered. So normally
> the steps would be the backend driver first claims its ownership
> of guest ram pages with p2m_ioreq_server type. At then sets the
> memory type to p2m_ioreq_server for specified guest ram pages.

Yu, thanks for this work.  I think it's heading in the right direction.

A couple of comments:

There's not much documentation in the code about how this is expected to
be used.

For instance, having separate flags seems to imply that you can
independently select either read intercept, write intercept, or both;
but [ept_]p2m_type_to_flags() seems to assume that READ_ACCESS implies
WRITE_ACCESS.  Do you plan to implement them separately in the future?
If not, would it be better to make the interface an enum instead?

At the very least it should be documented that READ_ACCESS implies WRITE_ACCESS.

Calling the function with no flags appears to mean, "Detach the ioreq
server from this type" -- this should also be documented.

Furthermore, you appear to deal with p2m_ioreq_server types when there
is no ioreq server by making the "flags=0" case RW in
[ept_]p2m_type_to_flags.  This should be spelled out somewhere (probably
in the p2m structure definition).
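
For instance, a comment along these lines next to the new ioreq
member of struct p2m_domain (wording hypothetical):

    /*
     * ioreq.server == NULL (i.e. flags == 0): pages of type
     * p2m_ioreq_server are treated as ordinary r/w RAM until an
     * ioreq server claims the type or their type is changed back.
     */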

Finally, you call p2m_memory_type_changed() when changing the ioreq
server; but this appears to be only implemented for ept; meaning that if
you're either running HAP on AMD or running in shadow mode, if the
caller isn't careful (i.e., doesn't remove all p2m_ioreq_server types
before removing itself as a server), they may end up with bogus
permissions left over in the p2m table.

Maybe you should check for the existence of p2m->memory_type_changed and
return -ENOTSUPP or something if it's not present?
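
A minimal sketch of such a check near the top of
p2m_set_ioreq_server() (placement hypothetical, and -EOPNOTSUPP
picked arbitrarily as the "or something"; memory_type_changed is
assumed to be the per-p2m hook that p2m_memory_type_changed()
dispatches to):

    rc = -EOPNOTSUPP;
    if ( !p2m->memory_type_changed )
        goto out;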

More comments inline...

> Signed-off-by: Paul Durrant <paul.durrant@citrix.com>
> Signed-off-by: Yu Zhang <yu.c.zhang@linux.intel.com>

> @@ -168,13 +226,65 @@ static int hvmemul_do_io(
>          break;
>      case X86EMUL_UNHANDLEABLE:
>      {
> -        struct hvm_ioreq_server *s =
> -            hvm_select_ioreq_server(curr->domain, &p);
> +        struct hvm_ioreq_server *s;
> +        p2m_type_t p2mt;
> +
> +        if ( is_mmio )
> +        {
> +            unsigned long gmfn = paddr_to_pfn(addr);
> +
> +            (void) get_gfn_query_unlocked(currd, gmfn, &p2mt);
> +
> +            switch ( p2mt )
> +            {
> +                case p2m_ioreq_server:
> +                {
> +                    unsigned long flags;
> +
> +                    p2m_get_ioreq_server(currd, p2mt, &flags, &s);
> +
> +                    if ( !s )
> +                        break;
> +
> +                    if ( (dir == IOREQ_READ &&
> +                          !(flags & P2M_IOREQ_HANDLE_READ_ACCESS)) ||
> +                         (dir == IOREQ_WRITE &&
> +                          !(flags & P2M_IOREQ_HANDLE_WRITE_ACCESS)) )
> +                        s = NULL;
> +
> +                    break;
> +                }
> +                case p2m_ram_rw:
> +                    s = NULL;
> +                    break;
> +
> +                default:
> +                    s = hvm_select_ioreq_server(currd, &p);
> +                    break;
> +            }
> +        }
> +        else
> +        {
> +            p2mt = p2m_invalid;
> +
> +            s = hvm_select_ioreq_server(currd, &p);
> +        }
>  
>          /* If there is no suitable backing DM, just ignore accesses */
>          if ( !s )
>          {
> -            rc = hvm_process_io_intercept(&null_handler, &p);
> +            switch ( p2mt )
> +            {
> +            case p2m_ioreq_server:
> +            case p2m_ram_rw:
> +                rc = hvm_process_io_intercept(&mem_handler, &p);
> +                break;
> +
> +            default:
> +                rc = hvm_process_io_intercept(&null_handler, &p);
> +                break;
> +            }

Is it actually possible to get here with "is_mmio" true and p2mt ==
p2m_ram_rw?

If so, it would appear that this changes the handling in that case.
Before this patch, on p2m_ram_rw, you'd call hvm_select_ioreq_server(),
which would either give you a server (perhaps the default one), or you'd
be called with null_handler.

After this patch, on p2m_ram_rw, you'll always be called with mem_handler.

If this is an intentional change, you need to explain why in the
changelog (and probably also a comment here).

> @@ -1411,6 +1413,55 @@ static int hvm_unmap_io_range_from_ioreq_server(struct domain *d, ioservid_t id,
>      return rc;
>  }
>  
> +static int hvm_map_mem_type_to_ioreq_server(struct domain *d,
> +                                            ioservid_t id,
> +                                            hvmmem_type_t type,
> +                                            uint32_t flags)
> +{
> +    struct hvm_ioreq_server *s;
> +    p2m_type_t p2mt;
> +    int rc;
> +
> +    switch ( type )
> +    {
> +    case HVMMEM_ioreq_server:
> +        p2mt = p2m_ioreq_server;
> +        break;
> +
> +    default:
> +        return -EINVAL;
> +    }
> +
> +    if ( flags & ~(HVMOP_IOREQ_MEM_ACCESS_READ |
> +                   HVMOP_IOREQ_MEM_ACCESS_WRITE) )
> +        return -EINVAL;
> +
> +    spin_lock(&d->arch.hvm_domain.ioreq_server.lock);
> +
> +    rc = -ENOENT;
> +    list_for_each_entry ( s,
> +                          &d->arch.hvm_domain.ioreq_server.list,
> +                          list_entry )
> +    {
> +        if ( s == d->arch.hvm_domain.default_ioreq_server )
> +            continue;
> +
> +        if ( s->id == id )
> +        {
> +            rc = p2m_set_ioreq_server(d, p2mt, flags, s);
> +            if ( rc )
> +                break;
> +
> +            gdprintk(XENLOG_DEBUG, "%u claimed type p2m_ioreq_server\n",
> +                     s->id);

Don't we want to break out of the loop if p2m_set_ioreq_server()
succeeds as well?  Or do we really want to go check all the other ioreq
servers to see if their ID matches too? :-)

> diff --git a/xen/arch/x86/mm/p2m-ept.c b/xen/arch/x86/mm/p2m-ept.c
> index 380ec25..21e04ce 100644
> --- a/xen/arch/x86/mm/p2m-ept.c
> +++ b/xen/arch/x86/mm/p2m-ept.c
> @@ -132,6 +132,14 @@ static void ept_p2m_type_to_flags(struct p2m_domain *p2m, ept_entry_t *entry,
>              entry->r = entry->w = entry->x = 1;
>              entry->a = entry->d = !!cpu_has_vmx_ept_ad;
>              break;
> +        case p2m_ioreq_server:
> +            entry->r = !(p2m->ioreq.flags & P2M_IOREQ_HANDLE_READ_ACCESS);
> +            entry->w = (entry->r &
> +                        !(p2m->ioreq.flags & P2M_IOREQ_HANDLE_WRITE_ACCESS));
> +            entry->x = entry->r;
> +            entry->a = !!cpu_has_vmx_ept_ad;
> +            entry->d = 0;
> +            break;

In line with the other types, would it make sense for entry->d here to
be entry->w && cpu_has_vmx_ept_ad?
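
i.e., a sketch of that suggestion:

    entry->a = !!cpu_has_vmx_ept_ad;
    entry->d = entry->w && cpu_has_vmx_ept_ad;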


> @@ -289,6 +291,83 @@ void p2m_memory_type_changed(struct domain *d)
>      }
>  }
>  
> +int p2m_set_ioreq_server(struct domain *d, p2m_type_t t,
> +                         unsigned long flags,
> +                         struct hvm_ioreq_server *s)
> +{
> +    struct p2m_domain *p2m = p2m_get_hostp2m(d);
> +    int rc;
> +
> +    spin_lock(&p2m->ioreq.lock);
> +
> +    rc = -EINVAL;
> +    if ( t != p2m_ioreq_server )
> +        goto out;

Since these are all internal interfaces, I'm not sure what the point is
of passing this type in when we only accept one type anyway.  If we're
going to have a type in the interface but only accept one type, we
should just check it at the top level and assume p2m_ioreq_server all
the way through (until such time as we accept a second type).

> +
> +    rc = -EBUSY;
> +    if ( ( p2m->ioreq.server != NULL ) && (flags != 0) )
> +        goto out;
> +
> +    if ( flags == 0 )
> +    {
> +        p2m->ioreq.server = NULL;
> +        p2m->ioreq.flags = 0;
> +    }
> +    else
> +    {
> +        p2m->ioreq.server = s;
> +        p2m->ioreq.flags = flags;
> +    }

I think it would be a bit more robust to

1) Require callers to always specify (their own) server ID when making
this call, even if they're un-claiming the p2m type

2) When un-claiming, check to make sure that the server id of the caller
matches the server id that's being detached.
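
A sketch of what the un-claim path in p2m_set_ioreq_server() might
then look like (illustrative only):

    if ( flags == 0 )
    {
        /* Only the server currently owning the type may detach it. */
        rc = -EINVAL;
        if ( p2m->ioreq.server != s )
            goto out;

        p2m->ioreq.server = NULL;
        p2m->ioreq.flags = 0;
    }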

> +
> +    p2m_memory_type_changed(d);
> +
> +    rc = 0;
> +
> +out:
> +    spin_unlock(&p2m->ioreq.lock);
> +
> +    return rc;
> +}
> +
> +void p2m_get_ioreq_server(struct domain *d, p2m_type_t t,
> +                          unsigned long *flags,
> +                          struct hvm_ioreq_server **s)
> +{
> +    struct p2m_domain *p2m = p2m_get_hostp2m(d);
> +
> +    if ( t != p2m_ioreq_server )
> +    {
> +        *s = NULL;
> +        *flags = 0;
> +        return;
> +    }
> +
> +    spin_lock(&p2m->ioreq.lock);
> +
> +    *s = p2m->ioreq.server;
> +    *flags = p2m->ioreq.flags;
> +
> +    spin_unlock(&p2m->ioreq.lock);
> +}
> +
> +void p2m_destroy_ioreq_server(struct domain *d,
> +                              struct hvm_ioreq_server *s)
> +{
> +    struct p2m_domain *p2m = p2m_get_hostp2m(d);
> +
> +    spin_lock(&p2m->ioreq.lock);
> +
> +    if ( p2m->ioreq.server == s )
> +    {
> +        p2m->ioreq.server = NULL;
> +        p2m->ioreq.flags = 0;
> +    }
> +
> +    p2m_memory_type_changed(d);

Do we really want to call this unconditionally, even if the server being
destroyed here isn't the server associated with p2m_ioreq_server?  It
shouldn't have a functional impact, but it will slow things down as the
ept tables are unnecessarily rebuilt.

> diff --git a/xen/include/public/hvm/hvm_op.h b/xen/include/public/hvm/hvm_op.h
> index a1eae52..e7fc75d 100644
> --- a/xen/include/public/hvm/hvm_op.h
> +++ b/xen/include/public/hvm/hvm_op.h
> @@ -489,6 +489,30 @@ struct xen_hvm_altp2m_op {
>  typedef struct xen_hvm_altp2m_op xen_hvm_altp2m_op_t;
>  DEFINE_XEN_GUEST_HANDLE(xen_hvm_altp2m_op_t);
>  
> +#if defined(__XEN__) || defined(__XEN_TOOLS__)
> +
> +#define HVMOP_map_mem_type_to_ioreq_server 26
> +struct xen_hvm_map_mem_type_to_ioreq_server {
> +    domid_t domid;      /* IN - domain to be serviced */
> +    ioservid_t id;      /* IN - server id */
> +    hvmmem_type_t type; /* IN - memory type */

What is this 'type' for?  Do you expect to map other types of memory
this way in the future?

 -George
Paul Durrant March 22, 2016, 5:51 p.m. UTC | #2
> -----Original Message-----
> From: George Dunlap [mailto:george.dunlap@citrix.com]
> Sent: 22 March 2016 17:27
> To: Yu Zhang; xen-devel@lists.xen.org
> Cc: Keir (Xen.org); jbeulich@suse.com; Andrew Cooper; George Dunlap;
> jun.nakajima@intel.com; Kevin Tian; Tim (Xen.org); Paul Durrant;
> zhiyuan.lv@intel.com
> Subject: Re: [PATCH 3/3] Add HVMOP to map guest ram with
> p2m_ioreq_server to an ioreq server
> 
> On 16/03/16 12:22, Yu Zhang wrote:
> > A new HVMOP - HVMOP_map_mem_type_to_ioreq_server, is added to
> > let one ioreq server claim its responsibility for the handling
> > of guest pages with p2m type p2m_ioreq_server. Users of this
> > HVMOP can specify whether the p2m_ioreq_server is supposed to
> > handle write accesses or read ones in a flag. By now, we only
> > support one ioreq server for this p2m type, so once an ioreq
> > server has claimed its ownership, following calls of the
> > HVMOP_map_mem_type_to_ioreq_server will fail.
> >
> > Note: this HVMOP does not change the p2m type of any guest ram
> > page, until the HVMOP_set_mem_type is triggered. So normally
> > the steps would be the backend driver first claims its ownership
> > of guest ram pages with p2m_ioreq_server type. At then sets the
> > memory type to p2m_ioreq_server for specified guest ram pages.
> 
> Yu, thanks for this work.  I think it's heading in the right direction.
> 
> A couple of comments:
> 
> There's not much documentation in the code about how this is expected to
> be used.
> 
> For instance, having separate flags seems to imply that you can
> independently select either read intercept, write intercept, or both;
> but [ept_]p2m_type_to_flags() seems to assume that READ_ACCESS implies
> WRITE_ACCESS.  Do you plan to implement them separately in the future?
> If not, would it be better to make the interface an enum instead?
> 
> At very least it should be documented that READ_ACCESS implies
> WRITE_ACCESS.

That's not true. If WRITE_ACCESS has not been requested then writes are handled directly in Xen rather than being forwarded to the ioreq server. If h/w were to allow pages to be marked write-only then we wouldn't need to do that.
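
For reference, the behaviour the patch currently implements (EPT
case, derived from ept_p2m_type_to_flags() and hvmemul_do_io())
works out roughly as:

    flags                  EPT r/w   reads                writes
    0 (none/detached)      r=1 w=1   plain RAM            plain RAM
    WRITE_ACCESS only      r=1 w=0   plain RAM (no trap)  ioreq server
    READ_ACCESS only       r=0 w=0   ioreq server         handled in Xen
    READ | WRITE_ACCESS    r=0 w=0   ioreq server         ioreq server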

> 
> Calling the function with no flags appears to mean, "Detach the ioreq
> server from this type" -- this should also be documented.
> 

That was documented in the design, but it's true that it does need to be pulled into comments.

> Furthermore, you appear to deal with p2m_ioreq_server types when there
> is no ioreq server by making the "flags=0" case RW in
> [ept_]p2m_type_to_flags.  This should be spelled out somewhere (probably
> in the p2m structure definition).
> 
> Finally, you call p2m_memory_type_changed() when changing the ioreq
> server; but this appears to be only implemented for ept; meaning that if
> you're either running HAP on AMD or running in shadow mode, if the
> caller isn't careful (i.e., doesn't remove all p2m_ioreq_server types
> before removing itself as a server), they may end up with bogus
> permissions left over in the p2m table.
> 
> Maybe you should check for the existence of p2m->memory_type_changed
> and
> return -ENOTSUPP or something if it's not present?
> 
> More comments inline...
> 
> > Signed-off-by: Paul Durrant <paul.durrant@citrix.com>
> > Signed-off-by: Yu Zhang <yu.c.zhang@linux.intel.com>
> 
> > @@ -168,13 +226,65 @@ static int hvmemul_do_io(
> >          break;
> >      case X86EMUL_UNHANDLEABLE:
> >      {
> > -        struct hvm_ioreq_server *s =
> > -            hvm_select_ioreq_server(curr->domain, &p);
> > +        struct hvm_ioreq_server *s;
> > +        p2m_type_t p2mt;
> > +
> > +        if ( is_mmio )
> > +        {
> > +            unsigned long gmfn = paddr_to_pfn(addr);
> > +
> > +            (void) get_gfn_query_unlocked(currd, gmfn, &p2mt);
> > +
> > +            switch ( p2mt )
> > +            {
> > +                case p2m_ioreq_server:
> > +                {
> > +                    unsigned long flags;
> > +
> > +                    p2m_get_ioreq_server(currd, p2mt, &flags, &s);
> > +
> > +                    if ( !s )
> > +                        break;
> > +
> > +                    if ( (dir == IOREQ_READ &&
> > +                          !(flags & P2M_IOREQ_HANDLE_READ_ACCESS)) ||
> > +                         (dir == IOREQ_WRITE &&
> > +                          !(flags & P2M_IOREQ_HANDLE_WRITE_ACCESS)) )
> > +                        s = NULL;
> > +
> > +                    break;
> > +                }
> > +                case p2m_ram_rw:
> > +                    s = NULL;
> > +                    break;
> > +
> > +                default:
> > +                    s = hvm_select_ioreq_server(currd, &p);
> > +                    break;
> > +            }
> > +        }
> > +        else
> > +        {
> > +            p2mt = p2m_invalid;
> > +
> > +            s = hvm_select_ioreq_server(currd, &p);
> > +        }
> >
> >          /* If there is no suitable backing DM, just ignore accesses */
> >          if ( !s )
> >          {
> > -            rc = hvm_process_io_intercept(&null_handler, &p);
> > +            switch ( p2mt )
> > +            {
> > +            case p2m_ioreq_server:
> > +            case p2m_ram_rw:
> > +                rc = hvm_process_io_intercept(&mem_handler, &p);
> > +                break;
> > +
> > +            default:
> > +                rc = hvm_process_io_intercept(&null_handler, &p);
> > +                break;
> > +            }
> 
> Is it actually possible to get here with "is_mmio" true and p2mt ==
> p2m_ram_rw?

I think that's possible if the type change races with an access.

> 
> If so, it would appear that this changes the handling in that case.
> Before this patch, on p2m_ram_rw, you'd call hvm_select_ioreq_server(),
> which would either give you a server (perhaps the default one), or you'd
> be called with null_handler.
> 
> After this patch, on p2m_ram_rw, you'll always be called with mem_handler.
> 
> If this is an intentional change, you need to explan why in the
> changelog (and probably also a comment here).
> 

That is an intentional change.

> > @@ -1411,6 +1413,55 @@ static int
> hvm_unmap_io_range_from_ioreq_server(struct domain *d, ioservid_t id,
> >      return rc;
> >  }
> >
> > +static int hvm_map_mem_type_to_ioreq_server(struct domain *d,
> > +                                            ioservid_t id,
> > +                                            hvmmem_type_t type,
> > +                                            uint32_t flags)
> > +{
> > +    struct hvm_ioreq_server *s;
> > +    p2m_type_t p2mt;
> > +    int rc;
> > +
> > +    switch ( type )
> > +    {
> > +    case HVMMEM_ioreq_server:
> > +        p2mt = p2m_ioreq_server;
> > +        break;
> > +
> > +    default:
> > +        return -EINVAL;
> > +    }
> > +
> > +    if ( flags & ~(HVMOP_IOREQ_MEM_ACCESS_READ |
> > +                   HVMOP_IOREQ_MEM_ACCESS_WRITE) )
> > +        return -EINVAL;
> > +
> > +    spin_lock(&d->arch.hvm_domain.ioreq_server.lock);
> > +
> > +    rc = -ENOENT;
> > +    list_for_each_entry ( s,
> > +                          &d->arch.hvm_domain.ioreq_server.list,
> > +                          list_entry )
> > +    {
> > +        if ( s == d->arch.hvm_domain.default_ioreq_server )
> > +            continue;
> > +
> > +        if ( s->id == id )
> > +        {
> > +            rc = p2m_set_ioreq_server(d, p2mt, flags, s);
> > +            if ( rc )
> > +                break;
> > +
> > +            gdprintk(XENLOG_DEBUG, "%u claimed type p2m_ioreq_server\n",
> > +                     s->id);
> 
> Don't we want to break out of the loop if p2m_set_ioreq_server()
> succeeds as well?  Or do we really want to go check all the other ioreq
> servers to see if their ID matches too? :-)

No, the break should be unconditional.
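
i.e., a sketch of the loop body with the break made unconditional:

    if ( s->id == id )
    {
        rc = p2m_set_ioreq_server(d, p2mt, flags, s);
        if ( rc == 0 )
            gdprintk(XENLOG_DEBUG, "%u claimed type p2m_ioreq_server\n",
                     s->id);
        break;
    }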

> 
> > diff --git a/xen/arch/x86/mm/p2m-ept.c b/xen/arch/x86/mm/p2m-ept.c
> > index 380ec25..21e04ce 100644
> > --- a/xen/arch/x86/mm/p2m-ept.c
> > +++ b/xen/arch/x86/mm/p2m-ept.c
> > @@ -132,6 +132,14 @@ static void ept_p2m_type_to_flags(struct
> p2m_domain *p2m, ept_entry_t *entry,
> >              entry->r = entry->w = entry->x = 1;
> >              entry->a = entry->d = !!cpu_has_vmx_ept_ad;
> >              break;
> > +        case p2m_ioreq_server:
> > +            entry->r = !(p2m->ioreq.flags &
> P2M_IOREQ_HANDLE_READ_ACCESS);
> > +            entry->w = (entry->r &
> > +                        !(p2m->ioreq.flags & P2M_IOREQ_HANDLE_WRITE_ACCESS));
> > +            entry->x = entry->r;
> > +            entry->a = !!cpu_has_vmx_ept_ad;
> > +            entry->d = 0;
> > +            break;
> 
> In line with the other types, would it make sense for entry->d here to
> be entry->w && cpu_has_vmx_ept_ad?
> 
> 
> > @@ -289,6 +291,83 @@ void p2m_memory_type_changed(struct domain
> *d)
> >      }
> >  }
> >
> > +int p2m_set_ioreq_server(struct domain *d, p2m_type_t t,
> > +                         unsigned long flags,
> > +                         struct hvm_ioreq_server *s)
> > +{
> > +    struct p2m_domain *p2m = p2m_get_hostp2m(d);
> > +    int rc;
> > +
> > +    spin_lock(&p2m->ioreq.lock);
> > +
> > +    rc = -EINVAL;
> > +    if ( t != p2m_ioreq_server )
> > +        goto out;
> 
> Since these are all internal interfaces, I'm not sure what the point is
> of passing this type in when we only accept one type anyway.  If we're
> going to have a type in the interface but only accept one type, we
> should just check it at the top level and assume p2m_ioreq_server all
> the way through (until such time as we accept a second type).
> 

OK. It was there for future expansion, but that could be added later.

> > +
> > +    rc = -EBUSY;
> > +    if ( ( p2m->ioreq.server != NULL ) && (flags != 0) )
> > +        goto out;
> > +
> > +    if ( flags == 0 )
> > +    {
> > +        p2m->ioreq.server = NULL;
> > +        p2m->ioreq.flags = 0;
> > +    }
> > +    else
> > +    {
> > +        p2m->ioreq.server = s;
> > +        p2m->ioreq.flags = flags;
> > +    }
> 
> I think it would be a bit more robust to
> 
> 1) Require callers to always specify (their own) server ID when making
> this call, even if they're un-claiming the p2m type
> 
> 2) When un-claiming, check to make sure that the server id of the caller
> matches the server id that's being detached.
> 

Ok. Fair enough.

> > +
> > +    p2m_memory_type_changed(d);
> > +
> > +    rc = 0;
> > +
> > +out:
> > +    spin_unlock(&p2m->ioreq.lock);
> > +
> > +    return rc;
> > +}
> > +
> > +void p2m_get_ioreq_server(struct domain *d, p2m_type_t t,
> > +                          unsigned long *flags,
> > +                          struct hvm_ioreq_server **s)
> > +{
> > +    struct p2m_domain *p2m = p2m_get_hostp2m(d);
> > +
> > +    if ( t != p2m_ioreq_server )
> > +    {
> > +        *s = NULL;
> > +        *flags = 0;
> > +        return;
> > +    }
> > +
> > +    spin_lock(&p2m->ioreq.lock);
> > +
> > +    *s = p2m->ioreq.server;
> > +    *flags = p2m->ioreq.flags;
> > +
> > +    spin_unlock(&p2m->ioreq.lock);
> > +}
> > +
> > +void p2m_destroy_ioreq_server(struct domain *d,
> > +                              struct hvm_ioreq_server *s)
> > +{
> > +    struct p2m_domain *p2m = p2m_get_hostp2m(d);
> > +
> > +    spin_lock(&p2m->ioreq.lock);
> > +
> > +    if ( p2m->ioreq.server == s )
> > +    {
> > +        p2m->ioreq.server = NULL;
> > +        p2m->ioreq.flags = 0;
> > +    }
> > +
> > +    p2m_memory_type_changed(d);
> 
> Do we really want to call this unconditionally, even if the server being
> destroyed here isn't the server associated with p2m_ioreq_server?  It
> shouldn't have a functional impact, but it will slow things down as the
> ept tables are unnecessarily rebuilt.
> 

True. That should be conditional.
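
A sketch of p2m_destroy_ioreq_server() with the flush made
conditional (assuming the memory types only need rebuilding when
this server actually owned the type):

    spin_lock(&p2m->ioreq.lock);

    if ( p2m->ioreq.server == s )
    {
        p2m->ioreq.server = NULL;
        p2m->ioreq.flags = 0;

        p2m_memory_type_changed(d);
    }

    spin_unlock(&p2m->ioreq.lock);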

> > diff --git a/xen/include/public/hvm/hvm_op.h
> b/xen/include/public/hvm/hvm_op.h
> > index a1eae52..e7fc75d 100644
> > --- a/xen/include/public/hvm/hvm_op.h
> > +++ b/xen/include/public/hvm/hvm_op.h
> > @@ -489,6 +489,30 @@ struct xen_hvm_altp2m_op {
> >  typedef struct xen_hvm_altp2m_op xen_hvm_altp2m_op_t;
> >  DEFINE_XEN_GUEST_HANDLE(xen_hvm_altp2m_op_t);
> >
> > +#if defined(__XEN__) || defined(__XEN_TOOLS__)
> > +
> > +#define HVMOP_map_mem_type_to_ioreq_server 26
> > +struct xen_hvm_map_mem_type_to_ioreq_server {
> > +    domid_t domid;      /* IN - domain to be serviced */
> > +    ioservid_t id;      /* IN - server id */
> > +    hvmmem_type_t type; /* IN - memory type */
> 
> What is this 'type' for?  Do you expect to map other types of memory
> this way in the future?
> 

As stated in the design, yes.

  Paul

>  -George
Jan Beulich March 23, 2016, 9:22 a.m. UTC | #3
>>> On 22.03.16 at 18:51, <Paul.Durrant@citrix.com> wrote:
>> From: George Dunlap [mailto:george.dunlap@citrix.com]
>> Sent: 22 March 2016 17:27
>> There's not much documentation in the code about how this is expected to
>> be used.
>> 
>> For instance, having separate flags seems to imply that you can
>> independently select either read intercept, write intercept, or both;
>> but [ept_]p2m_type_to_flags() seems to assume that READ_ACCESS implies
>> WRITE_ACCESS.  Do you plan to implement them separately in the future?
>> If not, would it be better to make the interface an enum instead?
>> 
>> At very least it should be documented that READ_ACCESS implies
>> WRITE_ACCESS.
> 
> That's not true. If WRITE_ACCESS has not been requested then writes are 
> handled directly in Xen rather than being forwarded to the ioreq server. If 
> h/w were to allow pages to be marked write-only then we wouldn't need to do 
> that.

At least on EPT iirc you can have write-only pages, due to there
being both an R and a W bit.

>> > +    p2m_memory_type_changed(d);
>> > +
>> > +    rc = 0;
>> > +
>> > +out:

Btw: label placement.

Jan
Yu Zhang March 23, 2016, 9:44 a.m. UTC | #4
Thanks, Jan.

On 3/23/2016 5:22 PM, Jan Beulich wrote:
>>>> On 22.03.16 at 18:51, <Paul.Durrant@citrix.com> wrote:
>>> From: George Dunlap [mailto:george.dunlap@citrix.com]
>>> Sent: 22 March 2016 17:27
>>> There's not much documentation in the code about how this is expected to
>>> be used.
>>>
>>> For instance, having separate flags seems to imply that you can
>>> independently select either read intercept, write intercept, or both;
>>> but [ept_]p2m_type_to_flags() seems to assume that READ_ACCESS implies
>>> WRITE_ACCESS.  Do you plan to implement them separately in the future?
>>> If not, would it be better to make the interface an enum instead?
>>>
>>> At very least it should be documented that READ_ACCESS implies
>>> WRITE_ACCESS.
>>
>> That's not true. If WRITE_ACCESS has not been requested then writes are
>> handled directly in Xen rather than being forwarded to the ioreq server. If
>> h/w were to allow pages to be marked write-only then we wouldn't need to do
>> that.
>
> At least on EPT iirc you can have write-only pages, due to there
> being both an R and a W bit.
>

Sorry for not having clearly explained the flags parameter of this new
HVMOP. Setting P2M_IOREQ_HANDLE_READ_ACCESS or
P2M_IOREQ_HANDLE_WRITE_ACCESS in flags only says that read/write
operations are supposed to be handled by one specific ioreq server,
which means such accesses need to trap to the hypervisor; but the
logic does not hold the other way round, e.g. removing the write
access right does not necessarily mean a write operation will be
forwarded to an ioreq server.

Besides, iiuc, setting the W bit to 1 with the R bit cleared in an EPT
PTE does not really give write access to the guest, and will trigger
an EPT misconfiguration.

>>>> +    p2m_memory_type_changed(d);
>>>> +
>>>> +    rc = 0;
>>>> +
>>>> +out:
>
> Btw: label placement.
>
> Jan



B.R.
Yu
Jan Beulich March 23, 2016, 10:10 a.m. UTC | #5
>>> On 23.03.16 at 10:44, <yu.c.zhang@linux.intel.com> wrote:
> Besides, iiuc, setting the W bit to 1 with R bit cleared in EPT pte is
> not really giving the write access right to the guest, and will trigger
> ept misconfiguration.

Oh, indeed. Quite odd.

Jan
Andrew Cooper March 23, 2016, 10:37 a.m. UTC | #6
On 23/03/16 09:22, Jan Beulich wrote:
>>>> On 22.03.16 at 18:51, <Paul.Durrant@citrix.com> wrote:
>>> From: George Dunlap [mailto:george.dunlap@citrix.com]
>>> Sent: 22 March 2016 17:27
>>> There's not much documentation in the code about how this is expected to
>>> be used.
>>>
>>> For instance, having separate flags seems to imply that you can
>>> independently select either read intercept, write intercept, or both;
>>> but [ept_]p2m_type_to_flags() seems to assume that READ_ACCESS implies
>>> WRITE_ACCESS.  Do you plan to implement them separately in the future?
>>> If not, would it be better to make the interface an enum instead?
>>>
>>> At very least it should be documented that READ_ACCESS implies
>>> WRITE_ACCESS.
>> That's not true. If WRITE_ACCESS has not been requested then writes are 
>> handled directly in Xen rather than being forwarded to the ioreq server. If 
>> h/w were to allow pages to be marked write-only then we wouldn't need to do 
>> that.
> At least on EPT iirc you can have write-only pages, due to there
> being both an R and a W bit.

W and WX pages in EPT cause a MISCONFIG.  All other combinations however
are usable.

~Andrew
George Dunlap March 23, 2016, 11:43 a.m. UTC | #7
On 22/03/16 17:51, Paul Durrant wrote:
>> -----Original Message-----
>> From: George Dunlap [mailto:george.dunlap@citrix.com]
>> Sent: 22 March 2016 17:27
>> To: Yu Zhang; xen-devel@lists.xen.org
>> Cc: Keir (Xen.org); jbeulich@suse.com; Andrew Cooper; George Dunlap;
>> jun.nakajima@intel.com; Kevin Tian; Tim (Xen.org); Paul Durrant;
>> zhiyuan.lv@intel.com
>> Subject: Re: [PATCH 3/3] Add HVMOP to map guest ram with
>> p2m_ioreq_server to an ioreq server
>>
>> On 16/03/16 12:22, Yu Zhang wrote:
>>> A new HVMOP - HVMOP_map_mem_type_to_ioreq_server, is added to
>>> let one ioreq server claim its responsibility for the handling
>>> of guest pages with p2m type p2m_ioreq_server. Users of this
>>> HVMOP can specify whether the p2m_ioreq_server is supposed to
>>> handle write accesses or read ones in a flag. By now, we only
>>> support one ioreq server for this p2m type, so once an ioreq
>>> server has claimed its ownership, following calls of the
>>> HVMOP_map_mem_type_to_ioreq_server will fail.
>>>
>>> Note: this HVMOP does not change the p2m type of any guest ram
>>> page, until the HVMOP_set_mem_type is triggered. So normally
>>> the steps would be the backend driver first claims its ownership
>>> of guest ram pages with p2m_ioreq_server type. At then sets the
>>> memory type to p2m_ioreq_server for specified guest ram pages.
>>
>> Yu, thanks for this work.  I think it's heading in the right direction.
>>
>> A couple of comments:
>>
>> There's not much documentation in the code about how this is expected to
>> be used.
>>
>> For instance, having separate flags seems to imply that you can
>> independently select either read intercept, write intercept, or both;
>> but [ept_]p2m_type_to_flags() seems to assume that READ_ACCESS implies
>> WRITE_ACCESS.  Do you plan to implement them separately in the future?
>> If not, would it be better to make the interface an enum instead?
>>
>> At very least it should be documented that READ_ACCESS implies
>> WRITE_ACCESS.
> 
> That's not true. If WRITE_ACCESS has not been requested then writes are handled directly in Xen rather than being forwarded to the ioreq server. If h/w were to allow pages to be marked write-only then we wouldn't need to do that.

Right -- well this at least needs to be in the commit message, and it
would be good if it were in a comment somewhere as well.

It's probably also useful to note in the interface that if you're only
intercepting writes, reads will happen at full speed; but that if you're
only intercepting reads, writes must be emulated (and so there will be
a significant performance impact).

>>> @@ -168,13 +226,65 @@ static int hvmemul_do_io(
>>>          break;
>>>      case X86EMUL_UNHANDLEABLE:
>>>      {
>>> -        struct hvm_ioreq_server *s =
>>> -            hvm_select_ioreq_server(curr->domain, &p);
>>> +        struct hvm_ioreq_server *s;
>>> +        p2m_type_t p2mt;
>>> +
>>> +        if ( is_mmio )
>>> +        {
>>> +            unsigned long gmfn = paddr_to_pfn(addr);
>>> +
>>> +            (void) get_gfn_query_unlocked(currd, gmfn, &p2mt);
>>> +
>>> +            switch ( p2mt )
>>> +            {
>>> +                case p2m_ioreq_server:
>>> +                {
>>> +                    unsigned long flags;
>>> +
>>> +                    p2m_get_ioreq_server(currd, p2mt, &flags, &s);
>>> +
>>> +                    if ( !s )
>>> +                        break;
>>> +
>>> +                    if ( (dir == IOREQ_READ &&
>>> +                          !(flags & P2M_IOREQ_HANDLE_READ_ACCESS)) ||
>>> +                         (dir == IOREQ_WRITE &&
>>> +                          !(flags & P2M_IOREQ_HANDLE_WRITE_ACCESS)) )
>>> +                        s = NULL;
>>> +
>>> +                    break;
>>> +                }
>>> +                case p2m_ram_rw:
>>> +                    s = NULL;
>>> +                    break;
>>> +
>>> +                default:
>>> +                    s = hvm_select_ioreq_server(currd, &p);
>>> +                    break;
>>> +            }
>>> +        }
>>> +        else
>>> +        {
>>> +            p2mt = p2m_invalid;
>>> +
>>> +            s = hvm_select_ioreq_server(currd, &p);
>>> +        }
>>>
>>>          /* If there is no suitable backing DM, just ignore accesses */
>>>          if ( !s )
>>>          {
>>> -            rc = hvm_process_io_intercept(&null_handler, &p);
>>> +            switch ( p2mt )
>>> +            {
>>> +            case p2m_ioreq_server:
>>> +            case p2m_ram_rw:
>>> +                rc = hvm_process_io_intercept(&mem_handler, &p);
>>> +                break;
>>> +
>>> +            default:
>>> +                rc = hvm_process_io_intercept(&null_handler, &p);
>>> +                break;
>>> +            }
>>
>> Is it actually possible to get here with "is_mmio" true and p2mt ==
>> p2m_ram_rw?
> 
> I think that's possible if the type change races with an access.

OK -- a brief comment about that would be helpful.
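
Something along these lines above the second switch in
hvmemul_do_io(), perhaps (wording is only a suggestion):

    /*
     * p2mt can read back as p2m_ram_rw here if the page's type was
     * changed from p2m_ioreq_server while this access was in flight;
     * in that case complete the access against RAM via mem_handler
     * rather than re-selecting an ioreq server.
     */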

Thanks,
 -George

Patch

diff --git a/xen/arch/x86/hvm/emulate.c b/xen/arch/x86/hvm/emulate.c
index ddc8007..ace23ba 100644
--- a/xen/arch/x86/hvm/emulate.c
+++ b/xen/arch/x86/hvm/emulate.c
@@ -94,11 +94,69 @@  static const struct hvm_io_handler null_handler = {
     .ops = &null_ops
 };
 
+static int mem_read(const struct hvm_io_handler *io_handler,
+                    uint64_t addr,
+                    uint32_t size,
+                    uint64_t *data)
+{
+    struct domain *currd = current->domain;
+    unsigned long gmfn = paddr_to_pfn(addr);
+    unsigned long offset = addr & ~PAGE_MASK;
+    struct page_info *page = get_page_from_gfn(currd, gmfn, NULL, P2M_UNSHARE);
+    uint8_t *p;
+
+    if ( !page )
+        return X86EMUL_UNHANDLEABLE;
+
+    p = __map_domain_page(page);
+    p += offset;
+    memcpy(data, p, size);
+
+    unmap_domain_page(p);
+    put_page(page);
+
+    return X86EMUL_OKAY;
+}
+
+static int mem_write(const struct hvm_io_handler *handler,
+                     uint64_t addr,
+                     uint32_t size,
+                     uint64_t data)
+{
+    struct domain *currd = current->domain;
+    unsigned long gmfn = paddr_to_pfn(addr);
+    unsigned long offset = addr & ~PAGE_MASK;
+    struct page_info *page = get_page_from_gfn(currd, gmfn, NULL, P2M_UNSHARE);
+    uint8_t *p;
+
+    if ( !page )
+        return X86EMUL_UNHANDLEABLE;
+
+    p = __map_domain_page(page);
+    p += offset;
+    memcpy(p, &data, size);
+
+    unmap_domain_page(p);
+    put_page(page);
+
+    return X86EMUL_OKAY;
+}
+
+static const struct hvm_io_ops mem_ops = {
+    .read = mem_read,
+    .write = mem_write
+};
+
+static const struct hvm_io_handler mem_handler = {
+    .ops = &mem_ops
+};
+
 static int hvmemul_do_io(
     bool_t is_mmio, paddr_t addr, unsigned long reps, unsigned int size,
     uint8_t dir, bool_t df, bool_t data_is_addr, uintptr_t data)
 {
     struct vcpu *curr = current;
+    struct domain *currd = curr->domain;
     struct hvm_vcpu_io *vio = &curr->arch.hvm_vcpu.hvm_io;
     ioreq_t p = {
         .type = is_mmio ? IOREQ_TYPE_COPY : IOREQ_TYPE_PIO,
@@ -140,7 +198,7 @@  static int hvmemul_do_io(
              (p.dir != dir) ||
              (p.df != df) ||
              (p.data_is_ptr != data_is_addr) )
-            domain_crash(curr->domain);
+            domain_crash(currd);
 
         if ( data_is_addr )
             return X86EMUL_UNHANDLEABLE;
@@ -168,13 +226,65 @@  static int hvmemul_do_io(
         break;
     case X86EMUL_UNHANDLEABLE:
     {
-        struct hvm_ioreq_server *s =
-            hvm_select_ioreq_server(curr->domain, &p);
+        struct hvm_ioreq_server *s;
+        p2m_type_t p2mt;
+
+        if ( is_mmio )
+        {
+            unsigned long gmfn = paddr_to_pfn(addr);
+
+            (void) get_gfn_query_unlocked(currd, gmfn, &p2mt);
+
+            switch ( p2mt )
+            {
+                case p2m_ioreq_server:
+                {
+                    unsigned long flags;
+
+                    p2m_get_ioreq_server(currd, p2mt, &flags, &s);
+
+                    if ( !s )
+                        break;
+
+                    if ( (dir == IOREQ_READ &&
+                          !(flags & P2M_IOREQ_HANDLE_READ_ACCESS)) ||
+                         (dir == IOREQ_WRITE &&
+                          !(flags & P2M_IOREQ_HANDLE_WRITE_ACCESS)) )
+                        s = NULL;
+
+                    break;
+                }
+                case p2m_ram_rw:
+                    s = NULL;
+                    break;
+
+                default:
+                    s = hvm_select_ioreq_server(currd, &p);
+                    break;
+            }
+        }
+        else
+        {
+            p2mt = p2m_invalid;
+
+            s = hvm_select_ioreq_server(currd, &p);
+        }
 
         /* If there is no suitable backing DM, just ignore accesses */
         if ( !s )
         {
-            rc = hvm_process_io_intercept(&null_handler, &p);
+            switch ( p2mt )
+            {
+            case p2m_ioreq_server:
+            case p2m_ram_rw:
+                rc = hvm_process_io_intercept(&mem_handler, &p);
+                break;
+
+            default:
+                rc = hvm_process_io_intercept(&null_handler, &p);
+                break;
+            }
+
             vio->io_req.state = STATE_IOREQ_NONE;
         }
         else
diff --git a/xen/arch/x86/hvm/hvm.c b/xen/arch/x86/hvm/hvm.c
index 07eee4a..29ddbc4 100644
--- a/xen/arch/x86/hvm/hvm.c
+++ b/xen/arch/x86/hvm/hvm.c
@@ -1252,6 +1252,8 @@  static int hvm_destroy_ioreq_server(struct domain *d, ioservid_t id)
 
         domain_pause(d);
 
+        p2m_destroy_ioreq_server(d, s);
+
         hvm_ioreq_server_disable(s, 0);
 
         list_del(&s->list_entry);
@@ -1411,6 +1413,55 @@  static int hvm_unmap_io_range_from_ioreq_server(struct domain *d, ioservid_t id,
     return rc;
 }
 
+static int hvm_map_mem_type_to_ioreq_server(struct domain *d,
+                                            ioservid_t id,
+                                            hvmmem_type_t type,
+                                            uint32_t flags)
+{
+    struct hvm_ioreq_server *s;
+    p2m_type_t p2mt;
+    int rc;
+
+    switch ( type )
+    {
+    case HVMMEM_ioreq_server:
+        p2mt = p2m_ioreq_server;
+        break;
+
+    default:
+        return -EINVAL;
+    }
+
+    if ( flags & ~(HVMOP_IOREQ_MEM_ACCESS_READ |
+                   HVMOP_IOREQ_MEM_ACCESS_WRITE) )
+        return -EINVAL;
+
+    spin_lock(&d->arch.hvm_domain.ioreq_server.lock);
+
+    rc = -ENOENT;
+    list_for_each_entry ( s,
+                          &d->arch.hvm_domain.ioreq_server.list,
+                          list_entry )
+    {
+        if ( s == d->arch.hvm_domain.default_ioreq_server )
+            continue;
+
+        if ( s->id == id )
+        {
+            rc = p2m_set_ioreq_server(d, p2mt, flags, s);
+            if ( rc )
+                break;
+
+            gdprintk(XENLOG_DEBUG, "%u claimed type p2m_ioreq_server\n",
+                     s->id);
+        }
+    }
+
+    spin_unlock(&d->arch.hvm_domain.ioreq_server.lock);
+
+    return rc;
+}
+
 static int hvm_set_ioreq_server_state(struct domain *d, ioservid_t id,
                                       bool_t enabled)
 {
@@ -3174,9 +3225,9 @@  int hvm_hap_nested_page_fault(paddr_t gpa, unsigned long gla,
      * If this GFN is emulated MMIO or marked as read-only, pass the fault
      * to the mmio handler.
      */
-    if ( (p2mt == p2m_mmio_dm) || 
-         (npfec.write_access &&
-          (p2m_is_discard_write(p2mt) || (p2mt == p2m_ioreq_server))) )
+    if ( (p2mt == p2m_mmio_dm) ||
+         (p2mt == p2m_ioreq_server) ||
+         (npfec.write_access && p2m_is_discard_write(p2mt)) )
     {
         __put_gfn(p2m, gfn);
         if ( ap2m_active )
@@ -5989,6 +6040,36 @@  static int hvmop_unmap_io_range_from_ioreq_server(
     return rc;
 }
 
+static int hvmop_map_mem_type_to_ioreq_server(
+    XEN_GUEST_HANDLE_PARAM(xen_hvm_map_mem_type_to_ioreq_server_t) uop)
+{
+    xen_hvm_map_mem_type_to_ioreq_server_t op;
+    struct domain *d;
+    int rc;
+
+    if ( copy_from_guest(&op, uop, 1) )
+        return -EFAULT;
+
+    rc = rcu_lock_remote_domain_by_id(op.domid, &d);
+    if ( rc != 0 )
+        return rc;
+
+    rc = -EINVAL;
+    if ( !is_hvm_domain(d) )
+        goto out;
+
+    rc = xsm_hvm_ioreq_server(XSM_DM_PRIV, d,
+                              HVMOP_map_mem_type_to_ioreq_server);
+    if ( rc != 0 )
+        goto out;
+
+    rc = hvm_map_mem_type_to_ioreq_server(d, op.id, op.type, op.flags);
+
+ out:
+    rcu_unlock_domain(d);
+    return rc;
+}
+
 static int hvmop_set_ioreq_server_state(
     XEN_GUEST_HANDLE_PARAM(xen_hvm_set_ioreq_server_state_t) uop)
 {
@@ -6754,6 +6835,11 @@  long do_hvm_op(unsigned long op, XEN_GUEST_HANDLE_PARAM(void) arg)
             guest_handle_cast(arg, xen_hvm_io_range_t));
         break;
 
+    case HVMOP_map_mem_type_to_ioreq_server:
+        rc = hvmop_map_mem_type_to_ioreq_server(
+            guest_handle_cast(arg, xen_hvm_map_mem_type_to_ioreq_server_t));
+        break;
+
     case HVMOP_set_ioreq_server_state:
         rc = hvmop_set_ioreq_server_state(
             guest_handle_cast(arg, xen_hvm_set_ioreq_server_state_t));
diff --git a/xen/arch/x86/mm/hap/nested_hap.c b/xen/arch/x86/mm/hap/nested_hap.c
index 9cee5a0..bbb6d85 100644
--- a/xen/arch/x86/mm/hap/nested_hap.c
+++ b/xen/arch/x86/mm/hap/nested_hap.c
@@ -174,7 +174,7 @@  nestedhap_walk_L0_p2m(struct p2m_domain *p2m, paddr_t L1_gpa, paddr_t *L0_gpa,
     if ( *p2mt == p2m_mmio_direct )
         goto direct_mmio_out;
     rc = NESTEDHVM_PAGEFAULT_MMIO;
-    if ( *p2mt == p2m_mmio_dm )
+    if ( *p2mt == p2m_mmio_dm || *p2mt == p2m_ioreq_server )
         goto out;
 
     rc = NESTEDHVM_PAGEFAULT_L0_ERROR;
diff --git a/xen/arch/x86/mm/p2m-ept.c b/xen/arch/x86/mm/p2m-ept.c
index 380ec25..21e04ce 100644
--- a/xen/arch/x86/mm/p2m-ept.c
+++ b/xen/arch/x86/mm/p2m-ept.c
@@ -132,6 +132,14 @@  static void ept_p2m_type_to_flags(struct p2m_domain *p2m, ept_entry_t *entry,
             entry->r = entry->w = entry->x = 1;
             entry->a = entry->d = !!cpu_has_vmx_ept_ad;
             break;
+        case p2m_ioreq_server:
+            entry->r = !(p2m->ioreq.flags & P2M_IOREQ_HANDLE_READ_ACCESS);
+            entry->w = (entry->r &
+                        !(p2m->ioreq.flags & P2M_IOREQ_HANDLE_WRITE_ACCESS));
+            entry->x = entry->r;
+            entry->a = !!cpu_has_vmx_ept_ad;
+            entry->d = 0;
+            break;
         case p2m_mmio_direct:
             entry->r = entry->x = 1;
             entry->w = !rangeset_contains_singleton(mmio_ro_ranges,
@@ -171,7 +179,6 @@  static void ept_p2m_type_to_flags(struct p2m_domain *p2m, ept_entry_t *entry,
             entry->a = entry->d = !!cpu_has_vmx_ept_ad;
             break;
         case p2m_grant_map_ro:
-        case p2m_ioreq_server:
             entry->r = 1;
             entry->w = entry->x = 0;
             entry->a = !!cpu_has_vmx_ept_ad;
diff --git a/xen/arch/x86/mm/p2m-pt.c b/xen/arch/x86/mm/p2m-pt.c
index eabd2e3..7a0ddb8 100644
--- a/xen/arch/x86/mm/p2m-pt.c
+++ b/xen/arch/x86/mm/p2m-pt.c
@@ -72,8 +72,8 @@  static const unsigned long pgt[] = {
     PGT_l3_page_table
 };
 
-static unsigned long p2m_type_to_flags(p2m_type_t t, mfn_t mfn,
-                                       unsigned int level)
+static unsigned long p2m_type_to_flags(struct p2m_domain *p2m, p2m_type_t t,
+                                       mfn_t mfn, unsigned int level)
 {
     unsigned long flags;
     /*
@@ -94,8 +94,18 @@  static unsigned long p2m_type_to_flags(p2m_type_t t, mfn_t mfn,
     default:
         return flags | _PAGE_NX_BIT;
     case p2m_grant_map_ro:
-    case p2m_ioreq_server:
         return flags | P2M_BASE_FLAGS | _PAGE_NX_BIT;
+    case p2m_ioreq_server:
+    {
+        flags |= P2M_BASE_FLAGS | _PAGE_RW;
+
+        if ( p2m->ioreq.flags & P2M_IOREQ_HANDLE_READ_ACCESS )
+            return flags & ~(_PAGE_PRESENT | _PAGE_RW);
+        else if ( p2m->ioreq.flags & P2M_IOREQ_HANDLE_WRITE_ACCESS )
+            return flags & ~_PAGE_RW;
+        else
+            return flags;
+    }
     case p2m_ram_ro:
     case p2m_ram_logdirty:
     case p2m_ram_shared:
@@ -442,7 +452,8 @@  static int do_recalc(struct p2m_domain *p2m, unsigned long gfn)
             p2m_type_t p2mt = p2m_is_logdirty_range(p2m, gfn & mask, gfn | ~mask)
                               ? p2m_ram_logdirty : p2m_ram_rw;
             unsigned long mfn = l1e_get_pfn(e);
-            unsigned long flags = p2m_type_to_flags(p2mt, _mfn(mfn), level);
+            unsigned long flags = p2m_type_to_flags(p2m, p2mt,
+                                                    _mfn(mfn), level);
 
             if ( level )
             {
@@ -579,7 +590,7 @@  p2m_pt_set_entry(struct p2m_domain *p2m, unsigned long gfn, mfn_t mfn,
         ASSERT(!mfn_valid(mfn) || p2mt != p2m_mmio_direct);
         l3e_content = mfn_valid(mfn) || p2m_allows_invalid_mfn(p2mt)
             ? l3e_from_pfn(mfn_x(mfn),
-                           p2m_type_to_flags(p2mt, mfn, 2) | _PAGE_PSE)
+                           p2m_type_to_flags(p2m, p2mt, mfn, 2) | _PAGE_PSE)
             : l3e_empty();
         entry_content.l1 = l3e_content.l3;
 
@@ -615,7 +626,7 @@  p2m_pt_set_entry(struct p2m_domain *p2m, unsigned long gfn, mfn_t mfn,
 
         if ( mfn_valid(mfn) || p2m_allows_invalid_mfn(p2mt) )
             entry_content = p2m_l1e_from_pfn(mfn_x(mfn),
-                                             p2m_type_to_flags(p2mt, mfn, 0));
+                                         p2m_type_to_flags(p2m, p2mt, mfn, 0));
         else
             entry_content = l1e_empty();
 
@@ -651,7 +662,7 @@  p2m_pt_set_entry(struct p2m_domain *p2m, unsigned long gfn, mfn_t mfn,
         ASSERT(!mfn_valid(mfn) || p2mt != p2m_mmio_direct);
         if ( mfn_valid(mfn) || p2m_allows_invalid_mfn(p2mt) )
             l2e_content = l2e_from_pfn(mfn_x(mfn),
-                                       p2m_type_to_flags(p2mt, mfn, 1) |
+                                       p2m_type_to_flags(p2m, p2mt, mfn, 1) |
                                        _PAGE_PSE);
         else
             l2e_content = l2e_empty();
diff --git a/xen/arch/x86/mm/p2m.c b/xen/arch/x86/mm/p2m.c
index b3fce1b..2f9a6d9 100644
--- a/xen/arch/x86/mm/p2m.c
+++ b/xen/arch/x86/mm/p2m.c
@@ -83,6 +83,8 @@  static int p2m_initialise(struct domain *d, struct p2m_domain *p2m)
     else
         p2m_pt_init(p2m);
 
+    spin_lock_init(&p2m->ioreq.lock);
+
     return ret;
 }
 
@@ -289,6 +291,83 @@  void p2m_memory_type_changed(struct domain *d)
     }
 }
 
+int p2m_set_ioreq_server(struct domain *d, p2m_type_t t,
+                         unsigned long flags,
+                         struct hvm_ioreq_server *s)
+{
+    struct p2m_domain *p2m = p2m_get_hostp2m(d);
+    int rc;
+
+    spin_lock(&p2m->ioreq.lock);
+
+    rc = -EINVAL;
+    if ( t != p2m_ioreq_server )
+        goto out;
+
+    rc = -EBUSY;
+    if ( ( p2m->ioreq.server != NULL ) && (flags != 0) )
+        goto out;
+
+    if ( flags == 0 )
+    {
+        p2m->ioreq.server = NULL;
+        p2m->ioreq.flags = 0;
+    }
+    else
+    {
+        p2m->ioreq.server = s;
+        p2m->ioreq.flags = flags;
+    }
+
+    p2m_memory_type_changed(d);
+
+    rc = 0;
+
+out:
+    spin_unlock(&p2m->ioreq.lock);
+
+    return rc;
+}
+
+void p2m_get_ioreq_server(struct domain *d, p2m_type_t t,
+                          unsigned long *flags,
+                          struct hvm_ioreq_server **s)
+{
+    struct p2m_domain *p2m = p2m_get_hostp2m(d);
+
+    if ( t != p2m_ioreq_server )
+    {
+        *s = NULL;
+        *flags = 0;
+        return;
+    }
+
+    spin_lock(&p2m->ioreq.lock);
+
+    *s = p2m->ioreq.server;
+    *flags = p2m->ioreq.flags;
+
+    spin_unlock(&p2m->ioreq.lock);
+}
+
+void p2m_destroy_ioreq_server(struct domain *d,
+                              struct hvm_ioreq_server *s)
+{
+    struct p2m_domain *p2m = p2m_get_hostp2m(d);
+
+    spin_lock(&p2m->ioreq.lock);
+
+    if ( p2m->ioreq.server == s )
+    {
+        p2m->ioreq.server = NULL;
+        p2m->ioreq.flags = 0;
+    }
+
+    p2m_memory_type_changed(d);
+
+    spin_unlock(&p2m->ioreq.lock);
+}
+
 void p2m_enable_hardware_log_dirty(struct domain *d)
 {
     struct p2m_domain *p2m = p2m_get_hostp2m(d);
diff --git a/xen/arch/x86/mm/shadow/multi.c b/xen/arch/x86/mm/shadow/multi.c
index c81302a..c836517 100644
--- a/xen/arch/x86/mm/shadow/multi.c
+++ b/xen/arch/x86/mm/shadow/multi.c
@@ -530,6 +530,7 @@  _sh_propagate(struct vcpu *v,
     guest_l1e_t guest_entry = { guest_intpte };
     shadow_l1e_t *sp = shadow_entry_ptr;
     struct domain *d = v->domain;
+    struct p2m_domain *p2m = d->arch.p2m;
     struct sh_dirty_vram *dirty_vram = d->arch.hvm_domain.dirty_vram;
     gfn_t target_gfn = guest_l1e_get_gfn(guest_entry);
     u32 pass_thru_flags;
@@ -695,6 +696,15 @@  _sh_propagate(struct vcpu *v,
             sflags &= ~_PAGE_RW;
     }
 
+    if ( p2mt == p2m_ioreq_server )
+    {
+        if ( p2m->ioreq.flags & P2M_IOREQ_HANDLE_WRITE_ACCESS )
+            sflags &= ~_PAGE_RW;
+
+        if ( p2m->ioreq.flags & P2M_IOREQ_HANDLE_READ_ACCESS )
+            sflags &= ~_PAGE_PRESENT;
+    }
+
     /* Read-only memory */
     if ( p2m_is_readonly(p2mt) )
         sflags &= ~_PAGE_RW;
@@ -2855,6 +2865,7 @@  static int sh_page_fault(struct vcpu *v,
                           struct cpu_user_regs *regs)
 {
     struct domain *d = v->domain;
+    struct p2m_domain *p2m = d->arch.p2m;
     walk_t gw;
     gfn_t gfn = _gfn(0);
     mfn_t gmfn, sl1mfn = _mfn(0);
@@ -3224,13 +3235,24 @@  static int sh_page_fault(struct vcpu *v,
     }
 
     /* Need to hand off device-model MMIO to the device model */
-    if ( p2mt == p2m_mmio_dm
-         || (p2mt == p2m_ioreq_server && ft == ft_demand_write) )
+    if ( p2mt == p2m_mmio_dm )
     {
         gpa = guest_walk_to_gpa(&gw);
         goto mmio;
     }
 
+    if ( p2mt == p2m_ioreq_server )
+    {
+        if ( (ft == ft_demand_write &&
+              (p2m->ioreq.flags & P2M_IOREQ_HANDLE_WRITE_ACCESS)) ||
+             (ft == ft_demand_read &&
+              (p2m->ioreq.flags & P2M_IOREQ_HANDLE_READ_ACCESS)) )
+        {
+            gpa = guest_walk_to_gpa(&gw);
+            goto mmio;
+        }
+    }
+
     /* Ignore attempts to write to read-only memory. */
     if ( p2m_is_readonly(p2mt) && (ft == ft_demand_write) )
     {
diff --git a/xen/include/asm-x86/p2m.h b/xen/include/asm-x86/p2m.h
index 084a1f2..3076fa8 100644
--- a/xen/include/asm-x86/p2m.h
+++ b/xen/include/asm-x86/p2m.h
@@ -111,8 +111,7 @@  typedef unsigned int p2m_query_t;
 #define P2M_RO_TYPES (p2m_to_mask(p2m_ram_logdirty)     \
                       | p2m_to_mask(p2m_ram_ro)         \
                       | p2m_to_mask(p2m_grant_map_ro)   \
-                      | p2m_to_mask(p2m_ram_shared)     \
-                      | p2m_to_mask(p2m_ioreq_server))
+                      | p2m_to_mask(p2m_ram_shared))
 
 /* Write-discard types, which should discard the write operations */
 #define P2M_DISCARD_WRITE_TYPES (p2m_to_mask(p2m_ram_ro)     \
@@ -321,6 +320,16 @@  struct p2m_domain {
         struct ept_data ept;
         /* NPT-equivalent structure could be added here. */
     };
+
+    struct {
+        spinlock_t lock;
+        struct hvm_ioreq_server *server;
+        unsigned long flags;
+
+#define P2M_IOREQ_HANDLE_WRITE_ACCESS HVMOP_IOREQ_MEM_ACCESS_WRITE
+#define P2M_IOREQ_HANDLE_READ_ACCESS  HVMOP_IOREQ_MEM_ACCESS_READ
+
+    } ioreq;
 };
 
 /* get host p2m table */
@@ -822,6 +831,12 @@  static inline unsigned int p2m_get_iommu_flags(p2m_type_t p2mt)
     return flags;
 }
 
+int p2m_set_ioreq_server(struct domain *d, p2m_type_t t, unsigned long flags,
+                         struct hvm_ioreq_server *s);
+void p2m_get_ioreq_server(struct domain *d, p2m_type_t t, unsigned long *flags,
+                         struct hvm_ioreq_server **s);
+void p2m_destroy_ioreq_server(struct domain *d, struct hvm_ioreq_server *s);
+
 #endif /* _XEN_P2M_H */
 
 /*
diff --git a/xen/include/public/hvm/hvm_op.h b/xen/include/public/hvm/hvm_op.h
index a1eae52..e7fc75d 100644
--- a/xen/include/public/hvm/hvm_op.h
+++ b/xen/include/public/hvm/hvm_op.h
@@ -489,6 +489,30 @@  struct xen_hvm_altp2m_op {
 typedef struct xen_hvm_altp2m_op xen_hvm_altp2m_op_t;
 DEFINE_XEN_GUEST_HANDLE(xen_hvm_altp2m_op_t);
 
+#if defined(__XEN__) || defined(__XEN_TOOLS__)
+
+#define HVMOP_map_mem_type_to_ioreq_server 26
+struct xen_hvm_map_mem_type_to_ioreq_server {
+    domid_t domid;      /* IN - domain to be serviced */
+    ioservid_t id;      /* IN - server id */
+    hvmmem_type_t type; /* IN - memory type */
+    uint32_t flags;     /* IN - types of access to be
+                           intercepted */
+#define _HVMOP_IOREQ_MEM_ACCESS_READ 0
+#define HVMOP_IOREQ_MEM_ACCESS_READ \
+    (1u << _HVMOP_IOREQ_MEM_ACCESS_READ)
+
+#define _HVMOP_IOREQ_MEM_ACCESS_WRITE 1
+#define HVMOP_IOREQ_MEM_ACCESS_WRITE \
+    (1u << _HVMOP_IOREQ_MEM_ACCESS_WRITE)
+};
+typedef struct xen_hvm_map_mem_type_to_ioreq_server
+    xen_hvm_map_mem_type_to_ioreq_server_t;
+DEFINE_XEN_GUEST_HANDLE(xen_hvm_map_mem_type_to_ioreq_server_t);
+
+#endif /* defined(__XEN__) || defined(__XEN_TOOLS__) */
+
+
 #endif /* __XEN_PUBLIC_HVM_HVM_OP_H__ */
 
 /*