
[RFC,1/1] xen: Use a global mapping for runstate

Message ID 03e7cd740922bfbaa479f22d81d9de06f718a305.1590675919.git.bertrand.marquis@arm.com (mailing list archive)
State New, archived
Series: Runstate error with KPTI

Commit Message

Bertrand Marquis May 28, 2020, 3:25 p.m. UTC
At the moment on Arm, a Linux guest running with KPTI enabled will
cause the following error when a context switch happens in user mode:
(XEN) p2m.c:1890: d1v0: Failed to walk page-table va 0xffffff837ebe0cd0

This patch modifies runstate handling to map the area given by the
guest inside Xen during the hypercall.
This removes the guest virtual-to-physical conversion during context
switches, which fixes the bug and improves performance by avoiding
page-table walks during context switches.

--
In its current state, this patch only works on Arm and needs to
be fixed on x86 (see the #error in domain.c for the missing get_page_from_gva).

Signed-off-by: Bertrand Marquis <bertrand.marquis@arm.com>
---
 xen/arch/arm/domain.c   | 32 +++++++++-------
 xen/arch/x86/domain.c   | 51 ++++++++++++++-----------
 xen/common/domain.c     | 84 ++++++++++++++++++++++++++++++++++-------
 xen/include/xen/sched.h | 11 ++++--
 4 files changed, 124 insertions(+), 54 deletions(-)

Comments

Roger Pau Monné May 28, 2020, 4:53 p.m. UTC | #1
On Thu, May 28, 2020 at 04:25:31PM +0100, Bertrand Marquis wrote:
> At the moment on Arm, a Linux guest running with KPTI enabled will
> cause the following error when a context switch happens in user mode:
> (XEN) p2m.c:1890: d1v0: Failed to walk page-table va 0xffffff837ebe0cd0
> 
> This patch modifies runstate handling to map the area given by the
> guest inside Xen during the hypercall.
> This removes the guest virtual-to-physical conversion during context
> switches, which fixes the bug and improves performance by avoiding
> page-table walks during context switches.
> 
> --
> In its current state, this patch only works on Arm and needs to
> be fixed on x86 (see the #error in domain.c for the missing get_page_from_gva).
> 
> Signed-off-by: Bertrand Marquis <bertrand.marquis@arm.com>
> ---
>  xen/arch/arm/domain.c   | 32 +++++++++-------
>  xen/arch/x86/domain.c   | 51 ++++++++++++++-----------
>  xen/common/domain.c     | 84 ++++++++++++++++++++++++++++++++++-------
>  xen/include/xen/sched.h | 11 ++++--
>  4 files changed, 124 insertions(+), 54 deletions(-)
> 
> diff --git a/xen/arch/arm/domain.c b/xen/arch/arm/domain.c
> index 31169326b2..799b0e0103 100644
> --- a/xen/arch/arm/domain.c
> +++ b/xen/arch/arm/domain.c
> @@ -278,33 +278,37 @@ static void ctxt_switch_to(struct vcpu *n)
>  /* Update per-VCPU guest runstate shared memory area (if registered). */
>  static void update_runstate_area(struct vcpu *v)
>  {
> -    void __user *guest_handle = NULL;
> -    struct vcpu_runstate_info runstate;
> +    struct vcpu_runstate_info *runstate;
>  
> -    if ( guest_handle_is_null(runstate_guest(v)) )
> +    /* XXX why do we accept not to block here */
> +    if ( !spin_trylock(&v->runstate_guest_lock) )

IMO the runstate is not a crucial piece of information, so it's better
to context switch fast.

>          return;
>  
> -    memcpy(&runstate, &v->runstate, sizeof(runstate));
> +    runstate = runstate_guest(v);
> +
> +    if (runstate == NULL)

In general we don't explicitly check for NULL, and you could write
this as:

    if ( runstate ) ...

Note the added spaces between the parentheses and the condition. I would
also likely check runstate_guest(v) directly and assign to runstate
afterwards if it's set.
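
A minimal sketch of that shape (keeping the trylock from the patch):

    if ( !spin_trylock(&v->runstate_guest_lock) )
        return;

    if ( !runstate_guest(v) )
    {
        spin_unlock(&v->runstate_guest_lock);
        return;
    }

    runstate = runstate_guest(v);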

> +    {
> +        spin_unlock(&v->runstate_guest_lock);
> +        return;
> +    }
>  
>      if ( VM_ASSIST(v->domain, runstate_update_flag) )
>      {
> -        guest_handle = &v->runstate_guest.p->state_entry_time + 1;
> -        guest_handle--;
> -        runstate.state_entry_time |= XEN_RUNSTATE_UPDATE;
> -        __raw_copy_to_guest(guest_handle,
> -                            (void *)(&runstate.state_entry_time + 1) - 1, 1);
> +        runstate->state_entry_time |= XEN_RUNSTATE_UPDATE;
>          smp_wmb();
> +        v->runstate.state_entry_time |= XEN_RUNSTATE_UPDATE;
>      }
>  
> -    __copy_to_guest(runstate_guest(v), &runstate, 1);
> +    memcpy(runstate, &v->runstate, sizeof(v->runstate));
>  
> -    if ( guest_handle )
> +    if ( VM_ASSIST(v->domain, runstate_update_flag) )
>      {
> -        runstate.state_entry_time &= ~XEN_RUNSTATE_UPDATE;
> +        runstate->state_entry_time &= ~XEN_RUNSTATE_UPDATE;
>          smp_wmb();

I think you need the barrier before clearing XEN_RUNSTATE_UPDATE from
the guest version of the runstate info, to make sure writes are not
re-ordered and hence that the XEN_RUNSTATE_UPDATE flag in the guest
version is not cleared before the full write has been committed?
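
I.e., as a sketch:

    memcpy(runstate, &v->runstate, sizeof(v->runstate));
    smp_wmb();                      /* commit the copy... */
    runstate->state_entry_time &= ~XEN_RUNSTATE_UPDATE; /* ...then clear */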

> -        __raw_copy_to_guest(guest_handle,
> -                            (void *)(&runstate.state_entry_time + 1) - 1, 1);
> +        v->runstate.state_entry_time &= ~XEN_RUNSTATE_UPDATE;
>      }
> +
> +    spin_unlock(&v->runstate_guest_lock);
>  }
>  
>  static void schedule_tail(struct vcpu *prev)
> diff --git a/xen/arch/x86/domain.c b/xen/arch/x86/domain.c
> index 6327ba0790..10c6ceb561 100644
> --- a/xen/arch/x86/domain.c
> +++ b/xen/arch/x86/domain.c
> @@ -1642,57 +1642,62 @@ void paravirt_ctxt_switch_to(struct vcpu *v)
>  /* Update per-VCPU guest runstate shared memory area (if registered). */
>  bool update_runstate_area(struct vcpu *v)
>  {
> -    bool rc;
>      struct guest_memory_policy policy = { .nested_guest_mode = false };
> -    void __user *guest_handle = NULL;
> -    struct vcpu_runstate_info runstate;
> +    struct vcpu_runstate_info *runstate;
>  
> -    if ( guest_handle_is_null(runstate_guest(v)) )
> +    /* XXX: should we return false ? */
> +    if ( !spin_trylock(&v->runstate_guest_lock) )
>          return true;
>  
> -    update_guest_memory_policy(v, &policy);
> +    runstate = runstate_guest(v);
>  
> -    memcpy(&runstate, &v->runstate, sizeof(runstate));
> +    if (runstate == NULL)
> +    {
> +        spin_unlock(&v->runstate_guest_lock);
> +        return true;
> +    }
> +
> +    update_guest_memory_policy(v, &policy);
>  
>      if ( VM_ASSIST(v->domain, runstate_update_flag) )
>      {
> -        guest_handle = has_32bit_shinfo(v->domain)
> -            ? &v->runstate_guest.compat.p->state_entry_time + 1
> -            : &v->runstate_guest.native.p->state_entry_time + 1;
> -        guest_handle--;
> -        runstate.state_entry_time |= XEN_RUNSTATE_UPDATE;
> -        __raw_copy_to_guest(guest_handle,
> -                            (void *)(&runstate.state_entry_time + 1) - 1, 1);
> +        runstate->state_entry_time |= XEN_RUNSTATE_UPDATE;
>          smp_wmb();
> +        if (has_32bit_shinfo(v->domain))
> +            v->runstate_guest.compat->state_entry_time |= XEN_RUNSTATE_UPDATE;
> +        else
> +            v->runstate_guest.native->state_entry_time |= XEN_RUNSTATE_UPDATE;

I'm confused here, isn't runstate == v->runstate_guest.native at this
point?

I think you want to update v->runstate.state_entry_time here?
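
I.e. presumably:

    v->runstate.state_entry_time |= XEN_RUNSTATE_UPDATE;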

>      }
>  
>      if ( has_32bit_shinfo(v->domain) )
>      {
>          struct compat_vcpu_runstate_info info;
> -
>          XLAT_vcpu_runstate_info(&info, &runstate);
> -        __copy_to_guest(v->runstate_guest.compat, &info, 1);
> -        rc = true;
> +        memcpy(v->runstate_guest.compat, &info, 1);
>      }
>      else
> -        rc = __copy_to_guest(runstate_guest(v), &runstate, 1) !=
> -             sizeof(runstate);
> +        memcpy(runstate, &v->runstate, sizeof(v->runstate));
>  
> -    if ( guest_handle )
> +    if ( VM_ASSIST(v->domain, runstate_update_flag) )
>      {
> -        runstate.state_entry_time &= ~XEN_RUNSTATE_UPDATE;
> +        runstate->state_entry_time |= XEN_RUNSTATE_UPDATE;
>          smp_wmb();
> -        __raw_copy_to_guest(guest_handle,
> -                            (void *)(&runstate.state_entry_time + 1) - 1, 1);
> +        if (has_32bit_shinfo(v->domain))
> +            v->runstate_guest.compat->state_entry_time |= XEN_RUNSTATE_UPDATE;
> +        else
> +            v->runstate_guest.native->state_entry_time |= XEN_RUNSTATE_UPDATE;

Same comment here related to the usage of runstate_guest instead of
runstate.

>      }
>  
> +    spin_unlock(&v->runstate_guest_lock);
> +
>      update_guest_memory_policy(v, &policy);
>  
> -    return rc;
> +    return true;
>  }
>  
>  static void _update_runstate_area(struct vcpu *v)
>  {
> +    /* XXX: this should be removed */
>      if ( !update_runstate_area(v) && is_pv_vcpu(v) &&
>           !(v->arch.flags & TF_kernel_mode) )
>          v->arch.pv.need_update_runstate_area = 1;
> diff --git a/xen/common/domain.c b/xen/common/domain.c
> index 7cc9526139..acc6f90ba3 100644
> --- a/xen/common/domain.c
> +++ b/xen/common/domain.c
> @@ -161,6 +161,7 @@ struct vcpu *vcpu_create(struct domain *d, unsigned int vcpu_id)
>      v->dirty_cpu = VCPU_CPU_CLEAN;
>  
>      spin_lock_init(&v->virq_lock);
> +    spin_lock_init(&v->runstate_guest_lock);
>  
>      tasklet_init(&v->continue_hypercall_tasklet, NULL, NULL);
>  
> @@ -691,6 +692,66 @@ int rcu_lock_live_remote_domain_by_id(domid_t dom, struct domain **d)
>      return 0;
>  }
>  
> +static void  unmap_runstate_area(struct vcpu *v, unsigned int lock)

lock wants to be a bool here.

> +{
> +    mfn_t mfn;
> +
> +    if ( ! runstate_guest(v) )
> +        return;

I think you must check runstate_guest with the lock taken?

> +
> +    if (lock)
> +        spin_lock(&v->runstate_guest_lock);
> +
> +    mfn = domain_page_map_to_mfn(runstate_guest(v));
> +
> +    unmap_domain_page_global((void *)
> +                            ((unsigned long)v->runstate_guest &
> +                             PAGE_MASK));
> +
> +    put_page_and_type(mfn_to_page(mfn));
> +    runstate_guest(v) = NULL;
> +
> +    if (lock)
> +        spin_unlock(&v->runstate_guest_lock);
> +}
> +
> +static int map_runstate_area(struct vcpu *v,
> +                    struct vcpu_register_runstate_memory_area *area)
> +{
> +    unsigned long offset = area->addr.p & ~PAGE_MASK;
> +    void *mapping;
> +    struct page_info *page;
> +    size_t size = sizeof(struct vcpu_runstate_info);
> +
> +    ASSERT(runstate_guest(v) == NULL);
> +
> +    /* do not allow an area crossing 2 pages */
> +    if ( offset > (PAGE_SIZE - size) )
> +        return -EINVAL;

I'm afraid this is not suitable: Linux will BUG if
VCPUOP_register_runstate_memory_area returns an error, and current
Linux code doesn't check that the area doesn't cross a page
boundary. You will need to take a reference to the possible two pages
in that case.
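
A sketch of what that could look like (error paths elided; vmap() would
give a contiguous mapping of the frames):

    unsigned int i, nr = DIV_ROUND_UP(offset + size, PAGE_SIZE);
    struct page_info *pg[2];
    mfn_t mfns[2];

    for ( i = 0; i < nr; i++ )
    {
        pg[i] = get_page_from_gva(v, (area->addr.p & PAGE_MASK) +
                                  i * PAGE_SIZE, GV2M_WRITE);
        if ( !pg[i] )
            return -EINVAL; /* must also drop references taken so far */
        mfns[i] = page_to_mfn(pg[i]);
    }

    mapping = vmap(mfns, nr);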

> +
> +#ifdef CONFIG_ARM
> +    page = get_page_from_gva(v, area->addr.p, GV2M_WRITE);
> +#else
> +    /* XXX how to solve this one ? */

We have hvm_translate_get_page which seems similar, will need to look
into this.

> +#error get_page_from_gva is not available on other archs
> +#endif
> +    if ( !page )
> +        return -EINVAL;
> +
> +    mapping = __map_domain_page_global(page);
> +
> +    if ( mapping == NULL )
> +    {
> +        put_page_and_type(page);
> +        return -ENOMEM;
> +    }
> +
> +    runstate_guest(v) = (struct vcpu_runstate_info *)
> +        ((unsigned long)mapping + offset);

There's no need to cast to unsigned long, you can just do pointer
arithmetic on the void * directly. That should also get rid of the
cast to vcpu_runstate_info I think.
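
I.e. simply:

    runstate_guest(v) = mapping + offset;

(Xen is built with GNU extensions, where arithmetic on void * is accepted.)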

> +
> +    return 0;
> +}
> +
>  int domain_kill(struct domain *d)
>  {
>      int rc = 0;
> @@ -727,7 +788,10 @@ int domain_kill(struct domain *d)
>          if ( cpupool_move_domain(d, cpupool0) )
>              return -ERESTART;
>          for_each_vcpu ( d, v )
> +        {
> +            unmap_runstate_area(v, 0);

Why is it not appropriate here to hold the lock? It might not be
technically needed, but still should work?

Thanks, Roger.
Bertrand Marquis May 28, 2020, 5:19 p.m. UTC | #2
> On 28 May 2020, at 17:53, Roger Pau Monné <roger@xen.org> wrote:
> 
> On Thu, May 28, 2020 at 04:25:31PM +0100, Bertrand Marquis wrote:
>> At the moment on Arm, a Linux guest running with KPTI enabled will
>> cause the following error when a context switch happens in user mode:
>> (XEN) p2m.c:1890: d1v0: Failed to walk page-table va 0xffffff837ebe0cd0
>> 
>> This patch modifies runstate handling to map the area given by the
>> guest inside Xen during the hypercall.
>> This removes the guest virtual-to-physical conversion during context
>> switches, which fixes the bug and improves performance by avoiding
>> page-table walks during context switches.
>> 
>> --
>> In its current state, this patch only works on Arm and needs to
>> be fixed on x86 (see the #error in domain.c for the missing get_page_from_gva).
>> 
>> Signed-off-by: Bertrand Marquis <bertrand.marquis@arm.com>
>> ---
>> xen/arch/arm/domain.c   | 32 +++++++++-------
>> xen/arch/x86/domain.c   | 51 ++++++++++++++-----------
>> xen/common/domain.c     | 84 ++++++++++++++++++++++++++++++++++-------
>> xen/include/xen/sched.h | 11 ++++--
>> 4 files changed, 124 insertions(+), 54 deletions(-)
>> 
>> diff --git a/xen/arch/arm/domain.c b/xen/arch/arm/domain.c
>> index 31169326b2..799b0e0103 100644
>> --- a/xen/arch/arm/domain.c
>> +++ b/xen/arch/arm/domain.c
>> @@ -278,33 +278,37 @@ static void ctxt_switch_to(struct vcpu *n)
>> /* Update per-VCPU guest runstate shared memory area (if registered). */
>> static void update_runstate_area(struct vcpu *v)
>> {
>> -    void __user *guest_handle = NULL;
>> -    struct vcpu_runstate_info runstate;
>> +    struct vcpu_runstate_info *runstate;
>> 
>> -    if ( guest_handle_is_null(runstate_guest(v)) )
>> +    /* XXX why do we accept not to block here */
>> +    if ( !spin_trylock(&v->runstate_guest_lock) )
> 
> IMO the runstate is not a crucial piece of information, so it's better
> to context switch fast.

Ok, I will add a comment there to explain it; otherwise it is not obvious why we simply ignore the lock and continue.

> 
>>         return;
>> 
>> -    memcpy(&runstate, &v->runstate, sizeof(runstate));
>> +    runstate = runstate_guest(v);
>> +
>> +    if (runstate == NULL)
> 
> In general we don't explicitly check for NULL, and you could write
> this as:
> 
>    if ( runstate ) ...
> 
> Note the added spaces between the parentheses and the condition. I would
> also likely check runstate_guest(v) directly and assign to runstate
> afterwards if it's set.

Ok

> 
>> +    {
>> +        spin_unlock(&v->runstate_guest_lock);
>> +        return;
>> +    }
>> 
>>     if ( VM_ASSIST(v->domain, runstate_update_flag) )
>>     {
>> -        guest_handle = &v->runstate_guest.p->state_entry_time + 1;
>> -        guest_handle--;
>> -        runstate.state_entry_time |= XEN_RUNSTATE_UPDATE;
>> -        __raw_copy_to_guest(guest_handle,
>> -                            (void *)(&runstate.state_entry_time + 1) - 1, 1);
>> +        runstate->state_entry_time |= XEN_RUNSTATE_UPDATE;
>>         smp_wmb();
>> +        v->runstate.state_entry_time |= XEN_RUNSTATE_UPDATE;
>>     }
>> 
>> -    __copy_to_guest(runstate_guest(v), &runstate, 1);
>> +    memcpy(runstate, &v->runstate, sizeof(v->runstate));
>> 
>> -    if ( guest_handle )
>> +    if ( VM_ASSIST(v->domain, runstate_update_flag) )
>>     {
>> -        runstate.state_entry_time &= ~XEN_RUNSTATE_UPDATE;
>> +        runstate->state_entry_time &= ~XEN_RUNSTATE_UPDATE;
>>         smp_wmb();
> 
> I think you need the barrier before clearing XEN_RUNSTATE_UPDATE from
> the guest version of the runstate info, to make sure writes are not
> re-ordered and hence that the XEN_RUNSTATE_UPDATE flag in the guest
> version is not cleared before the full write has been committed?

Very true. I will fix that.

> 
>> -        __raw_copy_to_guest(guest_handle,
>> -                            (void *)(&runstate.state_entry_time + 1) - 1, 1);
>> +        v->runstate.state_entry_time &= ~XEN_RUNSTATE_UPDATE;
>>     }
>> +
>> +    spin_unlock(&v->runstate_guest_lock);
>> }
>> 
>> static void schedule_tail(struct vcpu *prev)
>> diff --git a/xen/arch/x86/domain.c b/xen/arch/x86/domain.c
>> index 6327ba0790..10c6ceb561 100644
>> --- a/xen/arch/x86/domain.c
>> +++ b/xen/arch/x86/domain.c
>> @@ -1642,57 +1642,62 @@ void paravirt_ctxt_switch_to(struct vcpu *v)
>> /* Update per-VCPU guest runstate shared memory area (if registered). */
>> bool update_runstate_area(struct vcpu *v)
>> {
>> -    bool rc;
>>     struct guest_memory_policy policy = { .nested_guest_mode = false };
>> -    void __user *guest_handle = NULL;
>> -    struct vcpu_runstate_info runstate;
>> +    struct vcpu_runstate_info *runstate;
>> 
>> -    if ( guest_handle_is_null(runstate_guest(v)) )
>> +    /* XXX: should we return false ? */
>> +    if ( !spin_trylock(&v->runstate_guest_lock) )
>>         return true;
>> 
>> -    update_guest_memory_policy(v, &policy);
>> +    runstate = runstate_guest(v);
>> 
>> -    memcpy(&runstate, &v->runstate, sizeof(runstate));
>> +    if (runstate == NULL)
>> +    {
>> +        spin_unlock(&v->runstate_guest_lock);
>> +        return true;
>> +    }
>> +
>> +    update_guest_memory_policy(v, &policy);
>> 
>>     if ( VM_ASSIST(v->domain, runstate_update_flag) )
>>     {
>> -        guest_handle = has_32bit_shinfo(v->domain)
>> -            ? &v->runstate_guest.compat.p->state_entry_time + 1
>> -            : &v->runstate_guest.native.p->state_entry_time + 1;
>> -        guest_handle--;
>> -        runstate.state_entry_time |= XEN_RUNSTATE_UPDATE;
>> -        __raw_copy_to_guest(guest_handle,
>> -                            (void *)(&runstate.state_entry_time + 1) - 1, 1);
>> +        runstate->state_entry_time |= XEN_RUNSTATE_UPDATE;
>>         smp_wmb();
>> +        if (has_32bit_shinfo(v->domain))
>> +            v->runstate_guest.compat->state_entry_time |= XEN_RUNSTATE_UPDATE;
>> +        else
>> +            v->runstate_guest.native->state_entry_time |= XEN_RUNSTATE_UPDATE;
> 
> I'm confused here, isn't runstate == v->runstate_guest.native at this
> point?
> 
> I think you want to update v->runstate.state_entry_time here?

I will have to dig deeper into the x86 implementation on that part because the compatibility handling is not straightforward.
Currently, if compatibility mode is required, both our internal and external copies of the runstate are in compatibility mode.

It might be simpler to only handle the compatibility conversion during update_runstate_area instead of doing it everywhere?
But maybe this should be a change for another patch (if any).

> 
>>     }
>> 
>>     if ( has_32bit_shinfo(v->domain) )
>>     {
>>         struct compat_vcpu_runstate_info info;
>> -
>>         XLAT_vcpu_runstate_info(&info, &runstate);
>> -        __copy_to_guest(v->runstate_guest.compat, &info, 1);
>> -        rc = true;
>> +        memcpy(v->runstate_guest.compat, &info, 1);
>>     }
>>     else
>> -        rc = __copy_to_guest(runstate_guest(v), &runstate, 1) !=
>> -             sizeof(runstate);
>> +        memcpy(runstate, &v->runstate, sizeof(v->runstate));
>> 
>> -    if ( guest_handle )
>> +    if ( VM_ASSIST(v->domain, runstate_update_flag) )
>>     {
>> -        runstate.state_entry_time &= ~XEN_RUNSTATE_UPDATE;
>> +        runstate->state_entry_time |= XEN_RUNSTATE_UPDATE;
>>         smp_wmb();
>> -        __raw_copy_to_guest(guest_handle,
>> -                            (void *)(&runstate.state_entry_time + 1) - 1, 1);
>> +        if (has_32bit_shinfo(v->domain))
>> +            v->runstate_guest.compat->state_entry_time |= XEN_RUNSTATE_UPDATE;
>> +        else
>> +            v->runstate_guest.native->state_entry_time |= XEN_RUNSTATE_UPDATE;
> 
> Same comment here related to the usage of runstate_guest instead of
> runstate.

Agree

> 
>>     }
>> 
>> +    spin_unlock(&v->runstate_guest_lock);
>> +
>>     update_guest_memory_policy(v, &policy);
>> 
>> -    return rc;
>> +    return true;
>> }
>> 
>> static void _update_runstate_area(struct vcpu *v)
>> {
>> +    /* XXX: this should be removed */
>>     if ( !update_runstate_area(v) && is_pv_vcpu(v) &&
>>          !(v->arch.flags & TF_kernel_mode) )
>>         v->arch.pv.need_update_runstate_area = 1;
>> diff --git a/xen/common/domain.c b/xen/common/domain.c
>> index 7cc9526139..acc6f90ba3 100644
>> --- a/xen/common/domain.c
>> +++ b/xen/common/domain.c
>> @@ -161,6 +161,7 @@ struct vcpu *vcpu_create(struct domain *d, unsigned int vcpu_id)
>>     v->dirty_cpu = VCPU_CPU_CLEAN;
>> 
>>     spin_lock_init(&v->virq_lock);
>> +    spin_lock_init(&v->runstate_guest_lock);
>> 
>>     tasklet_init(&v->continue_hypercall_tasklet, NULL, NULL);
>> 
>> @@ -691,6 +692,66 @@ int rcu_lock_live_remote_domain_by_id(domid_t dom, struct domain **d)
>>     return 0;
>> }
>> 
>> +static void  unmap_runstate_area(struct vcpu *v, unsigned int lock)
> 
> lock wants to be a bool here.
Ok I will fix that.

> 
>> +{
>> +    mfn_t mfn;
>> +
>> +    if ( ! runstate_guest(v) )
>> +        return;
> 
> I think you must check runstate_guest with the lock taken?

Right, I will fix that.

> 
>> +
>> +    if (lock)
>> +        spin_lock(&v->runstate_guest_lock);
>> +
>> +    mfn = domain_page_map_to_mfn(runstate_guest(v));
>> +
>> +    unmap_domain_page_global((void *)
>> +                            ((unsigned long)v->runstate_guest &
>> +                             PAGE_MASK));
>> +
>> +    put_page_and_type(mfn_to_page(mfn));
>> +    runstate_guest(v) = NULL;
>> +
>> +    if (lock)
>> +        spin_unlock(&v->runstate_guest_lock);
>> +}
>> +
>> +static int map_runstate_area(struct vcpu *v,
>> +                    struct vcpu_register_runstate_memory_area *area)
>> +{
>> +    unsigned long offset = area->addr.p & ~PAGE_MASK;
>> +    void *mapping;
>> +    struct page_info *page;
>> +    size_t size = sizeof(struct vcpu_runstate_info);
>> +
>> +    ASSERT(runstate_guest(v) == NULL);
>> +
>> +    /* do not allow an area crossing 2 pages */
>> +    if ( offset > (PAGE_SIZE - size) )
>> +        return -EINVAL;
> 
> I'm afraid this is not suitable, Linux will BUG if
> VCPUOP_register_runstate_memory_area returns an error, and current
> Linux code doesn't check that the area doesn't cross a page
> boundary. You will need to take a reference to the possible two pages
> in that case.

Ok, I will fix that.

> 
>> +
>> +#ifdef CONFIG_ARM
>> +    page = get_page_from_gva(v, area->addr.p, GV2M_WRITE);
>> +#else
>> +    /* XXX how to solve this one ? */
> 
> We have hvm_translate_get_page which seems similar, will need to look
> into this.

Ok I will wait for more information from you on that one.

> 
>> +#error get_page_from_gva is not available on other archs
>> +#endif
>> +    if ( !page )
>> +        return -EINVAL;
>> +
>> +    mapping = __map_domain_page_global(page);
>> +
>> +    if ( mapping == NULL )
>> +    {
>> +        put_page_and_type(page);
>> +        return -ENOMEM;
>> +    }
>> +
>> +    runstate_guest(v) = (struct vcpu_runstate_info *)
>> +        ((unsigned long)mapping + offset);
> 
> There's no need to cast to unsigned long, you can just do pointer
> arithmetic on the void * directly. That should also get rid of the
> cast to vcpu_runstate_info I think.

Some compilers do not allow arithmetic on void* and gcc will forbid it with -pedantic-errors; that's why I was used to writing code like that.
I will fix that.

> 
>> +
>> +    return 0;
>> +}
>> +
>> int domain_kill(struct domain *d)
>> {
>>     int rc = 0;
>> @@ -727,7 +788,10 @@ int domain_kill(struct domain *d)
>>         if ( cpupool_move_domain(d, cpupool0) )
>>             return -ERESTART;
>>         for_each_vcpu ( d, v )
>> +        {
>> +            unmap_runstate_area(v, 0);
> 
> Why is it not appropriate here to hold the lock? It might not be
> technically needed, but still should work?

In a killing scenario you might stop a core while it is rescheduling.
Couldn't a core be killed using a cross-core interrupt?
If that is the case, would I need to use interrupt-masked locking sections to prevent it?

Thanks for the feedback.
Bertrand
Julien Grall May 28, 2020, 6:54 p.m. UTC | #3
Hi Bertrand,

Thank you for the patch.

On 28/05/2020 16:25, Bertrand Marquis wrote:
> At the moment on Arm, a Linux guest running with KPTI enabled will
> cause the following error when a context switch happens in user mode:
> (XEN) p2m.c:1890: d1v0: Failed to walk page-table va 0xffffff837ebe0cd0
> 
> This patch modifies runstate handling to map the area given by the
> guest inside Xen during the hypercall.
> This removes the guest virtual-to-physical conversion during context
> switches, which fixes the bug

It would be good to spell out that a virtual address is not stable, so
relying on it is wrong.

> and improves performance by avoiding
> page-table walks during context switches.

With a secret-free hypervisor in mind, I would like to suggest
mapping/unmapping the runstate during context switch.

The cost should be minimal when there is a direct map (i.e. on Arm64 and
x86) and still provide better performance on Arm32.

The change should be minimal compared to the current approach, but this
could be taken care of separately if you don't have time.
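
A rough sketch of that idea (the page/offset fields are made up for
illustration; the hypercall would record them instead of a live mapping):

    /* Called when scheduling the vCPU in. */
    static void map_runstate(struct vcpu *v)
    {
        if ( v->runstate_guest_page )
            runstate_guest(v) = __map_domain_page(v->runstate_guest_page) +
                                v->runstate_guest_offset;
    }

    /* Called when scheduling the vCPU out. */
    static void unmap_runstate(struct vcpu *v)
    {
        if ( runstate_guest(v) )
        {
            unmap_domain_page(runstate_guest(v));
            runstate_guest(v) = NULL;
        }
    }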

> 
> --
> In its current state, this patch only works on Arm and needs to
> be fixed on x86 (see the #error in domain.c for the missing get_page_from_gva).
> 
> Signed-off-by: Bertrand Marquis <bertrand.marquis@arm.com>
> ---
>   xen/arch/arm/domain.c   | 32 +++++++++-------
>   xen/arch/x86/domain.c   | 51 ++++++++++++++-----------
>   xen/common/domain.c     | 84 ++++++++++++++++++++++++++++++++++-------
>   xen/include/xen/sched.h | 11 ++++--
>   4 files changed, 124 insertions(+), 54 deletions(-)
> 
> diff --git a/xen/arch/arm/domain.c b/xen/arch/arm/domain.c
> index 31169326b2..799b0e0103 100644
> --- a/xen/arch/arm/domain.c
> +++ b/xen/arch/arm/domain.c
> @@ -278,33 +278,37 @@ static void ctxt_switch_to(struct vcpu *n)
>   /* Update per-VCPU guest runstate shared memory area (if registered). */
>   static void update_runstate_area(struct vcpu *v)
>   {
> -    void __user *guest_handle = NULL;
> -    struct vcpu_runstate_info runstate;
> +    struct vcpu_runstate_info *runstate;
>   
> -    if ( guest_handle_is_null(runstate_guest(v)) )
> +    /* XXX why do we accept not to block here */
> +    if ( !spin_trylock(&v->runstate_guest_lock) )
>           return;
>   
> -    memcpy(&runstate, &v->runstate, sizeof(runstate));
> +    runstate = runstate_guest(v);
> +
> +    if (runstate == NULL)
> +    {
> +        spin_unlock(&v->runstate_guest_lock);
> +        return;
> +    }
>   
>       if ( VM_ASSIST(v->domain, runstate_update_flag) )
>       {
> -        guest_handle = &v->runstate_guest.p->state_entry_time + 1;
> -        guest_handle--;
> -        runstate.state_entry_time |= XEN_RUNSTATE_UPDATE;
> -        __raw_copy_to_guest(guest_handle,
> -                            (void *)(&runstate.state_entry_time + 1) - 1, 1);
> +        runstate->state_entry_time |= XEN_RUNSTATE_UPDATE;
>           smp_wmb();

Because you set v->runstate.state_entry_time below, the placement of the 
barrier seems a bit odd.

I would suggest updating v->runstate.state_entry_time first and then 
updating runstate->state_entry_time.

> +        v->runstate.state_entry_time |= XEN_RUNSTATE_UPDATE;
>       }
>   
> -    __copy_to_guest(runstate_guest(v), &runstate, 1);
> +    memcpy(runstate, &v->runstate, sizeof(v->runstate));
>   
> -    if ( guest_handle )
> +    if ( VM_ASSIST(v->domain, runstate_update_flag) )
>       {
> -        runstate.state_entry_time &= ~XEN_RUNSTATE_UPDATE;
> +        runstate->state_entry_time &= ~XEN_RUNSTATE_UPDATE;
>           smp_wmb();

You want to update runstate->state_entry_time after the barrier not before.

>   static void _update_runstate_area(struct vcpu *v)
>   {
> +    /* XXX: this should be removed */
>       if ( !update_runstate_area(v) && is_pv_vcpu(v) &&
>            !(v->arch.flags & TF_kernel_mode) )
>           v->arch.pv.need_update_runstate_area = 1;
> diff --git a/xen/common/domain.c b/xen/common/domain.c
> index 7cc9526139..acc6f90ba3 100644
> --- a/xen/common/domain.c
> +++ b/xen/common/domain.c
> @@ -161,6 +161,7 @@ struct vcpu *vcpu_create(struct domain *d, unsigned int vcpu_id)
>       v->dirty_cpu = VCPU_CPU_CLEAN;
>   
>       spin_lock_init(&v->virq_lock);
> +    spin_lock_init(&v->runstate_guest_lock);
>   
>       tasklet_init(&v->continue_hypercall_tasklet, NULL, NULL);
>   
> @@ -691,6 +692,66 @@ int rcu_lock_live_remote_domain_by_id(domid_t dom, struct domain **d)
>       return 0;
>   }
>   
> +static void  unmap_runstate_area(struct vcpu *v, unsigned int lock)

NIT: There is an extra space after void

Also, AFAICT, the lock parameter only takes two values. Please switch to a bool.
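
I.e.:

    static void unmap_runstate_area(struct vcpu *v, bool lock)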

> +{
> +    mfn_t mfn;
> +
> +    if ( ! runstate_guest(v) )

NIT: We don't usually put a space after !.

But shouldn't this be checked within the lock?


> +        return;
> +
> +    if (lock)

NIT: if ( ... )

> +        spin_lock(&v->runstate_guest_lock);
> +
> +    mfn = domain_page_map_to_mfn(runstate_guest(v));
> +
> +    unmap_domain_page_global((void *)
> +                            ((unsigned long)v->runstate_guest &
> +                             PAGE_MASK));
> +
> +    put_page_and_type(mfn_to_page(mfn));
> +    runstate_guest(v) = NULL;
> +
> +    if (lock)

Ditto.

> +        spin_unlock(&v->runstate_guest_lock);
> +}
> +
> +static int map_runstate_area(struct vcpu *v,
> +                    struct vcpu_register_runstate_memory_area *area)
> +{
> +    unsigned long offset = area->addr.p & ~PAGE_MASK;
> +    void *mapping;
> +    struct page_info *page;
> +    size_t size = sizeof(struct vcpu_runstate_info);
> +
> +    ASSERT(runstate_guest(v) == NULL);
> +
> +    /* do not allow an area crossing 2 pages */
> +    if ( offset > (PAGE_SIZE - size) )
> +        return -EINVAL;

This is a change in behavior for the guest. If we are going forward with 
this, this will want a separate patch with its own explanation of why 
this is done.

> +
> +#ifdef CONFIG_ARM
> +    page = get_page_from_gva(v, area->addr.p, GV2M_WRITE);

A guest is allowed to set up the runstate for a different vCPU than the 
current one. This will cause get_page_from_gva() to fail, as the 
function cannot yet work with a vCPU other than current.

AFAICT, there is no restriction on when the runstate hypercall can be 
called. So this can even be called before the vCPU is brought up.

I was going to suggest using the current vCPU for translating the 
address. However, it would be reasonable for an OS to use the same 
virtual address for all the vCPUs assuming the page-tables are different 
per vCPU.

Recent Linux uses a per-cpu area, so the virtual address should be 
different for each vCPU. But I don't know how the other OSes work. 
Roger should be able to help for FreeBSD at least.

I have CCed Paul for the Windows drivers.

If we decide to introduce some restrictions then they should be explained 
in the commit message and also documented in the public header (we have 
been pretty bad at documenting changes in the past!).

Cheers,
Julien Grall May 28, 2020, 7:12 p.m. UTC | #4
Hi,

On 28/05/2020 18:19, Bertrand Marquis wrote:
>>> +
>>> +    return 0;
>>> +}
>>> +
>>> int domain_kill(struct domain *d)
>>> {
>>>      int rc = 0;
>>> @@ -727,7 +788,10 @@ int domain_kill(struct domain *d)
>>>          if ( cpupool_move_domain(d, cpupool0) )
>>>              return -ERESTART;
>>>          for_each_vcpu ( d, v )
>>> +        {
>>> +            unmap_runstate_area(v, 0);
>>
>> Why is it not appropriate here to hold the lock? It might not be
>> technically needed, but still should work?
> 
> In a killing scenario you might stop a core while it is rescheduling.
> Couldn't a core be killed using a cross-core interrupt?

You can't stop a vCPU in the middle of the context switch. The vCPU can 
only be scheduled out at specific points in Xen.

> If that is the case, would I need to use interrupt-masked locking sections to prevent it?

At the beginning of the function domain_kill() the domain will be paused 
(see domain_pause()). After this step none of the vCPUs will be running 
or scheduled.

You should technically use the lock everywhere to avoid a static analyzer 
throwing a warning because you access the variable both with and without the 
lock. A comment would at least be useful in the code if we go ahead with 
this.

As an aside, I think you want the lock to always disable interrupts, 
otherwise check_lock() (this can be enabled with CONFIG_DEBUG_LOCKS only 
on x86 though) will shout at you because your lock can be taken in both 
IRQ-safe and IRQ-unsafe contexts.
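
I.e. something like:

    unsigned long flags;

    spin_lock_irqsave(&v->runstate_guest_lock, flags);
    /* ... manipulate the runstate mapping ... */
    spin_unlock_irqrestore(&v->runstate_guest_lock, flags);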

Cheers,
Roger Pau Monné May 29, 2020, 7:19 a.m. UTC | #5
On Thu, May 28, 2020 at 07:54:35PM +0100, Julien Grall wrote:
> Hi Bertrand,
> 
> Thank you for the patch.
> 
> On 28/05/2020 16:25, Bertrand Marquis wrote:
> > +static int map_runstate_area(struct vcpu *v,
> > +                    struct vcpu_register_runstate_memory_area *area)
> > +{
> > +    unsigned long offset = area->addr.p & ~PAGE_MASK;
> > +    void *mapping;
> > +    struct page_info *page;
> > +    size_t size = sizeof(struct vcpu_runstate_info);
> > +
> > +    ASSERT(runstate_guest(v) == NULL);
> > +
> > +    /* do not allow an area crossing 2 pages */
> > +    if ( offset > (PAGE_SIZE - size) )
> > +        return -EINVAL;
> 
> This is a change in behavior for the guest. If we are going forward with
> this, this will want a separate patch with its own explanation of why
> this is done.

I don't think we can go this route without supporting crossing a page
boundary.

Linux will BUG if VCPUOP_register_runstate_memory_area fails, and
AFAICT there's no check in Linux to ensure the runstate area doesn't
cross a page boundary. If we want to go this route we must support the
area crossing a page boundary, or else we will break existing
guests.

> > +
> > +#ifdef CONFIG_ARM
> > +    page = get_page_from_gva(v, area->addr.p, GV2M_WRITE);
> 
> A guest is allowed to set up the runstate for a different vCPU than the
> current one. This will cause get_page_from_gva() to fail, as the function
> cannot yet work with a vCPU other than current.
> 
> AFAICT, there is no restriction on when the runstate hypercall can be
> called. So this can even be called before the vCPU is brought up.
> 
> I was going to suggest using the current vCPU for translating the address.
> However, it would be reasonable for an OS to use the same virtual address
> for all the vCPUs assuming the page-tables are different per vCPU.

Hm, it's a tricky question. Using the current vCPU page tables would
seem like a good compromise, but it needs to get added to the header
as a note, and this should ideally be merged at the start of a
development cycle to give people time to test and report issues.

> Recent Linux uses a per-cpu area, so the virtual address should be
> different for each vCPU. But I don't know how the other OSes work. Roger
> should be able to help for FreeBSD at least.

FreeBSD doesn't use VCPUOP_register_runstate_memory_area at all, so we
are safe in that regard.

I never got around to implementing the required scheduler changes in
order to support stolen time accounting.  Not sure this has changed
since I last checked, the bhyve and KVM guys also had interest in
properly accounting for stolen time on FreeBSD IIRC.

Roger.
Jan Beulich May 29, 2020, 7:35 a.m. UTC | #6
On 28.05.2020 20:54, Julien Grall wrote:
> On 28/05/2020 16:25, Bertrand Marquis wrote:
>> At the moment on Arm, a Linux guest running with KPTI enabled will
>> cause the following error when a context switch happens in user mode:
>> (XEN) p2m.c:1890: d1v0: Failed to walk page-table va 0xffffff837ebe0cd0
>>
>> This patch modifies runstate handling to map the area given by the
>> guest inside Xen during the hypercall.
>> This removes the guest virtual-to-physical conversion during context
>> switches, which fixes the bug
> 
> It would be good to spell out that a virtual address is not stable, so 
> relying on it is wrong.

Guests at present are permitted to change the mapping underneath the
virtual address provided (this may not be the best idea, but the
interface is like it is). Therefore I don't think the present
interface can be changed like this. Instead a new interface will need
adding which takes a guest physical address instead. (Which, in the
end, will merely be one tiny step towards making the hypercall
interfaces use guest physical addresses. And it would be nice if an
overall concept was hashed out first how that conversion should
occur, such that the change here could at least be made fit that
planned model. For example, an option might be to retain all present
hypercall numbering and simply dedicate a bit in the top level
hypercall numbers indicating whether _all_ involved addresses for
that operation are physical vs virtual ones.)
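
As an illustration only (op number, names and layout invented here), such
an interface could look like:

    /* Register a runstate area by guest physical address. */
    #define VCPUOP_register_runstate_phys_area 14
    struct vcpu_register_runstate_phys_area {
        uint64_t addr; /* guest physical address of the runstate area */
    };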

Jan
Bertrand Marquis May 29, 2020, 8:13 a.m. UTC | #7
Hi Julien,

> On 28 May 2020, at 19:54, Julien Grall <julien@xen.org> wrote:
> 
> Hi Bertrand,
> 
> Thank you for the patch.
> 
> On 28/05/2020 16:25, Bertrand Marquis wrote:
>> At the moment on Arm, a Linux guest running with KPTI enabled will
>> cause the following error when a context switch happens in user mode:
>> (XEN) p2m.c:1890: d1v0: Failed to walk page-table va 0xffffff837ebe0cd0
>> This patch modifies runstate handling to map the area given by the
>> guest inside Xen during the hypercall.
>> This removes the guest virtual-to-physical conversion during context
>> switches, which fixes the bug
> 
> It would be good to spell out that a virtual address is not stable, so relying on it is wrong.
> 
>> and improves performance by avoiding
>> page-table walks during context switches.
> 
> With a secret-free hypervisor in mind, I would like to suggest mapping/unmapping the runstate during context switch.
> 
> The cost should be minimal when there is a direct map (i.e. on Arm64 and x86) and still provide better performance on Arm32.

Even with a minimal cost, this still adds some non-real-time behaviour to the context switch.
But from the security point of view, as we have to map a page from the guest, we could end up with data accessible in Xen that should not be there.
There is a trade-off here:
- Xen can protect itself by mapping/unmapping
- a guest which wants to secure its data should either not use the runstate or make sure there is nothing else in the page

That sounds like a thread-local-storage kind of problem, where we want data from Xen to be accessible fast from the guest and easy to modify from Xen.

> 
> The change should be minimal compared to the current approach, but this could be taken care of separately if you don't have time.

I could add that to the series as a separate patch so that it can be discussed and tested separately?

> 
>> --
>> In its current state, this patch only works on Arm and needs to
>> be fixed on x86 (see the #error in domain.c for the missing get_page_from_gva).
>> Signed-off-by: Bertrand Marquis <bertrand.marquis@arm.com>
>> ---
>>  xen/arch/arm/domain.c   | 32 +++++++++-------
>>  xen/arch/x86/domain.c   | 51 ++++++++++++++-----------
>>  xen/common/domain.c     | 84 ++++++++++++++++++++++++++++++++++-------
>>  xen/include/xen/sched.h | 11 ++++--
>>  4 files changed, 124 insertions(+), 54 deletions(-)
>> diff --git a/xen/arch/arm/domain.c b/xen/arch/arm/domain.c
>> index 31169326b2..799b0e0103 100644
>> --- a/xen/arch/arm/domain.c
>> +++ b/xen/arch/arm/domain.c
>> @@ -278,33 +278,37 @@ static void ctxt_switch_to(struct vcpu *n)
>>  /* Update per-VCPU guest runstate shared memory area (if registered). */
>>  static void update_runstate_area(struct vcpu *v)
>>  {
>> -    void __user *guest_handle = NULL;
>> -    struct vcpu_runstate_info runstate;
>> +    struct vcpu_runstate_info *runstate;
>>  -    if ( guest_handle_is_null(runstate_guest(v)) )
>> +    /* XXX why do we accept not to block here */
>> +    if ( !spin_trylock(&v->runstate_guest_lock) )
>>          return;
>>  -    memcpy(&runstate, &v->runstate, sizeof(runstate));
>> +    runstate = runstate_guest(v);
>> +
>> +    if (runstate == NULL)
>> +    {
>> +        spin_unlock(&v->runstate_guest_lock);
>> +        return;
>> +    }
>>        if ( VM_ASSIST(v->domain, runstate_update_flag) )
>>      {
>> -        guest_handle = &v->runstate_guest.p->state_entry_time + 1;
>> -        guest_handle--;
>> -        runstate.state_entry_time |= XEN_RUNSTATE_UPDATE;
>> -        __raw_copy_to_guest(guest_handle,
>> -                            (void *)(&runstate.state_entry_time + 1) - 1, 1);
>> +        runstate->state_entry_time |= XEN_RUNSTATE_UPDATE;
>>          smp_wmb();
> 
> Because you set v->runstate.state_entry_time below, the placement of the barrier seems a bit odd.
> 
> I would suggest updating v->runstate.state_entry_time first and then updating runstate->state_entry_time.

We do want the guest to know when we modify the runstate, so we need to make sure XEN_RUNSTATE_UPDATE is actually set and visible before we do the memcpy.
That's why the barrier comes after setting the flag.
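
I.e. the intended sequence is:

    runstate->state_entry_time |= XEN_RUNSTATE_UPDATE;
    smp_wmb();                /* flag visible to the guest before the copy */
    v->runstate.state_entry_time |= XEN_RUNSTATE_UPDATE;

    memcpy(runstate, &v->runstate, sizeof(v->runstate));

    smp_wmb();                /* copy visible before the flag is cleared */
    runstate->state_entry_time &= ~XEN_RUNSTATE_UPDATE;
    v->runstate.state_entry_time &= ~XEN_RUNSTATE_UPDATE;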

> 
>> +        v->runstate.state_entry_time |= XEN_RUNSTATE_UPDATE;
>>      }
>>  -    __copy_to_guest(runstate_guest(v), &runstate, 1);
>> +    memcpy(runstate, &v->runstate, sizeof(v->runstate));
>>  -    if ( guest_handle )
>> +    if ( VM_ASSIST(v->domain, runstate_update_flag) )
>>      {
>> -        runstate.state_entry_time &= ~XEN_RUNSTATE_UPDATE;
>> +        runstate->state_entry_time &= ~XEN_RUNSTATE_UPDATE;
>>          smp_wmb();
> 
> You want to update runstate->state_entry_time after the barrier not before.
Agree

> 
>>  static void _update_runstate_area(struct vcpu *v)
>>  {
>> +    /* XXX: this should be removed */
>>      if ( !update_runstate_area(v) && is_pv_vcpu(v) &&
>>           !(v->arch.flags & TF_kernel_mode) )
>>          v->arch.pv.need_update_runstate_area = 1;
>> diff --git a/xen/common/domain.c b/xen/common/domain.c
>> index 7cc9526139..acc6f90ba3 100644
>> --- a/xen/common/domain.c
>> +++ b/xen/common/domain.c
>> @@ -161,6 +161,7 @@ struct vcpu *vcpu_create(struct domain *d, unsigned int vcpu_id)
>>      v->dirty_cpu = VCPU_CPU_CLEAN;
>>        spin_lock_init(&v->virq_lock);
>> +    spin_lock_init(&v->runstate_guest_lock);
>>        tasklet_init(&v->continue_hypercall_tasklet, NULL, NULL);
>>  @@ -691,6 +692,66 @@ int rcu_lock_live_remote_domain_by_id(domid_t dom, struct domain **d)
>>      return 0;
>>  }
>>  +static void  unmap_runstate_area(struct vcpu *v, unsigned int lock)
> 
> NIT: There is an extra space after void
> 
> Also, AFAICT, the lock parameter only takes two values. Please switch to a bool.

Agree

> 
>> +{
>> +    mfn_t mfn;
>> +
>> +    if ( ! runstate_guest(v) )
> 
> NIT: We don't usually put a space after !.
> 
> But shouldn't this be checked within the lock?

Agree

> 
> 
>> +        return;
>> +
>> +    if (lock)
> 
> NIT: if ( ... )
> 

Ack

>> +        spin_lock(&v->runstate_guest_lock);
>> +
>> +    mfn = domain_page_map_to_mfn(runstate_guest(v));
>> +
>> +    unmap_domain_page_global((void *)
>> +                            ((unsigned long)v->runstate_guest &
>> +                             PAGE_MASK));
>> +
>> +    put_page_and_type(mfn_to_page(mfn));
>> +    runstate_guest(v) = NULL;
>> +
>> +    if (lock)
> 
> Ditto.

Ack

> 
>> +        spin_unlock(&v->runstate_guest_lock);
>> +}
>> +
>> +static int map_runstate_area(struct vcpu *v,
>> +                    struct vcpu_register_runstate_memory_area *area)
>> +{
>> +    unsigned long offset = area->addr.p & ~PAGE_MASK;
>> +    void *mapping;
>> +    struct page_info *page;
>> +    size_t size = sizeof(struct vcpu_runstate_info);
>> +
>> +    ASSERT(runstate_guest(v) == NULL);
>> +
>> +    /* do not allow an area crossing 2 pages */
>> +    if ( offset > (PAGE_SIZE - size) )
>> +        return -EINVAL;
> 
> This is a change in behavior for the guest. If we are going forward with this, this will want a separate patch with its own explanation of why this is done.

Ack, I need to add support for an area crossing pages.

> 
>> +
>> +#ifdef CONFIG_ARM
>> +    page = get_page_from_gva(v, area->addr.p, GV2M_WRITE);
> 
> A guest is allowed to set up the runstate for a different vCPU than the current one. This will cause get_page_from_gva() to fail, as the function cannot yet work with a vCPU other than current.

The area is mapped per vCPU, but isn't the aim of this to have a way to check other vCPUs' status?

> 
> AFAICT, there is no restriction on when the runstate hypercall can be called. So this can even be called before the vCPU is brought up.

I understand the remark, but it still feels very weird to allow an invalid address in a hypercall.
Wouldn't we have a lot of potential issues accepting an address that we cannot check?


> 
> I was going to suggest using the current vCPU for translating the address. However, it would be reasonable for an OS to use the same virtual address for all the vCPUs assuming the page-tables are different per vCPU.
> 
> Recent Linux uses a per-cpu area, so the virtual address should be different for each vCPU. But I don't know how the other OSes work. Roger should be able to help for FreeBSD at least.
> 
> I have CCed Paul for the Windows drivers.
> 
> If we decide to introduce some restrictions then they should be explained in the commit message and also documented in the public header (we have been pretty bad at documenting changes in the past!).

Agree

Cheers
Bertrand
Bertrand Marquis May 29, 2020, 8:15 a.m. UTC | #8
> On 28 May 2020, at 20:12, Julien Grall <julien@xen.org> wrote:
> 
> Hi,
> 
> On 28/05/2020 18:19, Bertrand Marquis wrote:
>>>> +
>>>> +    return 0;
>>>> +}
>>>> +
>>>> int domain_kill(struct domain *d)
>>>> {
>>>>     int rc = 0;
>>>> @@ -727,7 +788,10 @@ int domain_kill(struct domain *d)
>>>>         if ( cpupool_move_domain(d, cpupool0) )
>>>>             return -ERESTART;
>>>>         for_each_vcpu ( d, v )
>>>> +        {
>>>> +            unmap_runstate_area(v, 0);
>>> 
>>> Why is it not appropriate here to hold the lock? It might not be
>>> technically needed, but still should work?
>> In a killing scenario you might stop a core while it is rescheduling.
>> Couldn't a core be killed using a cross-core interrupt?
> 
> You can't stop a vCPU in the middle of the context switch. The vCPU can only be scheduled out at specific points in Xen.

Ok

> 
>> If that is the case, would I need to use interrupt-masked locking sections to prevent it?
> 
> At the beginning of the function domain_kill() the domain will be paused (see domain_pause()). After this step none of the vCPUs will be running or scheduled.
> 
> You should technically use the lock everywhere to avoid a static analyzer throwing a warning because you access the variable both with and without the lock. A comment would at least be useful in the code if we go ahead with this.
> 
> As an aside, I think you want the lock to always disable interrupts, otherwise check_lock() (this can be enabled with CONFIG_DEBUG_LOCKS only on x86 though) will shout at you because your lock can be taken in both IRQ-safe and IRQ-unsafe contexts.

Ok I understand that.
I will take the lock here.

Cheers
Bertrand
Bertrand Marquis May 29, 2020, 8:24 a.m. UTC | #9
> On 29 May 2020, at 08:19, Roger Pau Monné <roger@xen.org> wrote:
> 
> On Thu, May 28, 2020 at 07:54:35PM +0100, Julien Grall wrote:
>> Hi Bertrand,
>> 
>> Thank you for the patch.
>> 
>> On 28/05/2020 16:25, Bertrand Marquis wrote:
>>> +static int map_runstate_area(struct vcpu *v,
>>> +                    struct vcpu_register_runstate_memory_area *area)
>>> +{
>>> +    unsigned long offset = area->addr.p & ~PAGE_MASK;
>>> +    void *mapping;
>>> +    struct page_info *page;
>>> +    size_t size = sizeof(struct vcpu_runstate_info);
>>> +
>>> +    ASSERT(runstate_guest(v) == NULL);
>>> +
>>> +    /* do not allow an area crossing 2 pages */
>>> +    if ( offset > (PAGE_SIZE - size) )
>>> +        return -EINVAL;
>> 
>> This is a change in behavior for the guest. If we are going forward with
>> this, this will want a separate patch with its own explanation of why
>> this is done.
> 
> I don't think we can go this route without supporting crossing a page
> boundary.
> 
> Linux will BUG if VCPUOP_register_runstate_memory_area fails, and
> AFAICT there's no check in Linux to ensure the runstate area doesn't
> cross a page boundary. If we want to go this route we must support the
> area crossing a page boundary, or else we will break existing
> guests.

Agree, I will add cross page boundary support.

> 
>>> +
>>> +#ifdef CONFIG_ARM
>>> +    page = get_page_from_gva(v, area->addr.p, GV2M_WRITE);
>> 
>> A guest is allowed to set up the runstate for a different vCPU than the
>> current one. This will cause get_page_from_gva() to fail, as the function
>> cannot yet work with a vCPU other than current.
>> 
>> AFAICT, there is no restriction on when the runstate hypercall can be
>> called. So this can even be called before the vCPU is brought up.
>> 
>> I was going to suggest using the current vCPU for translating the address.
>> However, it would be reasonable for an OS to use the same virtual address
>> for all the vCPUs assuming the page-tables are different per vCPU.
> 
> Hm, it's a tricky question. Using the current vCPU page tables would
> seem like a good compromise, but it needs to get added to the header
> as a note, and this should ideally be merged at the start of a
> development cycle to give people time to test and report issues.

I agree, and as this will not go in 4.14 we could go this route to have this in 4.15?

Bertrand

> 
>> Recent Linux uses a per-cpu area, so the virtual address should be
>> different for each vCPU. But I don't know how the other OSes work. Roger
>> should be able to help for FreeBSD at least.
> 
> FreeBSD doesn't use VCPUOP_register_runstate_memory_area at all, so we
> are safe in that regard.
> 
> I never got around to implementing the required scheduler changes in
> order to support stolen time accounting.  Not sure this has changed
> since I last checked, the bhyve and KVM guys also had interest in
> properly accounting for stolen time on FreeBSD IIRC.
> 
> Roger.
Bertrand Marquis May 29, 2020, 8:32 a.m. UTC | #10
Hi Jan

> On 29 May 2020, at 08:35, Jan Beulich <jbeulich@suse.com> wrote:
> 
> On 28.05.2020 20:54, Julien Grall wrote:
>> On 28/05/2020 16:25, Bertrand Marquis wrote:
>>> At the moment on Arm, a Linux guest running with KPTI enabled will
>>> cause the following error when a context switch happens in user mode:
>>> (XEN) p2m.c:1890: d1v0: Failed to walk page-table va 0xffffff837ebe0cd0
>>> 
>>> This patch modifies runstate handling to map the area given by the
>>> guest inside Xen during the hypercall.
>>> This removes the guest virtual-to-physical conversion during context
>>> switches, which fixes the bug
>> 
>> It would be good to spell out that a virtual address is not stable, so 
>> relying on it is wrong.
> 
> Guests at present are permitted to change the mapping underneath the
> virtual address provided (this may not be the best idea, but the
> interface is like it is). Therefore I don't think the present
> interface can be changed like this. Instead a new interface will need
> adding which takes a guest physical address instead. (Which, in the
> end, will merely be one tiny step towards making the hypercall
> interfaces use guest physical addresses. And it would be nice if an
> overall concept was hashed out first how that conversion should
> occur, such that the change here could at least be made fit that
> planned model. For example, an option might be to retain all present
> hypercall numbering and simply dedicate a bit in the top level
> hypercall numbers indicating whether _all_ involved addresses for
> that operation are physical vs virtual ones.)

I fully agree that moving to interfaces using physical addresses
would be better, but that would need new hypercall numbers (or the
bit system you suggest) to keep backward compatibility.

Regarding the change of virtual address, even though this is theoretically
possible with the current interface, I do not really see how this could be
handled cleanly with KPTI, or even without it, as this would not be an atomic
change on the guest side. The only way to do this cleanly would be to
do a hypercall to remove the address in Xen and then call the hypercall
again to register the new address.

So the only way to solve the KPTI issue would actually be to create a new
hypercall and keep the current bug/problem?

Bertrand
Jan Beulich May 29, 2020, 8:37 a.m. UTC | #11
On 29.05.2020 10:32, Bertrand Marquis wrote:
> So the only way to solve the KPTI issue would actually be to create a new
> hypercall and keep the current bug/problem?

That's my view on it at least, yes.

Jan
Jan Beulich May 29, 2020, 8:45 a.m. UTC | #12
On 29.05.2020 10:13, Bertrand Marquis wrote:
>> On 28 May 2020, at 19:54, Julien Grall <julien@xen.org> wrote:
>> AFAICT, there is no restriction on when the runstate hypercall can be called. So this can even be called before the vCPU is brought up.
> 
> I understand the remark but it still feels very weird to allow an invalid address in an hypercall.
> Wouldn’t we have a lot of potential issues accepting an address that we cannot check ?

I don't think so: The hypervisor uses copy_to_guest() to protect
itself from the addresses being invalid at the time of copying.
If the guest doesn't make sure they're valid at that time, it
simply won't get the information (perhaps until Xen's next
attempt to copy it out).

You may want to take a look at the x86 side of this (also the
vCPU time updating): Due to the way x86-64 PV guests work, the
address may legitimately be unmapped at the time Xen wants to
copy it, when the vCPU is currently executing guest user mode
code. In such a case the copy gets retried the next time the
guest transitions from user to kernel mode (which involves a
page table change).
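
For reference, that retry lives in x86's toggle_guest_mode(); roughly (from
memory, so treat as a sketch):

    if ( v->arch.pv.need_update_runstate_area && update_runstate_area(v) )
        v->arch.pv.need_update_runstate_area = 0;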

Jan
Bertrand Marquis May 29, 2020, 9:18 a.m. UTC | #13
Hi Jan,

> On 29 May 2020, at 09:45, Jan Beulich <jbeulich@suse.com> wrote:
> 
> On 29.05.2020 10:13, Bertrand Marquis wrote:
>>> On 28 May 2020, at 19:54, Julien Grall <julien@xen.org> wrote:
>>> AFAICT, there is no restriction on when the runstate hypercall can be called. So this can even be called before the vCPU is brought up.
>> 
>> I understand the remark, but it still feels very weird to allow an invalid address in a hypercall.
>> Wouldn't we have a lot of potential issues accepting an address that we cannot check?
> 
> I don't think so: The hypervisor uses copy_to_guest() to protect
> itself from the addresses being invalid at the time of copying.
> If the guest doesn't make sure they're valid at that time, it
> simply won't get the information (perhaps until Xen's next
> attempt to copy it out).
> 
> You may want to take a look at the x86 side of this (also the
> vCPU time updating): Due to the way x86-64 PV guests work, the
> address may legitimately be unmapped at the time Xen wants to
> copy it, when the vCPU is currently executing guest user mode
> code. In such a case the copy gets retried the next time the
> guest transitions from user to kernel mode (which involves a
> page table change).

If I understand everything correctly, the runstate is updated only if there is
a context switch in Xen while the guest is running in kernel mode and
if the address is mapped at that time.

So this is best effort in Xen and the guest cannot really rely on the
runstate information (as it might not be up to date).
Could this somehow have an impact if it is used for scheduling?

In the end the only acceptable trade-offs would be to:
- reduce the error verbosity and just ignore it
- introduce a new hypercall using a physical address
  -> Using a virtual address with restrictions sounds very complex
  to document (current core, no remapping).

But it feels like having only one hypercall using guest physical addresses
would not really be logical, and this kind of change should be made across
all hypercalls if it is done.
 
Bertrand
Roger Pau Monné May 29, 2020, 9:27 a.m. UTC | #14
On Fri, May 29, 2020 at 09:18:42AM +0000, Bertrand Marquis wrote:
> Hi Jan,
> 
> > On 29 May 2020, at 09:45, Jan Beulich <jbeulich@suse.com> wrote:
> > 
> > On 29.05.2020 10:13, Bertrand Marquis wrote:
> >>> On 28 May 2020, at 19:54, Julien Grall <julien@xen.org> wrote:
> >>> AFAICT, there is no restriction on when the runstate hypercall can be called. So this can even be called before the vCPU is brought up.
> >> 
> >> I understand the remark, but it still feels very weird to allow an invalid address in a hypercall.
> >> Wouldn't we have a lot of potential issues accepting an address that we cannot check?
> > 
> > I don't think so: The hypervisor uses copy_to_guest() to protect
> > itself from the addresses to be invalid at the time of copying.
> > If the guest doesn't make sure they're valid at that time, it
> > simply won't get the information (perhaps until Xen's next
> > attempt to copy it out).
> > 
> > You may want to take a look at the x86 side of this (also the
> > vCPU time updating): Due to the way x86-64 PV guests work, the
> > address may legitimately be unmapped at the time Xen wants to
> > copy it, when the vCPU is currently executing guest user mode
> > code. In such a case the copy gets retried the next time the
> > guest transitions from user to kernel mode (which involves a
> > page table change).
> 
> If I understand everything correctly runstate is updated only if there is
> a context switch in xen while the guest is running in kernel mode and
> if the address is mapped at that time.
> 
> So this is a best effort in Xen and the guest cannot really rely on the
> runstate information (as it might not be up to date).
> Could this have impacts somehow if this is used for scheduling ?
> 
> In the end the only accepted trade off would be to:
> - reduce error verbosity and just ignore it
> - introduce a new system call using a physical address
>   -> Using a virtual address with restrictions sounds very complex
>   to document (current core, no remapping).
> 
> But it feels like having only one hypercall using guest physical addresses
> would not really be logic and this kind of change should be made across
> all hypercalls if it is done.

FTR, there are other hypercalls using a physical address instead of a
linear one, see VCPUOP_register_vcpu_info for example. It's just a
mixed bag right now, with some hypercalls using a linear address and
some using a physical one.

I think introducing a new hypercall that uses a physical address would
be fine, and then you can add a set of restrictions similar to the
ones listed by VCPUOP_register_vcpu_info.
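
For illustration, registering the runstate area through such a
hypothetical physical-address variant could look like this from the
guest side (the VCPUOP_register_runstate_phys_area name and its struct
are made up here, modeled on VCPUOP_register_vcpu_info):

/* Hypothetical: register the runstate area by guest physical address,
 * mirroring the VCPUOP_register_vcpu_info restrictions (one-shot
 * registration, area must not cross a page boundary). */
struct vcpu_register_runstate_phys_area {
    uint64_t addr;                 /* guest physical address */
};

static int register_runstate_phys(unsigned int vcpu,
                                  struct vcpu_runstate_info *rs)
{
    struct vcpu_register_runstate_phys_area area = {
        .addr = virt_to_phys(rs),  /* guest kernel helper, assumed */
    };

    return HYPERVISOR_vcpu_op(VCPUOP_register_runstate_phys_area,
                              vcpu, &area);
}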

Changing the current hypercall as proposed is risky, but I think the
current behavior is broken by design, especially on auto-translated
guests, even more with XPTI.

Thanks, Roger.
Jan Beulich May 29, 2020, 9:31 a.m. UTC | #15
On 29.05.2020 11:18, Bertrand Marquis wrote:
> Hi Jan,
> 
>> On 29 May 2020, at 09:45, Jan Beulich <jbeulich@suse.com> wrote:
>>
>> On 29.05.2020 10:13, Bertrand Marquis wrote:
>>>> On 28 May 2020, at 19:54, Julien Grall <julien@xen.org> wrote:
>>>> AFAICT, there is no restriction on when the runstate hypercall can be called. So this can even be called before the vCPU is brought up.
>>>
>>> I understand the remark but it still feels very weird to allow an invalid address in an hypercall.
>>> Wouldn’t we have a lot of potential issues accepting an address that we cannot check ?
>>
>> I don't think so: The hypervisor uses copy_to_guest() to protect
>> itself from the addresses to be invalid at the time of copying.
>> If the guest doesn't make sure they're valid at that time, it
>> simply won't get the information (perhaps until Xen's next
>> attempt to copy it out).
>>
>> You may want to take a look at the x86 side of this (also the
>> vCPU time updating): Due to the way x86-64 PV guests work, the
>> address may legitimately be unmapped at the time Xen wants to
>> copy it, when the vCPU is currently executing guest user mode
>> code. In such a case the copy gets retried the next time the
>> guest transitions from user to kernel mode (which involves a
>> page table change).
> 
> If I understand everything correctly runstate is updated only if there is
> a context switch in xen while the guest is running in kernel mode and
> if the address is mapped at that time.
> 
> So this is a best effort in Xen and the guest cannot really rely on the
> runstate information (as it might not be up to date).
> Could this have impacts somehow if this is used for scheduling ?

Yes, it could, and hence it's not really best effort only; the
retry upon guest mode switch was added years ago for exactly this
reason. (This was one of the things overlooked when x86-64 support
was introduced, as x86-32 doesn't have this same problem.) The
updating is best-effort only as far as a misbehaving guest goes;
in all other aspects it's reliable, assuming that vCPUs only
look at their own data for the purpose of making decisions
(logging and the like are of course still fine, as long as people
are aware of the data possibly being stale).

> In the end the only accepted trade off would be to:
> - reduce error verbosity and just ignore it
> - introduce a new system call using a physical address
>   -> Using a virtual address with restrictions sounds very complex
>   to document (current core, no remapping).
> 
> But it feels like having only one hypercall using guest physical addresses
> would not really be logic and this kind of change should be made across
> all hypercalls if it is done.

Hence my request to preferably first settle on a model, such
that the change here could be simply the first step on that
journey.

Jan
Julien Grall May 29, 2020, 9:43 a.m. UTC | #16
Hi Bertrand,

On 29/05/2020 09:13, Bertrand Marquis wrote:
> Hi Julien,
> 
>> On 28 May 2020, at 19:54, Julien Grall <julien@xen.org> wrote:
>>
>> Hi Bertrand,
>>
>> Thank you for the patch.
>>
>> On 28/05/2020 16:25, Bertrand Marquis wrote:
>>> At the moment on Arm, a Linux guest running with KTPI enabled will
>>> cause the following error when a context switch happens in user mode:
>>> (XEN) p2m.c:1890: d1v0: Failed to walk page-table va 0xffffff837ebe0cd0
>>> This patch is modifying runstate handling to map the area given by the
>>> guest inside Xen during the hypercall.
>>> This is removing the guest virtual to physical conversion during context
>>> switches which removes the bug
>>
>> It would be good to spell out that a virtual address is not stable. So relying on it is wrong.
>>
>>> and improve performance by preventing to
>>> walk page tables during context switches.
>>
>> With Secret free hypervisor in mind, I would like to suggest to map/unmap the runstate during context switch.
>>
>> The cost should be minimal when there is a direct map (i.e on Arm64 and x86) and still provide better performance on Arm32.
> 
> Even with a minimal cost this is still adding some non real-time behaviour to the context switch.

Just to be clear, by minimal I meant the mapping part is just a 
virt_to_mfn() call and the unmapping is a NOP.

IMHO, if virt_to_mfn() ends up adding non-RT behavior then you have a 
much bigger problem than just this call.

> But definitely from the security point of view as we have to map a page from the guest, we could have accessible in Xen some data that should not be there.
> There is a trade here where:
> - xen can protect by map/unmapping
> - a guest which wants to secure his data should either not use it or make sure there is nothing else in the page

Both are valid and depend on your setup. One may want to protect all 
the existing guests, so modifying a guest may not be a solution.

> 
> That sounds like a thread local storage kind of problematic where we want data from xen to be accessible fast from the guest and easy to be modified from xen.

Agree. On x86, they have a perdomain mapping so it would be possible to 
do it. We would need to add the feature on Arm.

> 
>>
>> The change should be minimal compare to the current approach but this could be taken care separately if you don't have time.
> 
> I could add that to the serie in a separate patch so that it can be discussed and test separately ?

If you are picking up the task, then I would suggest adding it directly 
in this patch.

> 
>>
>>> --
>>> In the current status, this patch is only working on Arm and needs to
>>> be fixed on X86 (see #error on domain.c for missing get_page_from_gva).
>>> Signed-off-by: Bertrand Marquis <bertrand.marquis@arm.com>
>>> ---
>>>   xen/arch/arm/domain.c   | 32 +++++++++-------
>>>   xen/arch/x86/domain.c   | 51 ++++++++++++++-----------
>>>   xen/common/domain.c     | 84 ++++++++++++++++++++++++++++++++++-------
>>>   xen/include/xen/sched.h | 11 ++++--
>>>   4 files changed, 124 insertions(+), 54 deletions(-)
>>> diff --git a/xen/arch/arm/domain.c b/xen/arch/arm/domain.c
>>> index 31169326b2..799b0e0103 100644
>>> --- a/xen/arch/arm/domain.c
>>> +++ b/xen/arch/arm/domain.c
>>> @@ -278,33 +278,37 @@ static void ctxt_switch_to(struct vcpu *n)
>>>   /* Update per-VCPU guest runstate shared memory area (if registered). */
>>>   static void update_runstate_area(struct vcpu *v)
>>>   {
>>> -    void __user *guest_handle = NULL;
>>> -    struct vcpu_runstate_info runstate;
>>> +    struct vcpu_runstate_info *runstate;
>>>   -    if ( guest_handle_is_null(runstate_guest(v)) )
>>> +    /* XXX why do we accept not to block here */
>>> +    if ( !spin_trylock(&v->runstate_guest_lock) )
>>>           return;
>>>   -    memcpy(&runstate, &v->runstate, sizeof(runstate));
>>> +    runstate = runstate_guest(v);
>>> +
>>> +    if (runstate == NULL)
>>> +    {
>>> +        spin_unlock(&v->runstate_guest_lock);
>>> +        return;
>>> +    }
>>>         if ( VM_ASSIST(v->domain, runstate_update_flag) )
>>>       {
>>> -        guest_handle = &v->runstate_guest.p->state_entry_time + 1;
>>> -        guest_handle--;
>>> -        runstate.state_entry_time |= XEN_RUNSTATE_UPDATE;
>>> -        __raw_copy_to_guest(guest_handle,
>>> -                            (void *)(&runstate.state_entry_time + 1) - 1, 1);
>>> +        runstate->state_entry_time |= XEN_RUNSTATE_UPDATE;
>>>           smp_wmb();
>>
>> Because you set v->runstate.state_entry_time below, the placement of the barrier seems a bit odd.
>>
>> I would suggest to update v->runstate.state_entry_time first and then update runstate->state_entry_time.
> 
> We do want the guest to know when we modify the runstate so we need to make sure the XEN_RUNSTATE_UPDATE is actually set in a visible way before we do the memcpy.
> That’s why the barrier is after.

I think you misunderstood my comment here. I meant to write the 
following code:

v->runstate.state_entry_time = ...
runstate->state_entry_time = ...
smp_wmb()

So it is much clearer that the barrier is because 
runstate->state_entry_time needs to be updated before the memcpy().
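
Spelled out over the whole update path, the suggested ordering would
look like this (a sketch of the protocol, not the final patch):

if ( VM_ASSIST(v->domain, runstate_update_flag) )
{
    v->runstate.state_entry_time |= XEN_RUNSTATE_UPDATE;
    runstate->state_entry_time |= XEN_RUNSTATE_UPDATE;
    smp_wmb();   /* guest must see the flag before any torn data */
}

memcpy(runstate, &v->runstate, sizeof(v->runstate));

if ( VM_ASSIST(v->domain, runstate_update_flag) )
{
    smp_wmb();   /* all data must be visible before clearing the flag */
    v->runstate.state_entry_time &= ~XEN_RUNSTATE_UPDATE;
    runstate->state_entry_time &= ~XEN_RUNSTATE_UPDATE;
}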

>>> +
>>> +#ifdef CONFIG_ARM
>>> +    page = get_page_from_gva(v, area->addr.p, GV2M_WRITE);
>>
>> A guest is allowed to setup the runstate for a different vCPU than the current one. This will lead to get_page_from_gva() to fail as the function cannot yet work with a vCPU other than current.
> 
> If the area is mapped per cpu but isn’t the aim of this to have a way to check other cpus status ?

The aim is to collect how much time a vCPU has been unscheduled. This 
doesn't have to be run from a different vCPU.

Anyway, my point is the hypercall allows it today. If you want to make 
such a restriction, then we need to agree on it and document it.

> 
>>
>> AFAICT, there is no restriction on when the runstate hypercall can be called. So this can even be called before the vCPU is brought up.
> 
> I understand the remark but it still feels very weird to allow an invalid address in an hypercall.
> Wouldn’t we have a lot of potential issues accepting an address that we cannot check ?

Well, that's why you see errors when using KPTI ;). If you use a global 
mapping, then it is not possible to continue without validating the address.

But to do this, you will have to put restrictions on the hypercalls. I 
would be OK to make such a restriction on Arm.

Cheers,
Hongyan Xia May 29, 2020, 9:49 a.m. UTC | #17
On Fri, 2020-05-29 at 08:13 +0000, Bertrand Marquis wrote:
> Hi Julien,
> 
> > On 28 May 2020, at 19:54, Julien Grall <julien@xen.org> wrote:
> > 
> > Hi Bertrand,
> > 
> > Thank you for the patch.
> > 
> > On 28/05/2020 16:25, Bertrand Marquis wrote:
> > > At the moment on Arm, a Linux guest running with KTPI enabled
> > > will
> > > cause the following error when a context switch happens in user
> > > mode:
> > > (XEN) p2m.c:1890: d1v0: Failed to walk page-table va
> > > 0xffffff837ebe0cd0
> > > This patch is modifying runstate handling to map the area given
> > > by the
> > > guest inside Xen during the hypercall.
> > > This is removing the guest virtual to physical conversion during
> > > context
> > > switches which removes the bug
> > 
> > It would be good to spell out that a virtual address is not stable.
> > So relying on it is wrong.
> > 
> > > and improve performance by preventing to
> > > walk page tables during context switches.
> > 
> > With Secret free hypervisor in mind, I would like to suggest to
> > map/unmap the runstate during context switch.
> > 
> > The cost should be minimal when there is a direct map (i.e on Arm64
> > and x86) and still provide better performance on Arm32.
> 
> Even with a minimal cost this is still adding some non real-time
> behaviour to the context switch.
> But definitely from the security point of view as we have to map a
> page from the guest, we could have accessible in Xen some data that
> should not be there.
> There is a trade here where:
> - xen can protect by map/unmapping
> - a guest which wants to secure his data should either not use it or
> make sure there is nothing else in the page
> 
> That sounds like a thread local storage kind of problematic where we
> want data from xen to be accessible fast from the guest and easy to
> be modified from xen.

Can't we just map it into the per-domain region, so that the mapping
and unmapping of runstate is baked into the per-domain region switch
itself during context switch?

Anyway, I am glad to help with secret-free bits if required, although
my knowledge on the Xen Arm side is limited.

Hongyan
Julien Grall May 29, 2020, 10:52 a.m. UTC | #18
Hi,

On 29/05/2020 10:18, Bertrand Marquis wrote:
>> On 29 May 2020, at 09:45, Jan Beulich <jbeulich@suse.com> wrote:
>>
>> On 29.05.2020 10:13, Bertrand Marquis wrote:
>>>> On 28 May 2020, at 19:54, Julien Grall <julien@xen.org> wrote:
>>>> AFAICT, there is no restriction on when the runstate hypercall can be called. So this can even be called before the vCPU is brought up.
>>>
>>> I understand the remark but it still feels very weird to allow an invalid address in an hypercall.
>>> Wouldn’t we have a lot of potential issues accepting an address that we cannot check ?
>>
>> I don't think so: The hypervisor uses copy_to_guest() to protect
>> itself from the addresses to be invalid at the time of copying.
>> If the guest doesn't make sure they're valid at that time, it
>> simply won't get the information (perhaps until Xen's next
>> attempt to copy it out).
>>
>> You may want to take a look at the x86 side of this (also the
>> vCPU time updating): Due to the way x86-64 PV guests work, the
>> address may legitimately be unmapped at the time Xen wants to
>> copy it, when the vCPU is currently executing guest user mode
>> code. In such a case the copy gets retried the next time the
>> guest transitions from user to kernel mode (which involves a
>> page table change).
> 
> If I understand everything correctly runstate is updated only if there is
> a context switch in xen while the guest is running in kernel mode and
> if the address is mapped at that time.
> 
> So this is a best effort in Xen and the guest cannot really rely on the
> runstate information (as it might not be up to date).
> Could this have impacts somehow if this is used for scheduling ?
> 
> In the end the only accepted trade off would be to:
> - reduce error verbosity and just ignore it

The error is already a dprintk(XENLOG_G_DEBUG,...). So you can't really 
do better in terms of verbosity.

But I would still like to keep it as there was some weirdness happening 
also in the non-KPTI case (see [1]).

> - introduce a new system call using a physical address
>    -> Using a virtual address with restrictions sounds very complex
>    to document (current core, no remapping).
> 
> But it feels like having only one hypercall using guest physical addresses
> would not really be logic and this kind of change should be made across
> all hypercalls if it is done.

This is not correct; there are other hypercalls using guest physical 
addresses (for instance, EVTCHNOP_init_control).

At least on Arm, this is the only hypercall that requires keeping the 
virtual address across the hypercall.

For all the other hypercalls, the virtual address is only used for 
buffers. This is still risky, but less so than this one. Converting 
them all is also going to be a major rework.

So I would rather try to fix the most concerning instance now and 
address the rest afterwards.

Cheers,

[1] https://lists.xen.org/archives/html/xen-devel/2017-11/msg00942.html
Julien Grall May 29, 2020, 10:59 a.m. UTC | #19
Hi Jan,

On 29/05/2020 08:35, Jan Beulich wrote:
> On 28.05.2020 20:54, Julien Grall wrote:
>> On 28/05/2020 16:25, Bertrand Marquis wrote:
>>> At the moment on Arm, a Linux guest running with KTPI enabled will
>>> cause the following error when a context switch happens in user mode:
>>> (XEN) p2m.c:1890: d1v0: Failed to walk page-table va 0xffffff837ebe0cd0
>>>
>>> This patch is modifying runstate handling to map the area given by the
>>> guest inside Xen during the hypercall.
>>> This is removing the guest virtual to physical conversion during context
>>> switches which removes the bug
>>
>> It would be good to spell out that a virtual address is not stable. So
>> relying on it is wrong.
> 
> Guests at present are permitted to change the mapping underneath the
> virtual address provided (this may not be the best idea, but the
> interface is like it is).

Well yes, it could point to data used by userspace, so you could 
corrupt a program. That is not very great.

So I would be ready to accept such a restriction, on Arm at least, 
because the KPTI use case is far more concerning than a kernel trying 
to change the location of the runstate in physical memory.

Cheers,
Roger Pau Monné May 29, 2020, 1:09 p.m. UTC | #20
On Fri, May 29, 2020 at 11:59:40AM +0100, Julien Grall wrote:
> Hi Jan,
> 
> On 29/05/2020 08:35, Jan Beulich wrote:
> > On 28.05.2020 20:54, Julien Grall wrote:
> > > On 28/05/2020 16:25, Bertrand Marquis wrote:
> > > > At the moment on Arm, a Linux guest running with KTPI enabled will
> > > > cause the following error when a context switch happens in user mode:
> > > > (XEN) p2m.c:1890: d1v0: Failed to walk page-table va 0xffffff837ebe0cd0
> > > > 
> > > > This patch is modifying runstate handling to map the area given by the
> > > > guest inside Xen during the hypercall.
> > > > This is removing the guest virtual to physical conversion during context
> > > > switches which removes the bug
> > > 
> > > It would be good to spell out that a virtual address is not stable. So
> > > relying on it is wrong.
> > 
> > Guests at present are permitted to change the mapping underneath the
> > virtual address provided (this may not be the best idea, but the
> > interface is like it is).
> 
> Well yes, it could be point to data used by the userpsace. So you could
> corrupt a program. It is not very great.

Yes, that's also my worry with the current hypercall. The current
interface is IMO broken for autotranslated guests, at least in the way
it's currently used by OSes.

Roger.
Roger Pau Monné May 29, 2020, 1:26 p.m. UTC | #21
On Fri, May 29, 2020 at 08:32:51AM +0000, Bertrand Marquis wrote:
> Hi Jan
> 
> > On 29 May 2020, at 08:35, Jan Beulich <jbeulich@suse.com> wrote:
> > 
> > On 28.05.2020 20:54, Julien Grall wrote:
> >> On 28/05/2020 16:25, Bertrand Marquis wrote:
> >>> At the moment on Arm, a Linux guest running with KTPI enabled will
> >>> cause the following error when a context switch happens in user mode:
> >>> (XEN) p2m.c:1890: d1v0: Failed to walk page-table va 0xffffff837ebe0cd0
> >>> 
> >>> This patch is modifying runstate handling to map the area given by the
> >>> guest inside Xen during the hypercall.
> >>> This is removing the guest virtual to physical conversion during context
> >>> switches which removes the bug
> >> 
> >> It would be good to spell out that a virtual address is not stable. So 
> >> relying on it is wrong.
> > 
> > Guests at present are permitted to change the mapping underneath the
> > virtual address provided (this may not be the best idea, but the
> > interface is like it is). Therefore I don't think the present
> > interface can be changed like this. Instead a new interface will need
> > adding which takes a guest physical address instead. (Which, in the
> > end, will merely be one tiny step towards making the hypercall
> > interfaces use guest physical addresses. And it would be nice if an
> > overall concept was hashed out first how that conversion should
> > occur, such that the change here could at least be made fit that
> > planned model. For example, an option might be to retain all present
> > hypercall numbering and simply dedicate a bit in the top level
> > hypercall numbers indicating whether _all_ involved addresses for
> > that operation are physical vs virtual ones.)
> 
> I definitely fully agree that moving to interfaces using physical addresses
> would definitely be better but would need new hypercall numbers (or the
> bit system you suggest) to keep backward compatibility.
> 
> Regarding the change of virtual address, even though this is theoretically
> possible with the current interface I do not really see how this could be
> handled cleanly with KPTI or even without it as this would not be an atomic
> change on the guest side so the only way to cleanly do this would be to
> do an hypercall to remove the address in xen and then recall the hypercall
> to register the new address.
> 
> So the only way to solve the KPTI issue would actually be to create a new
> hypercall and keep the current bug/problem ?

I think you will find it easier to just introduce a new hypercall that
uses a physical address and has a set of restrictions similar to
those of VCPUOP_register_vcpu_info, for example, than to bend the
current hypercall into doing something sane.

TBH I would just remove the error message on Arm from the current
hypercall; I don't think it's useful. If there's corruption caused by
the hypercall we could always make it a noop and simply update the
runstate area only once at registration and leave it like that. The
guest should check the timestamp in the data and realize the
information is stale.
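
On the guest side, that check would be a seqlock-style read of the
area, along these lines (a sketch; XEN_RUNSTATE_UPDATE is the flag
defined in the public vcpu.h interface):

static void runstate_snapshot(const struct vcpu_runstate_info *rs,
                              struct vcpu_runstate_info *snap)
{
    uint64_t t;

    do {
        t = rs->state_entry_time;
        smp_rmb();
        *snap = *rs;     /* may be torn while Xen updates the area */
        smp_rmb();
    } while ( (t & XEN_RUNSTATE_UPDATE) || t != rs->state_entry_time );
}

/*
 * If Xen stopped updating the area, state_entry_time goes stale; the
 * caller would compare it against the current system time before
 * trusting the snapshot.
 */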

Roger.
Julien Grall May 29, 2020, 1:37 p.m. UTC | #22
Hi,

On 29/05/2020 14:26, Roger Pau Monné wrote:
> TBH I would just remove the error message on Arm from the current
> hypercall, I don't think it's useful.
The message is part of the helper get_page_from_gva(), which is also 
used by copy_{to, from}_guest. While it may not be useful in the context 
of the runstate, it was introduced because there was some other weird 
bug happening before KPTI even existed (see [1]). I haven't yet managed 
to get to the bottom of the issue.

So I would still very much like to keep the message in place, although 
we could reduce the number of cases where it happens based on the 
fault type.

Note this is a dprintk(XENLOG_G_DEBUG,...), so this logging is only 
present in debug builds.

Cheers,

[1] https://lists.xen.org/archives/html/xen-devel/2017-11/msg00942.html
Bertrand Marquis May 29, 2020, 1:53 p.m. UTC | #23
> On 29 May 2020, at 10:27, Roger Pau Monné <roger@xen.org> wrote:
> 
> On Fri, May 29, 2020 at 09:18:42AM +0000, Bertrand Marquis wrote:
>> Hi Jan,
>> 
>>> On 29 May 2020, at 09:45, Jan Beulich <jbeulich@suse.com> wrote:
>>> 
>>> On 29.05.2020 10:13, Bertrand Marquis wrote:
>>>>> On 28 May 2020, at 19:54, Julien Grall <julien@xen.org> wrote:
>>>>> AFAICT, there is no restriction on when the runstate hypercall can be called. So this can even be called before the vCPU is brought up.
>>>> 
>>>> I understand the remark but it still feels very weird to allow an invalid address in an hypercall.
>>>> Wouldn’t we have a lot of potential issues accepting an address that we cannot check ?
>>> 
>>> I don't think so: The hypervisor uses copy_to_guest() to protect
>>> itself from the addresses to be invalid at the time of copying.
>>> If the guest doesn't make sure they're valid at that time, it
>>> simply won't get the information (perhaps until Xen's next
>>> attempt to copy it out).
>>> 
>>> You may want to take a look at the x86 side of this (also the
>>> vCPU time updating): Due to the way x86-64 PV guests work, the
>>> address may legitimately be unmapped at the time Xen wants to
>>> copy it, when the vCPU is currently executing guest user mode
>>> code. In such a case the copy gets retried the next time the
>>> guest transitions from user to kernel mode (which involves a
>>> page table change).
>> 
>> If I understand everything correctly runstate is updated only if there is
>> a context switch in xen while the guest is running in kernel mode and
>> if the address is mapped at that time.
>> 
>> So this is a best effort in Xen and the guest cannot really rely on the
>> runstate information (as it might not be up to date).
>> Could this have impacts somehow if this is used for scheduling ?
>> 
>> In the end the only accepted trade off would be to:
>> - reduce error verbosity and just ignore it
>> - introduce a new system call using a physical address
>>  -> Using a virtual address with restrictions sounds very complex
>>  to document (current core, no remapping).
>> 
>> But it feels like having only one hypercall using guest physical addresses
>> would not really be logic and this kind of change should be made across
>> all hypercalls if it is done.
> 
> FRT, there are other hypercalls using a physical address instead of a
> linear one, see VCPUOP_register_vcpu_info for example. It's just a
> mixed bag right now, with some hypercalls using a linear address and
> some using a physical one.
> 
> I think introducing a new hypercall that uses a physical address would
> be fine, and then you can add a set of restrictions similar to the
> ones listed by VCPUOP_register_vcpu_info.

Yes, I found that, and in fact I also wondered why the runstate was not included in vcpu_info.

> 
> Changing the current hypercall as proposed is risky, but I think the
> current behavior is broken by design specially on auto translated
> guests, even more with XPTI.
> 
> Thanks, Roger.
Bertrand Marquis May 29, 2020, 2:02 p.m. UTC | #24
> On 29 May 2020, at 10:43, Julien Grall <julien@xen.org> wrote:
> 
> Hi Bertrand,
> 
> On 29/05/2020 09:13, Bertrand Marquis wrote:
>> Hi Julien,
>>> On 28 May 2020, at 19:54, Julien Grall <julien@xen.org> wrote:
>>> 
>>> Hi Bertrand,
>>> 
>>> Thank you for the patch.
>>> 
>>> On 28/05/2020 16:25, Bertrand Marquis wrote:
>>>> At the moment on Arm, a Linux guest running with KTPI enabled will
>>>> cause the following error when a context switch happens in user mode:
>>>> (XEN) p2m.c:1890: d1v0: Failed to walk page-table va 0xffffff837ebe0cd0
>>>> This patch is modifying runstate handling to map the area given by the
>>>> guest inside Xen during the hypercall.
>>>> This is removing the guest virtual to physical conversion during context
>>>> switches which removes the bug
>>> 
>>> It would be good to spell out that a virtual address is not stable. So relying on it is wrong.
>>> 
>>>> and improve performance by preventing to
>>>> walk page tables during context switches.
>>> 
>>> With Secret free hypervisor in mind, I would like to suggest to map/unmap the runstate during context switch.
>>> 
>>> The cost should be minimal when there is a direct map (i.e on Arm64 and x86) and still provide better performance on Arm32.
>> Even with a minimal cost this is still adding some non real-time behaviour to the context switch.
> 
> Just to be clear, by minimal I meant the mapping part is just a virt_to_mfn() call and the unmapping is a NOP.
> 
> IHMO, if virt_to_mfn() ends up to add non-RT behavior then you have much bigger problem than just this call.
> 
>> But definitely from the security point of view as we have to map a page from the guest, we could have accessible in Xen some data that should not be there.
>> There is a trade here where:
>> - xen can protect by map/unmapping
>> - a guest which wants to secure his data should either not use it or make sure there is nothing else in the page
> 
> Both are valid and depends on your setup. One may want to protect all the existing guests, so modifying a guest may not be a solution.

Mapping/unmapping increases the protection but does not remove the problem completely.

> 
>> That sounds like a thread local storage kind of problematic where we want data from xen to be accessible fast from the guest and easy to be modified from xen.
> 
> Agree. On x86, they have a perdomain mapping so it would be possible to do it. We would need to add the feature on Arm.

That would definitely be cleaner.

> 
>>> 
>>> The change should be minimal compare to the current approach but this could be taken care separately if you don't have time.
>> I could add that to the serie in a separate patch so that it can be discussed and test separately ?
> 
> If you are picking the task, then I would suggest to add it directly in this patch.

I will look into that, but the discussion seems to be heading toward accepting the current status.

> 
>>> 
>>>> --
>>>> In the current status, this patch is only working on Arm and needs to
>>>> be fixed on X86 (see #error on domain.c for missing get_page_from_gva).
>>>> Signed-off-by: Bertrand Marquis <bertrand.marquis@arm.com>
>>>> ---
>>>>  xen/arch/arm/domain.c   | 32 +++++++++-------
>>>>  xen/arch/x86/domain.c   | 51 ++++++++++++++-----------
>>>>  xen/common/domain.c     | 84 ++++++++++++++++++++++++++++++++++-------
>>>>  xen/include/xen/sched.h | 11 ++++--
>>>>  4 files changed, 124 insertions(+), 54 deletions(-)
>>>> diff --git a/xen/arch/arm/domain.c b/xen/arch/arm/domain.c
>>>> index 31169326b2..799b0e0103 100644
>>>> --- a/xen/arch/arm/domain.c
>>>> +++ b/xen/arch/arm/domain.c
>>>> @@ -278,33 +278,37 @@ static void ctxt_switch_to(struct vcpu *n)
>>>>  /* Update per-VCPU guest runstate shared memory area (if registered). */
>>>>  static void update_runstate_area(struct vcpu *v)
>>>>  {
>>>> -    void __user *guest_handle = NULL;
>>>> -    struct vcpu_runstate_info runstate;
>>>> +    struct vcpu_runstate_info *runstate;
>>>>  -    if ( guest_handle_is_null(runstate_guest(v)) )
>>>> +    /* XXX why do we accept not to block here */
>>>> +    if ( !spin_trylock(&v->runstate_guest_lock) )
>>>>          return;
>>>>  -    memcpy(&runstate, &v->runstate, sizeof(runstate));
>>>> +    runstate = runstate_guest(v);
>>>> +
>>>> +    if (runstate == NULL)
>>>> +    {
>>>> +        spin_unlock(&v->runstate_guest_lock);
>>>> +        return;
>>>> +    }
>>>>        if ( VM_ASSIST(v->domain, runstate_update_flag) )
>>>>      {
>>>> -        guest_handle = &v->runstate_guest.p->state_entry_time + 1;
>>>> -        guest_handle--;
>>>> -        runstate.state_entry_time |= XEN_RUNSTATE_UPDATE;
>>>> -        __raw_copy_to_guest(guest_handle,
>>>> -                            (void *)(&runstate.state_entry_time + 1) - 1, 1);
>>>> +        runstate->state_entry_time |= XEN_RUNSTATE_UPDATE;
>>>>          smp_wmb();
>>> 
>>> Because you set v->runstate.state_entry_time below, the placement of the barrier seems a bit odd.
>>> 
>>> I would suggest to update v->runstate.state_entry_time first and then update runstate->state_entry_time.
>> We do want the guest to know when we modify the runstate so we need to make sure the XEN_RUNSTATE_UPDATE is actually set in a visible way before we do the memcpy.
>> That’s why the barrier is after.
> 
> I think you misunderstood my comment here. I meant to write the following code:
> 
> v->runstate.state_entry_time = ...
> runstate->state_entry_time = ...
> smp_wmb()
> 
> So it is much clearer that the barrier is because runstate->state_entry_time needs to be updated before the memcpy().
> 
>>>> +
>>>> +#ifdef CONFIG_ARM
>>>> +    page = get_page_from_gva(v, area->addr.p, GV2M_WRITE);
>>> 
>>> A guest is allowed to setup the runstate for a different vCPU than the current one. This will lead to get_page_from_gva() to fail as the function cannot yet work with a vCPU other than current.
>> If the area is mapped per cpu but isn’t the aim of this to have a way to check other cpus status ?
> 
> The aim is to collect how much time a vCPU has been unscheduled. This doesn't have to be run from a different vCPU.
> 
> Anyway, my point is the hypercall allows it today. If you want to make such restriction, then we need to agree on it and document it.
> 
>>> 
>>> AFAICT, there is no restriction on when the runstate hypercall can be called. So this can even be called before the vCPU is brought up.
>> I understand the remark but it still feels very weird to allow an invalid address in an hypercall.
>> Wouldn’t we have a lot of potential issues accepting an address that we cannot check ?
> 
> Well, that's why you see errors when using KPTI ;). If you use a global mapping, then it is not possible to continue without validating the address.
> 
> But to do this, you will have to put restriction on the hypercalls. I would be OK to make such restriction on Arm.
> 
> Cheers,
> 
> -- 
> Julien Grall
Julien Grall May 29, 2020, 2:15 p.m. UTC | #25
On 29/05/2020 15:02, Bertrand Marquis wrote:
> 
> 
>> On 29 May 2020, at 10:43, Julien Grall <julien@xen.org> wrote:
>>
>> Hi Bertrand,
>>
>> On 29/05/2020 09:13, Bertrand Marquis wrote:
>>> Hi Julien,
>>>> On 28 May 2020, at 19:54, Julien Grall <julien@xen.org> wrote:
>>>>
>>>> Hi Bertrand,
>>>>
>>>> Thank you for the patch.
>>>>
>>>> On 28/05/2020 16:25, Bertrand Marquis wrote:
>>>>> At the moment on Arm, a Linux guest running with KTPI enabled will
>>>>> cause the following error when a context switch happens in user mode:
>>>>> (XEN) p2m.c:1890: d1v0: Failed to walk page-table va 0xffffff837ebe0cd0
>>>>> This patch is modifying runstate handling to map the area given by the
>>>>> guest inside Xen during the hypercall.
>>>>> This is removing the guest virtual to physical conversion during context
>>>>> switches which removes the bug
>>>>
>>>> It would be good to spell out that a virtual address is not stable. So relying on it is wrong.
>>>>
>>>>> and improve performance by preventing to
>>>>> walk page tables during context switches.
>>>>
>>>> With Secret free hypervisor in mind, I would like to suggest to map/unmap the runstate during context switch.
>>>>
>>>> The cost should be minimal when there is a direct map (i.e on Arm64 and x86) and still provide better performance on Arm32.
>>> Even with a minimal cost this is still adding some non real-time behaviour to the context switch.
>>
>> Just to be clear, by minimal I meant the mapping part is just a virt_to_mfn() call and the unmapping is a NOP.
>>
>> IHMO, if virt_to_mfn() ends up to add non-RT behavior then you have much bigger problem than just this call.
>>
>>> But definitely from the security point of view as we have to map a page from the guest, we could have accessible in Xen some data that should not be there.
>>> There is a trade here where:
>>> - xen can protect by map/unmapping
>>> - a guest which wants to secure his data should either not use it or make sure there is nothing else in the page
>>
>> Both are valid and depends on your setup. One may want to protect all the existing guests, so modifying a guest may not be a solution.
> 
> The fact to map/unmap is increasing the protection but not removing the problem completely.

I would be curious to understand why the problem is not completely removed.

From my perspective, this covers the case where Xen could leak one 
domain's information to another domain. When there is no direct 
mapping, temporary mappings via map_domain_page() will be either 
per-pCPU (or per-vCPU). So the content should never be (easily) 
accessible by another running domain while it is mapped.

If the guest is concerned about exposing the data to Xen, then it is a 
completely different issue and should be taken care of by the guest itself.
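
As a sketch of that variant, the context-switch path would only ever
hold a transient, per-pCPU mapping (the runstate_guest_mfn and
runstate_guest_offset fields are assumptions; with a direct map,
map_domain_page() is plain arithmetic and unmap_domain_page() a NOP):

static void update_runstate_area(struct vcpu *v)
{
    struct vcpu_runstate_info *runstate;

    if ( mfn_eq(v->runstate_guest_mfn, INVALID_MFN) ) /* assumed field */
        return;

    /* Transient per-pCPU mapping: never visible to other domains. */
    runstate = (void *)((unsigned long)
                        map_domain_page(v->runstate_guest_mfn) +
                        v->runstate_guest_offset);    /* assumed field */

    memcpy(runstate, &v->runstate, sizeof(v->runstate));

    unmap_domain_page(runstate);
}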

Cheers,
Bertrand Marquis May 29, 2020, 2:21 p.m. UTC | #26
> On 29 May 2020, at 15:15, Julien Grall <julien@xen.org> wrote:
> 
> 
> 
> On 29/05/2020 15:02, Bertrand Marquis wrote:
>>> On 29 May 2020, at 10:43, Julien Grall <julien@xen.org> wrote:
>>> 
>>> Hi Bertrand,
>>> 
>>> On 29/05/2020 09:13, Bertrand Marquis wrote:
>>>> Hi Julien,
>>>>> On 28 May 2020, at 19:54, Julien Grall <julien@xen.org> wrote:
>>>>> 
>>>>> Hi Bertrand,
>>>>> 
>>>>> Thank you for the patch.
>>>>> 
>>>>> On 28/05/2020 16:25, Bertrand Marquis wrote:
>>>>>> At the moment on Arm, a Linux guest running with KTPI enabled will
>>>>>> cause the following error when a context switch happens in user mode:
>>>>>> (XEN) p2m.c:1890: d1v0: Failed to walk page-table va 0xffffff837ebe0cd0
>>>>>> This patch is modifying runstate handling to map the area given by the
>>>>>> guest inside Xen during the hypercall.
>>>>>> This is removing the guest virtual to physical conversion during context
>>>>>> switches which removes the bug
>>>>> 
>>>>> It would be good to spell out that a virtual address is not stable. So relying on it is wrong.
>>>>> 
>>>>>> and improve performance by preventing to
>>>>>> walk page tables during context switches.
>>>>> 
>>>>> With Secret free hypervisor in mind, I would like to suggest to map/unmap the runstate during context switch.
>>>>> 
>>>>> The cost should be minimal when there is a direct map (i.e on Arm64 and x86) and still provide better performance on Arm32.
>>>> Even with a minimal cost this is still adding some non real-time behaviour to the context switch.
>>> 
>>> Just to be clear, by minimal I meant the mapping part is just a virt_to_mfn() call and the unmapping is a NOP.
>>> 
>>> IHMO, if virt_to_mfn() ends up to add non-RT behavior then you have much bigger problem than just this call.
>>> 
>>>> But definitely from the security point of view as we have to map a page from the guest, we could have accessible in Xen some data that should not be there.
>>>> There is a trade here where:
>>>> - xen can protect by map/unmapping
>>>> - a guest which wants to secure his data should either not use it or make sure there is nothing else in the page
>>> 
>>> Both are valid and depends on your setup. One may want to protect all the existing guests, so modifying a guest may not be a solution.
>> The fact to map/unmap is increasing the protection but not removing the problem completely.
> 
> I would be curious to understand why the problem is not completely removed.
> 
> From my perspective, this covers the case where Xen could leak the information of one domain to another domain. When there is no direct mapping, temporary mappings via domain_map_page() will be either per-pCPU (or per-vCPU). So the content should never be (easily) accessible by another running domain while it is mapped.
> 
> If the guest is concerned about exposing the data to Xen, then it is a completely different issue and should be taken care by the guest iself.

Even temporarily mapped, you still have access to more than you need, so you could potentially modify something from the guest at that point (from an interrupt handler, for example).
The attack surface is reduced a lot, I agree, but if the guest does not make sure that nothing else is in the page, you can potentially still get access.

Cheers
Bertrand
Roger Pau Monné May 29, 2020, 2:36 p.m. UTC | #27
On Fri, May 29, 2020 at 02:37:24PM +0100, Julien Grall wrote:
> Hi,
> 
> On 29/05/2020 14:26, Roger Pau Monné wrote:
> > TBH I would just remove the error message on Arm from the current
> > hypercall, I don't think it's useful.
> The message is part of the helpers get_page_from_gva() which is also used by
> copy_{to, from}_guest. While it may not be useful in the context of the
> runstate, it was introduced because there was some other weird bug happening
> before KPTI even existed (see [1]). I haven't yet managed to find the bottom
> line of the issue.
> 
> So I would still very much like to keep the message in place. Although we
> could reduce the number of cases where this is hapenning based on the fault.

Ack, I somehow wrongly assumed the message was explicitly about
the runstate area, not a generic one in the helpers.

Roger.
diff mbox series

Patch

diff --git a/xen/arch/arm/domain.c b/xen/arch/arm/domain.c
index 31169326b2..799b0e0103 100644
--- a/xen/arch/arm/domain.c
+++ b/xen/arch/arm/domain.c
@@ -278,33 +278,37 @@  static void ctxt_switch_to(struct vcpu *n)
 /* Update per-VCPU guest runstate shared memory area (if registered). */
 static void update_runstate_area(struct vcpu *v)
 {
-    void __user *guest_handle = NULL;
-    struct vcpu_runstate_info runstate;
+    struct vcpu_runstate_info *runstate;
 
-    if ( guest_handle_is_null(runstate_guest(v)) )
+    /* XXX why do we accept not to block here */
+    if ( !spin_trylock(&v->runstate_guest_lock) )
         return;
 
-    memcpy(&runstate, &v->runstate, sizeof(runstate));
+    runstate = runstate_guest(v);
+
+    if (runstate == NULL)
+    {
+        spin_unlock(&v->runstate_guest_lock);
+        return;
+    }
 
     if ( VM_ASSIST(v->domain, runstate_update_flag) )
     {
-        guest_handle = &v->runstate_guest.p->state_entry_time + 1;
-        guest_handle--;
-        runstate.state_entry_time |= XEN_RUNSTATE_UPDATE;
-        __raw_copy_to_guest(guest_handle,
-                            (void *)(&runstate.state_entry_time + 1) - 1, 1);
+        runstate->state_entry_time |= XEN_RUNSTATE_UPDATE;
         smp_wmb();
+        v->runstate.state_entry_time |= XEN_RUNSTATE_UPDATE;
     }
 
-    __copy_to_guest(runstate_guest(v), &runstate, 1);
+    memcpy(runstate, &v->runstate, sizeof(v->runstate));
 
-    if ( guest_handle )
+    if ( VM_ASSIST(v->domain, runstate_update_flag) )
     {
-        runstate.state_entry_time &= ~XEN_RUNSTATE_UPDATE;
+        runstate->state_entry_time &= ~XEN_RUNSTATE_UPDATE;
         smp_wmb();
-        __raw_copy_to_guest(guest_handle,
-                            (void *)(&runstate.state_entry_time + 1) - 1, 1);
+        v->runstate.state_entry_time &= ~XEN_RUNSTATE_UPDATE;
     }
+
+    spin_unlock(&v->runstate_guest_lock);
 }
 
 static void schedule_tail(struct vcpu *prev)
diff --git a/xen/arch/x86/domain.c b/xen/arch/x86/domain.c
index 6327ba0790..10c6ceb561 100644
--- a/xen/arch/x86/domain.c
+++ b/xen/arch/x86/domain.c
@@ -1642,57 +1642,62 @@  void paravirt_ctxt_switch_to(struct vcpu *v)
 /* Update per-VCPU guest runstate shared memory area (if registered). */
 bool update_runstate_area(struct vcpu *v)
 {
-    bool rc;
     struct guest_memory_policy policy = { .nested_guest_mode = false };
-    void __user *guest_handle = NULL;
-    struct vcpu_runstate_info runstate;
+    struct vcpu_runstate_info *runstate;
 
-    if ( guest_handle_is_null(runstate_guest(v)) )
+    /* XXX: should we return false ? */
+    if ( !spin_trylock(&v->runstate_guest_lock) )
         return true;
 
-    update_guest_memory_policy(v, &policy);
+    runstate = runstate_guest(v);
 
-    memcpy(&runstate, &v->runstate, sizeof(runstate));
+    if (runstate == NULL)
+    {
+        spin_unlock(&v->runstate_guest_lock);
+        return true;
+    }
+
+    update_guest_memory_policy(v, &policy);
 
     if ( VM_ASSIST(v->domain, runstate_update_flag) )
     {
-        guest_handle = has_32bit_shinfo(v->domain)
-            ? &v->runstate_guest.compat.p->state_entry_time + 1
-            : &v->runstate_guest.native.p->state_entry_time + 1;
-        guest_handle--;
-        runstate.state_entry_time |= XEN_RUNSTATE_UPDATE;
-        __raw_copy_to_guest(guest_handle,
-                            (void *)(&runstate.state_entry_time + 1) - 1, 1);
+        runstate->state_entry_time |= XEN_RUNSTATE_UPDATE;
         smp_wmb();
+        if (has_32bit_shinfo(v->domain))
+            v->runstate_guest.compat->state_entry_time |= XEN_RUNSTATE_UPDATE;
+        else
+            v->runstate_guest.native->state_entry_time |= XEN_RUNSTATE_UPDATE;
     }
 
     if ( has_32bit_shinfo(v->domain) )
     {
         struct compat_vcpu_runstate_info info;
-
         XLAT_vcpu_runstate_info(&info, &runstate);
-        __copy_to_guest(v->runstate_guest.compat, &info, 1);
-        rc = true;
+        memcpy(v->runstate_guest.compat, &info, sizeof(info));
     }
     else
-        rc = __copy_to_guest(runstate_guest(v), &runstate, 1) !=
-             sizeof(runstate);
+        memcpy(runstate, &v->runstate, sizeof(v->runstate));
 
-    if ( guest_handle )
+    if ( VM_ASSIST(v->domain, runstate_update_flag) )
     {
-        runstate.state_entry_time &= ~XEN_RUNSTATE_UPDATE;
+        runstate->state_entry_time &= ~XEN_RUNSTATE_UPDATE;
         smp_wmb();
-        __raw_copy_to_guest(guest_handle,
-                            (void *)(&runstate.state_entry_time + 1) - 1, 1);
+        if (has_32bit_shinfo(v->domain))
+            v->runstate_guest.compat->state_entry_time &= ~XEN_RUNSTATE_UPDATE;
+        else
+            v->runstate_guest.native->state_entry_time &= ~XEN_RUNSTATE_UPDATE;
     }
 
+    spin_unlock(&v->runstate_guest_lock);
+
     update_guest_memory_policy(v, &policy);
 
-    return rc;
+    return true;
 }
 
 static void _update_runstate_area(struct vcpu *v)
 {
+    /* XXX: this should be removed */
     if ( !update_runstate_area(v) && is_pv_vcpu(v) &&
          !(v->arch.flags & TF_kernel_mode) )
         v->arch.pv.need_update_runstate_area = 1;
diff --git a/xen/common/domain.c b/xen/common/domain.c
index 7cc9526139..acc6f90ba3 100644
--- a/xen/common/domain.c
+++ b/xen/common/domain.c
@@ -161,6 +161,7 @@  struct vcpu *vcpu_create(struct domain *d, unsigned int vcpu_id)
     v->dirty_cpu = VCPU_CPU_CLEAN;
 
     spin_lock_init(&v->virq_lock);
+    spin_lock_init(&v->runstate_guest_lock);
 
     tasklet_init(&v->continue_hypercall_tasklet, NULL, NULL);
 
@@ -691,6 +692,66 @@  int rcu_lock_live_remote_domain_by_id(domid_t dom, struct domain **d)
     return 0;
 }
 
+static void  unmap_runstate_area(struct vcpu *v, unsigned int lock)
+{
+    mfn_t mfn;
+
+    if ( ! runstate_guest(v) )
+        return;
+
+    if (lock)
+        spin_lock(&v->runstate_guest_lock);
+
+    mfn = domain_page_map_to_mfn(runstate_guest(v));
+
+    unmap_domain_page_global((void *)
+                            ((unsigned long)v->runstate_guest &
+                             PAGE_MASK));
+
+    put_page_and_type(mfn_to_page(mfn));
+    runstate_guest(v) = NULL;
+
+    if (lock)
+        spin_unlock(&v->runstate_guest_lock);
+}
+
+static int map_runstate_area(struct vcpu *v,
+                    struct vcpu_register_runstate_memory_area *area)
+{
+    unsigned long offset = area->addr.p & ~PAGE_MASK;
+    void *mapping;
+    struct page_info *page;
+    size_t size = sizeof(struct vcpu_runstate_info);
+
+    ASSERT(runstate_guest(v) == NULL);
+
+    /* do not allow an area crossing 2 pages */
+    if ( offset > (PAGE_SIZE - size) )
+        return -EINVAL;
+
+#ifdef CONFIG_ARM
+    page = get_page_from_gva(v, area->addr.p, GV2M_WRITE);
+#else
+    /* XXX how to solve this one ? */
+#error get_page_from_gva is not available on other archs
+#endif
+    if ( !page )
+        return -EINVAL;
+
+    mapping = __map_domain_page_global(page);
+
+    if ( mapping == NULL )
+    {
+        put_page_and_type(page);
+        return -ENOMEM;
+    }
+
+    runstate_guest(v) = (struct vcpu_runstate_info *)
+        ((unsigned long)mapping + offset);
+
+    return 0;
+}
+
 int domain_kill(struct domain *d)
 {
     int rc = 0;
@@ -727,7 +788,10 @@  int domain_kill(struct domain *d)
         if ( cpupool_move_domain(d, cpupool0) )
             return -ERESTART;
         for_each_vcpu ( d, v )
+        {
+            unmap_runstate_area(v, 0);
             unmap_vcpu_info(v);
+        }
         d->is_dying = DOMDYING_dead;
         /* Mem event cleanup has to go here because the rings 
          * have to be put before we call put_domain. */
@@ -1167,7 +1231,7 @@  int domain_soft_reset(struct domain *d)
 
     for_each_vcpu ( d, v )
     {
-        set_xen_guest_handle(runstate_guest(v), NULL);
+        unmap_runstate_area(v, 1);
         unmap_vcpu_info(v);
     }
 
@@ -1484,7 +1548,6 @@  long do_vcpu_op(int cmd, unsigned int vcpuid, XEN_GUEST_HANDLE_PARAM(void) arg)
     case VCPUOP_register_runstate_memory_area:
     {
         struct vcpu_register_runstate_memory_area area;
-        struct vcpu_runstate_info runstate;
 
         rc = -EFAULT;
         if ( copy_from_guest(&area, arg, 1) )
@@ -1493,18 +1556,13 @@  long do_vcpu_op(int cmd, unsigned int vcpuid, XEN_GUEST_HANDLE_PARAM(void) arg)
         if ( !guest_handle_okay(area.addr.h, 1) )
             break;
 
-        rc = 0;
-        runstate_guest(v) = area.addr.h;
+        spin_lock(&v->runstate_guest_lock);
 
-        if ( v == current )
-        {
-            __copy_to_guest(runstate_guest(v), &v->runstate, 1);
-        }
-        else
-        {
-            vcpu_runstate_get(v, &runstate);
-            __copy_to_guest(runstate_guest(v), &runstate, 1);
-        }
+        unmap_runstate_area(v, 0);
+
+        rc = map_runstate_area(v, &area);
+
+        spin_unlock(&v->runstate_guest_lock);
 
         break;
     }
diff --git a/xen/include/xen/sched.h b/xen/include/xen/sched.h
index ac53519d7f..ab87182e81 100644
--- a/xen/include/xen/sched.h
+++ b/xen/include/xen/sched.h
@@ -166,15 +166,18 @@  struct vcpu
     struct sched_unit *sched_unit;
 
     struct vcpu_runstate_info runstate;
+
+    spinlock_t      runstate_guest_lock;
+
 #ifndef CONFIG_COMPAT
 # define runstate_guest(v) ((v)->runstate_guest)
-    XEN_GUEST_HANDLE(vcpu_runstate_info_t) runstate_guest; /* guest address */
+    vcpu_runstate_info_t *runstate_guest; /* mapped address of guest copy */
 #else
 # define runstate_guest(v) ((v)->runstate_guest.native)
     union {
-        XEN_GUEST_HANDLE(vcpu_runstate_info_t) native;
-        XEN_GUEST_HANDLE(vcpu_runstate_info_compat_t) compat;
-    } runstate_guest; /* guest address */
+        vcpu_runstate_info_t *native;
+        vcpu_runstate_info_compat_t *compat;
+    } runstate_guest; /* mapped address of guest copy */
 #endif
     unsigned int     new_state;