diff mbox

[RFC,6/9] x86/SVM: Add AVIC vmexit handlers

Message ID 1474264368-4104-7-git-send-email-suravee.suthikulpanit@amd.com (mailing list archive)
State New, archived
Headers show

Commit Message

Suravee Suthikulpanit Sept. 19, 2016, 5:52 a.m. UTC
AVIC introduces two #vmexit handlers:
  * VMEXIT_INCOMP_IPI
  * VMEXIT_DO_NOACCEL

Signed-off-by: Suravee Suthikulpanit <suravee.suthikulpanit@amd.com>
---
 xen/arch/x86/hvm/svm/avic.c        | 279 +++++++++++++++++++++++++++++++++++++
 xen/arch/x86/hvm/svm/svm.c         |   8 ++
 xen/include/asm-x86/hvm/svm/avic.h |   3 +
 xen/include/asm-x86/hvm/svm/vmcb.h |   2 +
 4 files changed, 292 insertions(+)

Comments

Konrad Rzeszutek Wilk Oct. 14, 2016, 3:20 p.m. UTC | #1
On Mon, Sep 19, 2016 at 12:52:45AM -0500, Suravee Suthikulpanit wrote:
> AVIC introduces two #vmexit handlers:
>   * VMEXIT_INCOMP_IPI
>   * VMEXIT_DO_NOACCEL

Great.. Can you describe what you are supposed to do with them?

Please keep in mind that the point of the commit description is to
say something about the behavior or what not.

You can also just point to the AMD manual, but it would be better
if you included some explanation of why you wrote the code this way
or such.
> 
> Signed-off-by: Suravee Suthikulpanit <suravee.suthikulpanit@amd.com>
> ---
>  xen/arch/x86/hvm/svm/avic.c        | 279 +++++++++++++++++++++++++++++++++++++
>  xen/arch/x86/hvm/svm/svm.c         |   8 ++
>  xen/include/asm-x86/hvm/svm/avic.h |   3 +
>  xen/include/asm-x86/hvm/svm/vmcb.h |   2 +
>  4 files changed, 292 insertions(+)
> 
> diff --git a/xen/arch/x86/hvm/svm/avic.c b/xen/arch/x86/hvm/svm/avic.c
> index 70bac69..90df181 100644
> --- a/xen/arch/x86/hvm/svm/avic.c
> +++ b/xen/arch/x86/hvm/svm/avic.c
> @@ -18,6 +18,7 @@
>  #define AVIC_DOORBELL           0xc001011b
>  #define AVIC_HPA_MASK           ~((0xFFFULL << 52) || 0xFFF)
>  #define AVIC_APIC_BAR_MASK      0xFFFFFFFFFF000ULL
> +#define AVIC_UNACCEL_ACCESS_OFFSET_MASK    0xFF0
>  
>  bool_t svm_avic = 0;
>  boolean_param("svm-avic", svm_avic);
> @@ -215,3 +216,281 @@ int svm_avic_init_vmcb(struct vcpu *v)
>  
>      return 0;
>  }
> +
> +/***************************************************************

That is a lot of stars. Please shrink to one.
> + * AVIC INCOMP IPI VMEXIT

Hmm, I think I figured that out from the name of the function.
Perhaps you want to say what the function is supposed to do or
under what conditions this would be called instead of bouncing
inside the guest?

Or perhaps just say under what conditions we would _not_
enter here and the guest would run on its merry way?

Or if that is explained in details in the comments in switch statements
then just remove the comment altogether?
> + */
> +void svm_avic_vmexit_do_incomp_ipi(struct cpu_user_regs *regs)
> +{
> +    struct vcpu *v = current;
> +    struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb;
> +    u32 icrh = vmcb->exitinfo1 >> 32;
> +    u32 icrl = vmcb->exitinfo1;
> +    u32 id = vmcb->exitinfo2 >> 32;
> +    u32 index = vmcb->exitinfo2 && 0xFF;
> +
> +    dprintk(XENLOG_DEBUG, "SVM: %s: cpu=%#x, vcpu=%#x, "
> +           "icrh:icrl=%#010x:%08x, id=%u, index=%u\n",
> +           __func__, v->processor, v->vcpu_id, icrh, icrl, id, index);

<raises eyebrows>

Please remove this.
> +
> +    switch ( id )
> +    {
> +    case AVIC_INCMP_IPI_ERR_INVALID_INT_TYPE:
> +        /*
> +         * AVIC hardware handles the generation of

generation? Did you mean 'delivery' ?
> +         * IPIs when the specified Message Type is Fixed
> +         * (also known as fixed delivery mode) and
> +         * the Trigger Mode is edge-triggered. The hardware
> +         * also supports self and broadcast delivery modes
> +         * specified via the Destination Shorthand(DSH)
> +         * field of the ICRL. Logical and physical APIC ID
> +         * formats are supported. All other IPI types cause
> +         * a #VMEXIT, which needs to emulated.
> +         */
> +        vlapic_reg_write(v, APIC_ICR2, icrh);
> +        vlapic_reg_write(v, APIC_ICR, icrl);
> +        break;
> +    case AVIC_INCMP_IPI_ERR_TARGET_NOT_RUN:
> +    {
> +        /*
> +         * At this point, we expect that the AVIC HW has already
> +         * set the appropriate IRR bits on the valid target
> +         * vcpus. So, we just need to kick the appropriate vcpu.

Is there a pattern to this? Meaning does the CPU go sequentially
through the logical APIC ids - which (if say the guest is using
logical APIC ids which gives you pretty much 1-1 to physical) - means
that this VMEXIT would occur in a sequence of the vCPUs that are not
running? Because if so, then ..
> +         */
> +        struct vcpu *c;
> +        struct domain *d = v->domain;
> +        uint32_t dest = GET_xAPIC_DEST_FIELD(icrh);
> +        uint32_t short_hand = icrl & APIC_SHORT_MASK;
> +        bool_t dest_mode = !!(icrl & APIC_DEST_MASK);

bool
> +
> +        for_each_vcpu ( d, c )

.. you could optimize this a bit and start at vCPU+1 (since you know
that this current vCPU most certainly got the IPI) ?

> +        {
> +            if ( vlapic_match_dest(vcpu_vlapic(c), vcpu_vlapic(v),
> +                                   short_hand, dest, dest_mode) )
> +            {
> +                vcpu_kick(c);
> +                break;
> +            }
> +        }
> +        break;
> +    }
> +    case AVIC_INCMP_IPI_ERR_INV_TARGET:
> +        dprintk(XENLOG_ERR,
> +                "SVM: %s: Invalid IPI target (icr=%#08x:%08x, idx=%u)\n",
> +                __func__, icrh, icrl, index);

Please remove this dprintk.

> +        break;
> +    case AVIC_INCMP_IPI_ERR_INV_BK_PAGE:
> +        dprintk(XENLOG_ERR,
> +                "SVM: %s: Invalid bk page (icr=%#08x:%08x, idx=%u)\n",
> +                __func__, icrh, icrl, index);

That should never happen, right? If it does that means we messed up the
allocation somehow. If so, should we disable AVIC for this guest
completely and log this error?

Not exactly sure what that means - the manual says that once you enable
AVIC you MUST keep the structures for the life of the guest.

And nothing about what happens if you decide for fun to disable AVIC and
enable it at random times.


> +        break;
> +    default:
> +        dprintk(XENLOG_ERR, "SVM: %s: Unknown IPI interception\n", __func__);

Perhaps include the value as well? But regardless of this - if we do
get this, should we disable AVIC?

> +    }
> +}
> +
> +/***************************************************************

Star explosion!

> + * AVIC NOACCEL VMEXIT

I think you know what I will say about that sparse comment :-)

> + */
> +#define GET_APIC_LOGICAL_ID(x)        (((x) >> 24) & 0xFFu)

I naively assumed that this macro would be in the code but not so.

Perhaps you can add it in apic.h file? There is an SET_APIC_LOGICAL_ID
so right next to it would be good?

> +
> +static struct svm_avic_log_ait_entry *
> +avic_get_logical_id_entry(struct vcpu *v, u32 ldr, bool flat)

const struct vcpu *v
> +{
> +    int index;

unsigned int?
> +    struct svm_avic_log_ait_entry *avic_log_ait;
> +    struct svm_domain *d = &v->domain->arch.hvm_domain.svm;
> +    int dlid = GET_APIC_LOGICAL_ID(ldr);

unsigned int?

'dlid'? How about 'dest_id' ?

> +
> +    if ( !dlid )
> +        return NULL;
> +
> +    if ( flat )
> +    {
> +        index = ffs(dlid) - 1;
> +        if ( index > 7 )
> +            return NULL;
> +    }
> +    else
> +    {
> +        int cluster = (dlid & 0xf0) >> 4;

unsigned int
> +        int apic = ffs(dlid & 0x0f) - 1;
> +
> +        if ((apic < 0) || (apic > 7) || (cluster >= 0xf))

Something is off with the (). You need some spaces:

 if ( (apic < 0) ..
> +            return NULL;
> +        index = (cluster << 2) + apic;
> +    }
> +
> +    avic_log_ait = mfn_to_virt(d->avic_log_ait_mfn);
> +

Would it make sense to add:

     ASSERT(index <= 255) ?

> +    return &avic_log_ait[index];
> +}
> +
> +static int avic_ldr_write(struct vcpu *v, u8 g_phy_id, u32 ldr, bool valid)
> +{
> +    bool flat;
> +    struct svm_avic_log_ait_entry *entry, new_entry;
> +
> +    flat = *avic_get_bk_page_entry(v, APIC_DFR) == APIC_DFR_FLAT;

If my recollection is right, avic_get_bk_page_entry can return NULL? So
you really don't want to dereference it right away here.

> +    entry = avic_get_logical_id_entry(v, ldr, flat);
> +    if (!entry)
> +        return -EINVAL;
> +
> +    new_entry = *entry;
> +    smp_rmb();
> +    new_entry.guest_phy_apic_id = g_phy_id;
> +    new_entry.valid = valid;
> +    *entry = new_entry;
> +    smp_wmb();
> +
> +    return 0;
> +}
> +
> +static int avic_handle_ldr_update(struct vcpu *v)
> +{
> +    int ret = 0;
> +    struct svm_domain *d = &v->domain->arch.hvm_domain.svm;
> +    u32 ldr = *avic_get_bk_page_entry(v, APIC_LDR);

Again, this can be NULL.
> +    u32 apic_id = (*avic_get_bk_page_entry(v, APIC_ID) >> 24);

Ditto.
> +
> +    if ( !ldr )
> +        return 1;

Uh? Perhaps you can put a comment at the start of the function
to talk about the return values?

I would expect this return to be -EINVAL?
> +
> +    ret = avic_ldr_write(v, apic_id, ldr, true);
> +    if (ret && d->ldr_reg)

You need spaces after ( and before ).

> +    {
> +        avic_ldr_write(v, 0, d->ldr_reg, false);

Can you add a comment for the 0 and 'false'.

Actually can you explain a bit about the ldr vs ldr_reg values?
It looks like it contains the last LDR value .. but perhaps it is
also set somewhere else?

> +        d->ldr_reg = 0;
> +    }
> +    else
> +    {
> +        d->ldr_reg = ldr;
> +    }
> +
> +    return ret;
> +}
> +
> +static int avic_handle_apic_id_update(struct vcpu *v, bool init)
> +{
> +    struct arch_svm_struct *s = &v->arch.hvm_svm;
> +    struct svm_domain *d = &v->domain->arch.hvm_domain.svm;
> +    u32 apic_id_reg = *avic_get_bk_page_entry(v, APIC_ID);

Again, this can return NULL..
> +    u32 id = (apic_id_reg >> 24) & 0xff;

That could use that macro you declared?

> +   struct svm_avic_phy_ait_entry *old, *new;

Something is off with the spaces above?
> +
> +   old = s->avic_phy_id_cache; 
> +   new = avic_get_phy_ait_entry(v, id);
> +   if ( !new || !old )

Uh, 'old' says cache. That would imply that having it as NULL would be
OK?

> +       return 0;
> +
> +   /* We need to move physical_id_entry to new offset */
> +   *new = *old;
> +   *((u64 *)old) = 0ULL;
> +   s->avic_phy_id_cache = new;
> +
> +    /*
> +     * Also update the guest physical APIC ID in the logical

s/Also//
> +     * APIC ID table entry if already setup the LDR.

s/already setup the LDR/LDR is already setup/

?
> +     */
> +    if ( d->ldr_reg )
> +        avic_handle_ldr_update(v);

> +
> +    return 0;
> +}
> +
> +static int avic_handle_dfr_update(struct vcpu *v)

The return value does not seem to be used? Perhaps make it void?
> +{
> +    struct svm_domain *d = &v->domain->arch.hvm_domain.svm;
> +    u32 dfr = *avic_get_bk_page_entry(v, APIC_DFR);

Again, this can be NULL.

> +    u32 mod = (dfr >> 28) & 0xf;
> +
> +    /*
> +     * We assume that all local APICs are using the same type.
> +     * If this changes, we need to flush the AVIC logical
> +     * APID id table.
> +     */
> +    if ( d->ldr_mode == mod )
> +        return 0;
> +
> +    clear_domain_page(_mfn(d->avic_log_ait_mfn));

Whoa. What if another vCPU is using this (aka handling another AVIC
access?)? I see that the 'V' bit would be cleared so the CPU
would trap .. (though that depends, right - the clear_domain_page is
being done in a sequence, so some of the later entries could be valid
while the CPU is clearing it).

Perhaps you should iterate over all the 'V' bit first (to clear them)
and then clear the page using the clear_domain_page?
Or better - turn off AVIC first (for the guest), until the page has been cleared?

Either way I think you also need:

	smp_wmb()

here as a memory barrier to flush the changes out.


> +    d->ldr_mode = mod;
> +    if (d->ldr_reg)

Spaces.
> +        avic_handle_ldr_update(v);
> +    return 0;
> +}
> +
> +static int avic_unaccel_trap_write(struct vcpu *v)

unsigned int?

> +{
> +    struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb;
> +    u32 offset = vmcb->exitinfo1 & AVIC_UNACCEL_ACCESS_OFFSET_MASK;
> +    u32 reg = *avic_get_bk_page_entry(v, offset);

Again, please check for NULL?

> +
> +    switch ( offset ) {

I think the {
is on its own line.

> +    case APIC_ID:
> +        if ( avic_handle_apic_id_update(v, false) )
> +            return 0;
> +        break;
> +    case APIC_LDR:
> +        if ( avic_handle_ldr_update(v) )
> +            return 0;
> +        break;
> +    case APIC_DFR:
> +        avic_handle_dfr_update(v);

You aren't checking its return value?
> +        break;
> +    default:
> +        break;
> +    }
> +
> +    vlapic_reg_write(v, offset, reg);
> +
> +    return 1;
> +}
> +
> +void svm_avic_vmexit_do_noaccel(struct cpu_user_regs *regs)
> +{
> +    struct vcpu *v = current;
> +    struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb;
> +    u32 offset = vmcb->exitinfo1 & 0xFF0;
> +    u32 rw = (vmcb->exitinfo1 >> 32) & 0x1;
> +    u32 vector = vmcb->exitinfo2 & 0xFFFFFFFF;
> +
> +    dprintk(XENLOG_DEBUG,
> +           "SVM: %s: offset=%#x, rw=%#x, vector=%#x, vcpu_id=%#x, cpu=%#x\n",
> +           __func__, offset, rw, vector, v->vcpu_id, v->processor);
> +

Please remove that.
> +    switch(offset)

Missing spaces.

> +    {
> +    case APIC_ID:
> +    case APIC_EOI:
> +    case APIC_RRR:
> +    case APIC_LDR:
> +    case APIC_DFR:
> +    case APIC_SPIV:
> +    case APIC_ESR:
> +    case APIC_ICR:
> +    case APIC_LVTT:
> +    case APIC_LVTTHMR:
> +    case APIC_LVTPC:
> +    case APIC_LVT0:
> +    case APIC_LVT1:
> +    case APIC_LVTERR:
> +    case APIC_TMICT:
> +    case APIC_TDCR:
> +        /* Handling Trap */
> +        if ( !rw )
> +            /* Trap read should never happens */

If it did that would be a truly messed up CPU? You may want to say that:

'If a read trap happens the CPU microcode does not implement the spec.'

> +            BUG();
> +        avic_unaccel_trap_write(v);

No checking of the function return value?

Say the values are all messed up (the guest provides the wrong
APIC ID).. Shouldn't we inject some type of exception in the guest?
(Like the hardware would do? Actually - would the hardware send any type
of interrupt if one messes up with the APIC? Or is it that it sets an
error bit in the APIC?)

> +        break;
> +    default:
> +        /* Handling Fault */

Which kinds?
> +        if ( !rw )
> +            *avic_get_bk_page_entry(v, offset) = vlapic_read_aligned(
> +                                                        vcpu_vlapic(v), offset);
> +
> +        hvm_mem_access_emulate_one(EMUL_KIND_NORMAL, TRAP_invalid_op,
> +                                       HVM_DELIVER_NO_ERROR_CODE);

Odd, something is wrong with the spaces here. The HVM_DELIVER.. is not on
the same line?

So we update the backing page .. and then inject an invalid_op in the
guest? Is that how hardware does it?
> +    }
> +
> +    return;
> +}
> diff --git a/xen/arch/x86/hvm/svm/svm.c b/xen/arch/x86/hvm/svm/svm.c
> index bcb7df4..409096a 100644
> --- a/xen/arch/x86/hvm/svm/svm.c
> +++ b/xen/arch/x86/hvm/svm/svm.c
> @@ -2710,6 +2710,14 @@ void svm_vmexit_handler(struct cpu_user_regs *regs)
>          svm_vmexit_do_pause(regs);
>          break;
>  
> +    case VMEXIT_AVIC_INCOMP_IPI:
> +        svm_avic_vmexit_do_incomp_ipi(regs);
> +        break;
> +
> +    case VMEXIT_AVIC_NOACCEL:
> +        svm_avic_vmexit_do_noaccel(regs);
> +        break;
> +
>      default:
>      unexpected_exit_type:
>          gdprintk(XENLOG_ERR, "unexpected VMEXIT: exit reason = %#"PRIx64", "
> diff --git a/xen/include/asm-x86/hvm/svm/avic.h b/xen/include/asm-x86/hvm/svm/avic.h
> index 9508486..2c501d4 100644
> --- a/xen/include/asm-x86/hvm/svm/avic.h
> +++ b/xen/include/asm-x86/hvm/svm/avic.h
> @@ -37,4 +37,7 @@ bool_t svm_avic_vcpu_enabled(struct vcpu *v);
>  void svm_avic_update_vapic_bar(struct vcpu *v,uint64_t data);
>  int svm_avic_init_vmcb(struct vcpu *v);
>  
> +void svm_avic_vmexit_do_incomp_ipi(struct cpu_user_regs *regs);
> +void svm_avic_vmexit_do_noaccel(struct cpu_user_regs *regs);
> +
>  #endif /* _SVM_AVIC_H_ */
> diff --git a/xen/include/asm-x86/hvm/svm/vmcb.h b/xen/include/asm-x86/hvm/svm/vmcb.h
> index a42c034..23eb86b 100644
> --- a/xen/include/asm-x86/hvm/svm/vmcb.h
> +++ b/xen/include/asm-x86/hvm/svm/vmcb.h
> @@ -302,6 +302,8 @@ enum VMEXIT_EXITCODE
>      VMEXIT_MWAIT_CONDITIONAL= 140, /* 0x8c */
>      VMEXIT_XSETBV           = 141, /* 0x8d */
>      VMEXIT_NPF              = 1024, /* 0x400, nested paging fault */
> +    VMEXIT_AVIC_INCOMP_IPI  = 1025, /* 0x401 */
> +    VMEXIT_AVIC_NOACCEL     = 1026, /* 0x402 */
>      VMEXIT_INVALID          =  -1
>  };
>  
> -- 
> 1.9.1
> 
> 
> _______________________________________________
> Xen-devel mailing list
> Xen-devel@lists.xen.org
> https://lists.xen.org/xen-devel
Suravee Suthikulpanit Dec. 12, 2016, 10:34 a.m. UTC | #2
Hi Konrad,

Thanks for review comments.

On 10/14/2016 10:20 PM, Konrad Rzeszutek Wilk wrote:
> On Mon, Sep 19, 2016 at 12:52:45AM -0500, Suravee Suthikulpanit wrote:
>> AVIC introduces two #vmexit handlers:
>>
>> +         * IPIs when the specified Message Type is Fixed
>> +         * (also known as fixed delivery mode) and
>> +         * the Trigger Mode is edge-triggered. The hardware
>> +         * also supports self and broadcast delivery modes
>> +         * specified via the Destination Shorthand(DSH)
>> +         * field of the ICRL. Logical and physical APIC ID
>> +         * formats are supported. All other IPI types cause
>> +         * a #VMEXIT, which needs to emulated.
>> +         */
>> +        vlapic_reg_write(v, APIC_ICR2, icrh);
>> +        vlapic_reg_write(v, APIC_ICR, icrl);
>> +        break;
>> +    case AVIC_INCMP_IPI_ERR_TARGET_NOT_RUN:
>> +    {
>> +        /*
>> +         * At this point, we expect that the AVIC HW has already
>> +         * set the appropriate IRR bits on the valid target
>> +         * vcpus. So, we just need to kick the appropriate vcpu.
>
> Is there a pattern to this? Meaning does the CPU go sequentially
> through the logical APIC ids - which (if say the guest is using
> logical APIC ids which gives you pretty much 1-1 to physical) - means
> that this VMEXIT would occur in a sequence of the vCPUs that are not
> running?

Not sure if I follow this part. Typically, the current vcpu (the one 
that handles this #vmexit) is the one initiating the IPI. Here we check the 
destination and try to kick each one of the matching destination.

> Because if so, then ..
>> +
>> +        for_each_vcpu ( d, c )
>
> .. you could optimize this a bit and start at vCPU+1 (since you know
> that this current vCPU most certainyl got the vCPU) ?
>

But the current (running) vcpu is not necessarily the d->vcpu[0]. Do you 
mean checking if "c == current" in this loop and skipping the current vcpu?

>> +        break;
>> +    case AVIC_INCMP_IPI_ERR_INV_BK_PAGE:
>> +        dprintk(XENLOG_ERR,
>> +                "SVM: %s: Invalid bk page (icr=%#08x:%08x, idx=%u)\n",
>> +                __func__, icrh, icrl, index);
>
> That should never happen right? If it does that means we messed up the
> allocation somehow. If so, should we disable AVIC for this guest
> completly and log this error?

I think it depends on when this happens. It might be hard to determine 
the state of all VCPUs at that point. Shouldn't we just crash the domain 
(probably for AVIC_INCMP_IPI_ERR_INV_TARGET and default case as well)?

>> [... in arch/x86/hvm/svm/avic.c: avic_handle_dfr_update()]
>> +    u32 mod = (dfr >> 28) & 0xf;
>> +
>> +    /*
>> +     * We assume that all local APICs are using the same type.
>> +     * If this changes, we need to flush the AVIC logical
>> +     * APID id table.
>> +     */
>> +    if ( d->ldr_mode == mod )
>> +        return 0;
>> +
>> +    clear_domain_page(_mfn(d->avic_log_ait_mfn));
>
> Whoa. What if another vCPU is using this (aka handling another AVIC
> access?)?

Generally, at least in the Linux case, I can see that the kernel switches APIC 
routing from cluster to flat early in the boot process (prior to setting 
LDR and bringing up the rest of the AP cores). Do you know of any cases 
where the guest OS could be switching the APIC routing mode while the AP 
cores are running?

> I see that that the 'V' bit would be cleared so the CPU
> would trap .. (thought that depends right - the clear_domain_page is
> being done in a sequence so some of the later entries could be valid
> while the CPU is clearing it).

Which "V" bit are you referring to here? And when is it cleared?

> Perhaps you should iterate over all the 'V' bit first (to clear them)
> and then clear the page using the clear_domain_page?
> Or better - turn of AVIC first (for the guest), until the page has been cleared?

What about adding a check to see if DFR is changed after LDR has already 
been setup and throw some error or warning message for now?

>> [... in arch/x86/hvm/svm/avic.c: svm_avic_vmexit_do_noaccel()]
>> +            BUG();
>> +        avic_unaccel_trap_write(v);
>
> Say the values are all messed up (the guest provides the wrong
> APIC ID).. Shouldn't we inject some type of exception in the guest?
> (Like the hardware would do? Actually - would the hardware send any type
> of interrupt if one messes up with the APIC? Or is it that it sets an
> error bit in the APIC?)

IIUC, typically, APIC generates APIC error interrupts and set the APIC 
Error Status Register (ESR) when detect errors during interrupt 
handling. However, if the APIC registers are not programmed correctly, I 
think the system would just fail to boot and go into undefined state.

>> [...]
>> +        if ( !rw )
>> +            *avic_get_bk_page_entry(v, offset) = vlapic_read_aligned(
>> +                                                        vcpu_vlapic(v), offset);
>> +
>> +        hvm_mem_access_emulate_one(EMUL_KIND_NORMAL, TRAP_invalid_op,
>> +                                       HVM_DELIVER_NO_ERROR_CODE);
>
> So we update the backing page .. and then inject an invalid_op in the
> guest? Is that how hardware does it?

Looking into the hvm_mem_access_emulate_one(), my understanding is that 
this emulates the mem access instruction (e.g. to read/write APIC 
register), and inject TRAP_invalid_op when the emulation fails (i.e. 
X86EMUL_UNHANDLEABLE). Am I missing something here?

Thanks,
Suravee
Jan Beulich Dec. 22, 2016, 11:25 a.m. UTC | #3
>>> On 19.09.16 at 07:52, <suravee.suthikulpanit@amd.com> wrote:
> +void svm_avic_vmexit_do_incomp_ipi(struct cpu_user_regs *regs)
> +{
> +    struct vcpu *v = current;

Please name such variables "curr", which at once avoids the need for
the unusual name ...

> +    struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb;
> +    u32 icrh = vmcb->exitinfo1 >> 32;
> +    u32 icrl = vmcb->exitinfo1;
> +    u32 id = vmcb->exitinfo2 >> 32;
> +    u32 index = vmcb->exitinfo2 && 0xFF;
> +
> +    dprintk(XENLOG_DEBUG, "SVM: %s: cpu=%#x, vcpu=%#x, "
> +           "icrh:icrl=%#010x:%08x, id=%u, index=%u\n",
> +           __func__, v->processor, v->vcpu_id, icrh, icrl, id, index);
> +
> +    switch ( id )
> +    {
> +    case AVIC_INCMP_IPI_ERR_INVALID_INT_TYPE:
> +        /*
> +         * AVIC hardware handles the generation of
> +         * IPIs when the specified Message Type is Fixed
> +         * (also known as fixed delivery mode) and
> +         * the Trigger Mode is edge-triggered. The hardware
> +         * also supports self and broadcast delivery modes
> +         * specified via the Destination Shorthand(DSH)
> +         * field of the ICRL. Logical and physical APIC ID
> +         * formats are supported. All other IPI types cause
> +         * a #VMEXIT, which needs to emulated.
> +         */
> +        vlapic_reg_write(v, APIC_ICR2, icrh);
> +        vlapic_reg_write(v, APIC_ICR, icrl);
> +        break;
> +    case AVIC_INCMP_IPI_ERR_TARGET_NOT_RUN:
> +    {
> +        /*
> +         * At this point, we expect that the AVIC HW has already
> +         * set the appropriate IRR bits on the valid target
> +         * vcpus. So, we just need to kick the appropriate vcpu.
> +         */
> +        struct vcpu *c;

here.

> +        struct domain *d = v->domain;

(This would then become currd.)

> +/***************************************************************
> + * AVIC NOACCEL VMEXIT
> + */
> +#define GET_APIC_LOGICAL_ID(x)        (((x) >> 24) & 0xFFu)

GET_xAPIC_ID()?

Jan
Konrad Rzeszutek Wilk Jan. 7, 2017, 1:24 a.m. UTC | #4
On Mon, Dec 12, 2016 at 05:34:29PM +0700, Suravee Suthikulpanit wrote:
> Hi Konrad,
> 
> Thanks for review comments.
> 
> On 10/14/2016 10:20 PM, Konrad Rzeszutek Wilk wrote:
> > On Mon, Sep 19, 2016 at 12:52:45AM -0500, Suravee Suthikulpanit wrote:
> > > AVIC introduces two #vmexit handlers:
> > > 
> > > +         * IPIs when the specified Message Type is Fixed
> > > +         * (also known as fixed delivery mode) and
> > > +         * the Trigger Mode is edge-triggered. The hardware
> > > +         * also supports self and broadcast delivery modes
> > > +         * specified via the Destination Shorthand(DSH)
> > > +         * field of the ICRL. Logical and physical APIC ID
> > > +         * formats are supported. All other IPI types cause
> > > +         * a #VMEXIT, which needs to emulated.
> > > +         */
> > > +        vlapic_reg_write(v, APIC_ICR2, icrh);
> > > +        vlapic_reg_write(v, APIC_ICR, icrl);
> > > +        break;
> > > +    case AVIC_INCMP_IPI_ERR_TARGET_NOT_RUN:
> > > +    {
> > > +        /*
> > > +         * At this point, we expect that the AVIC HW has already
> > > +         * set the appropriate IRR bits on the valid target
> > > +         * vcpus. So, we just need to kick the appropriate vcpu.
> > 
> > Is there a pattern to this? Meaning does the CPU go sequentially
> > through the logical APIC ids - which (if say the guest is using
> > logical APIC ids which gives you pretty much 1-1 to physical) - means
> > that this VMEXIT would occur in a sequence of the vCPUs that are not
> > running?
> 
> Not sure if I follow this part. Typically, the current vcpu (the one that
> handles this #vmexit) is the one initiating IPI. Here we check the
> destination and try to kick each one of the matching destination.
> 
> > Because if so, then ..
> > > +
> > > +        for_each_vcpu ( d, c )
> > 
> > .. you could optimize this a bit and start at vCPU+1 (since you know
> > that this current vCPU most certainyl got the vCPU) ?
> > 
> 
> But the current (running) vcpu is not necessary the d->vcpu[0]. Do you mean
> checking if "c == current" in this loop and skip the current vcpu?

Yes.
> 
> > > +        break;
> > > +    case AVIC_INCMP_IPI_ERR_INV_BK_PAGE:
> > > +        dprintk(XENLOG_ERR,
> > > +                "SVM: %s: Invalid bk page (icr=%#08x:%08x, idx=%u)\n",
> > > +                __func__, icrh, icrl, index);
> > 
> > That should never happen right? If it does that means we messed up the
> > allocation somehow. If so, should we disable AVIC for this guest
> > completly and log this error?
> 
> I think it depends on when this happens. It might be hard to determine the
> state of all VCPUs at that point. Shouldn't we just crash the domain
> (probably for AVIC_INCMP_IPI_ERR_INV_TARGET and default case as well)?

That would be much better
> 
> > > [... in arch/x86/hvm/svm/avic.c: avic_handle_dfr_update()]
> > > +    u32 mod = (dfr >> 28) & 0xf;
> > > +
> > > +    /*
> > > +     * We assume that all local APICs are using the same type.
> > > +     * If this changes, we need to flush the AVIC logical
> > > +     * APID id table.
> > > +     */
> > > +    if ( d->ldr_mode == mod )
> > > +        return 0;
> > > +
> > > +    clear_domain_page(_mfn(d->avic_log_ait_mfn));
> > 
> > Whoa. What if another vCPU is using this (aka handling another AVIC
> > access?)?
> 
> Generally, at least in Linux case, I can see that the kernel switch APIC
> routing from cluster to flat early in the boot process prior to setting LDR
> and bringing up the rest of the AP cores). Do you know of any cases where
> the guest OS could be switching the APIC routing mode while the AP cores are
> running?

Put yourself in the shoes of an attacker: if there is some way to
screw up the hypervisor, an attacker will find it. Not sure if clearing this page
would lead the hypervisor to crash or such.

> 
> > I see that that the 'V' bit would be cleared so the CPU
> > would trap .. (thought that depends right - the clear_domain_page is
> > being done in a sequence so some of the later entries could be valid
> > while the CPU is clearing it).
> 
> Which "V" bit are you referring to here? And when is it cleared?

I sadly don't remember :-(
> 
> > Perhaps you should iterate over all the 'V' bit first (to clear them)
> > and then clear the page using the clear_domain_page?
> > Or better - turn of AVIC first (for the guest), until the page has been cleared?
> 
> What about adding a check to see if DFR is changed after LDR has already
> been setup and throw some error or warning message for now?

Sure.
> 
> > > [... in arch/x86/hvm/svm/avic.c: svm_avic_vmexit_do_noaccel()]
> > > +            BUG();
> > > +        avic_unaccel_trap_write(v);
> > 
> > Say the values are all messed up (the guest provides the wrong
> > APIC ID).. Shouldn't we inject some type of exception in the guest?
> > (Like the hardware would do? Actually - would the hardware send any type
> > of interrupt if one messes up with the APIC? Or is it that it sets an
> > error bit in the APIC?)
> 
> IIUC, typically, APIC generates APIC error interrupts and set the APIC Error
> Status Register (ESR) when detect errors during interrupt handling. However,
> if the APIC registers are not programmed correctly, I think the system would
> just fail to boot and go into undefined state.


As long as we don't crash the hypervisor I suppose that is OK.
> 
> > > [...]
> > > +        if ( !rw )
> > > +            *avic_get_bk_page_entry(v, offset) = vlapic_read_aligned(
> > > +                                                        vcpu_vlapic(v), offset);
> > > +
> > > +        hvm_mem_access_emulate_one(EMUL_KIND_NORMAL, TRAP_invalid_op,
> > > +                                       HVM_DELIVER_NO_ERROR_CODE);
> > 
> > So we update the backing page .. and then inject an invalid_op in the
> > guest? Is that how hardware does it?
> 
> Looking into the hvm_mem_access_emulate_one(), my understanding is that this
> emulates the mem access instruction (e.g. to read/write APIC register), and
> inject TRAP_invalid_op when the emulation fails (i.e. X86EMUL_UNHANDLEABLE).
> Am I missing something here?

Nope. Just me misremembering it.

> 
> Thanks,
> Suravee
diff mbox

Patch

diff --git a/xen/arch/x86/hvm/svm/avic.c b/xen/arch/x86/hvm/svm/avic.c
index 70bac69..90df181 100644
--- a/xen/arch/x86/hvm/svm/avic.c
+++ b/xen/arch/x86/hvm/svm/avic.c
@@ -18,6 +18,7 @@ 
 #define AVIC_DOORBELL           0xc001011b
 #define AVIC_HPA_MASK           ~((0xFFFULL << 52) || 0xFFF)
 #define AVIC_APIC_BAR_MASK      0xFFFFFFFFFF000ULL
+#define AVIC_UNACCEL_ACCESS_OFFSET_MASK    0xFF0
 
 bool_t svm_avic = 0;
 boolean_param("svm-avic", svm_avic);
@@ -215,3 +216,281 @@  int svm_avic_init_vmcb(struct vcpu *v)
 
     return 0;
 }
+
+/***************************************************************
+ * AVIC INCOMP IPI VMEXIT
+ *
+ * Handler for #VMEXIT(0x401), raised when the hardware could not
+ * complete delivery of a guest IPI.  EXITINFO1 holds the ICR value
+ * being written (high:low); EXITINFO2 holds the failure reason in
+ * bits 63:32 and the source APIC ID index in bits 7:0.
+ */
+void svm_avic_vmexit_do_incomp_ipi(struct cpu_user_regs *regs)
+{
+    struct vcpu *v = current;
+    struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb;
+    u32 icrh = vmcb->exitinfo1 >> 32;
+    u32 icrl = vmcb->exitinfo1;
+    u32 id = vmcb->exitinfo2 >> 32;
+    /* Bitwise '&' here: a logical '&&' would collapse the index to 0/1. */
+    u32 index = vmcb->exitinfo2 & 0xFF;
+
+    dprintk(XENLOG_DEBUG, "SVM: %s: cpu=%#x, vcpu=%#x, "
+           "icrh:icrl=%#010x:%08x, id=%u, index=%u\n",
+           __func__, v->processor, v->vcpu_id, icrh, icrl, id, index);
+
+    switch ( id )
+    {
+    case AVIC_INCMP_IPI_ERR_INVALID_INT_TYPE:
+        /*
+         * AVIC hardware handles the generation of
+         * IPIs when the specified Message Type is Fixed
+         * (also known as fixed delivery mode) and
+         * the Trigger Mode is edge-triggered. The hardware
+         * also supports self and broadcast delivery modes
+         * specified via the Destination Shorthand(DSH)
+         * field of the ICRL. Logical and physical APIC ID
+         * formats are supported. All other IPI types cause
+         * a #VMEXIT, which needs to be emulated.
+         */
+        vlapic_reg_write(v, APIC_ICR2, icrh);
+        vlapic_reg_write(v, APIC_ICR, icrl);
+        break;
+    case AVIC_INCMP_IPI_ERR_TARGET_NOT_RUN:
+    {
+        /*
+         * At this point, we expect that the AVIC HW has already
+         * set the appropriate IRR bits on the valid target
+         * vcpus. So, we just need to kick the appropriate vcpus.
+         * Do not stop at the first match: a broadcast or
+         * logical-destination IPI can target multiple vcpus.
+         */
+        struct vcpu *c;
+        struct domain *d = v->domain;
+        uint32_t dest = GET_xAPIC_DEST_FIELD(icrh);
+        uint32_t short_hand = icrl & APIC_SHORT_MASK;
+        bool_t dest_mode = !!(icrl & APIC_DEST_MASK);
+
+        for_each_vcpu ( d, c )
+        {
+            if ( vlapic_match_dest(vcpu_vlapic(c), vcpu_vlapic(v),
+                                   short_hand, dest, dest_mode) )
+                vcpu_kick(c);
+        }
+        break;
+    }
+    case AVIC_INCMP_IPI_ERR_INV_TARGET:
+        dprintk(XENLOG_ERR,
+                "SVM: %s: Invalid IPI target (icr=%#08x:%08x, idx=%u)\n",
+                __func__, icrh, icrl, index);
+        break;
+    case AVIC_INCMP_IPI_ERR_INV_BK_PAGE:
+        dprintk(XENLOG_ERR,
+                "SVM: %s: Invalid bk page (icr=%#08x:%08x, idx=%u)\n",
+                __func__, icrh, icrl, index);
+        break;
+    default:
+        dprintk(XENLOG_ERR, "SVM: %s: Unknown IPI interception\n", __func__);
+    }
+}
+
+/***************************************************************
+ * AVIC NOACCEL VMEXIT
+ */
+#define GET_APIC_LOGICAL_ID(x)        (((x) >> 24) & 0xFFu)
+
+/*
+ * Look up the logical APIC ID table entry selected by the guest's
+ * LDR value.  Returns NULL when the LDR does not name a valid entry.
+ */
+static struct svm_avic_log_ait_entry *
+avic_get_logical_id_entry(struct vcpu *v, u32 ldr, bool flat)
+{
+    int index;
+    struct svm_avic_log_ait_entry *avic_log_ait;
+    struct svm_domain *d = &v->domain->arch.hvm_domain.svm;
+    int dlid = GET_APIC_LOGICAL_ID(ldr);
+
+    if ( !dlid )
+        return NULL;
+
+    if ( flat )
+    {
+        /* Flat mode: one bit per APIC; only bits 0-7 are valid. */
+        index = ffs(dlid) - 1;
+        if ( index > 7 )
+            return NULL;
+    }
+    else
+    {
+        /* Cluster mode: bits 7:4 select the cluster, bits 3:0 the APIC. */
+        int cluster = (dlid & 0xf0) >> 4;
+        int apic = ffs(dlid & 0x0f) - 1;
+
+        if ( (apic < 0) || (apic > 7) || (cluster >= 0xf) )
+            return NULL;
+        index = (cluster << 2) + apic;
+    }
+
+    avic_log_ait = mfn_to_virt(d->avic_log_ait_mfn);
+
+    return &avic_log_ait[index];
+}
+
+/*
+ * Program the logical APIC ID table entry for @ldr to point at the
+ * guest physical APIC ID @g_phy_id (or invalidate it when !@valid).
+ * Returns 0 on success, -EINVAL when @ldr names no valid entry.
+ */
+static int avic_ldr_write(struct vcpu *v, u8 g_phy_id, u32 ldr, bool valid)
+{
+    bool flat;
+    struct svm_avic_log_ait_entry *entry, new_entry;
+
+    flat = *avic_get_bk_page_entry(v, APIC_DFR) == APIC_DFR_FLAT;
+    entry = avic_get_logical_id_entry(v, ldr, flat);
+    if ( !entry )
+        return -EINVAL;
+
+    /*
+     * Build the new contents in a local copy and store it whole, so
+     * the table is never observed with a half-updated entry.
+     */
+    new_entry = *entry;
+    smp_rmb();
+    new_entry.guest_phy_apic_id = g_phy_id;
+    new_entry.valid = valid;
+    *entry = new_entry;
+    smp_wmb();
+
+    return 0;
+}
+
+/*
+ * Propagate a guest write of APIC_LDR (already latched in the backing
+ * page) into the logical APIC ID table.  On failure, invalidate the
+ * entry installed for the previously cached LDR value, if any.
+ *
+ * Returns 0 on success, non-zero on failure (including a zero LDR,
+ * which targets nothing and is treated as "do not forward").
+ */
+static int avic_handle_ldr_update(struct vcpu *v)
+{
+    int ret = 0;
+    struct svm_domain *d = &v->domain->arch.hvm_domain.svm;
+    u32 ldr = *avic_get_bk_page_entry(v, APIC_LDR);
+    u32 apic_id = (*avic_get_bk_page_entry(v, APIC_ID) >> 24);
+
+    if ( !ldr )
+        return 1;
+
+    ret = avic_ldr_write(v, apic_id, ldr, true);
+    if ( ret && d->ldr_reg )
+    {
+        /* New LDR is invalid: drop the stale mapping for the old one. */
+        avic_ldr_write(v, 0, d->ldr_reg, false);
+        d->ldr_reg = 0;
+    }
+    else
+    {
+        d->ldr_reg = ldr;
+    }
+
+    return ret;
+}
+
+/*
+ * Handle a guest write of APIC_ID: move the vcpu's physical APIC ID
+ * table entry from the cached slot to the slot for the new ID, then
+ * refresh any dependent logical APIC ID table entry.
+ *
+ * NOTE(review): the @init parameter is currently unused — confirm
+ * whether it is needed or should be dropped.
+ */
+static int avic_handle_apic_id_update(struct vcpu *v, bool init)
+{
+    struct arch_svm_struct *s = &v->arch.hvm_svm;
+    struct svm_domain *d = &v->domain->arch.hvm_domain.svm;
+    u32 apic_id_reg = *avic_get_bk_page_entry(v, APIC_ID);
+    u32 id = (apic_id_reg >> 24) & 0xff;
+    struct svm_avic_phy_ait_entry *old, *new;
+
+    old = s->avic_phy_id_cache;
+    new = avic_get_phy_ait_entry(v, id);
+    if ( !new || !old )
+        return 0;
+
+    /* We need to move physical_id_entry to new offset */
+    *new = *old;
+    *((u64 *)old) = 0ULL;
+    s->avic_phy_id_cache = new;
+
+    /*
+     * Also update the guest physical APIC ID in the logical
+     * APIC ID table entry if the LDR has already been set up.
+     */
+    if ( d->ldr_reg )
+        avic_handle_ldr_update(v);
+
+    return 0;
+}
+
+/*
+ * Handle a guest write of APIC_DFR (destination format, flat vs
+ * cluster).  A mode change invalidates the whole logical APIC ID
+ * table, which is then repopulated from the current LDR.
+ */
+static int avic_handle_dfr_update(struct vcpu *v)
+{
+    struct svm_domain *d = &v->domain->arch.hvm_domain.svm;
+    u32 dfr = *avic_get_bk_page_entry(v, APIC_DFR);
+    u32 mod = (dfr >> 28) & 0xf;
+
+    /*
+     * We assume that all local APICs are using the same type.
+     * If this changes, we need to flush the AVIC logical
+     * APIC ID table.
+     */
+    if ( d->ldr_mode == mod )
+        return 0;
+
+    clear_domain_page(_mfn(d->avic_log_ait_mfn));
+    d->ldr_mode = mod;
+    if ( d->ldr_reg )
+        avic_handle_ldr_update(v);
+
+    return 0;
+}
+
+/*
+ * Emulate a trapped (post-completion) write to an APIC register:
+ * first sync any AVIC state derived from the register, then forward
+ * the value already latched in the backing page to the vlapic layer.
+ *
+ * Returns 1 on success, 0 when the derived-state update failed and
+ * the write must not be forwarded.
+ */
+static int avic_unaccel_trap_write(struct vcpu *v)
+{
+    struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb;
+    u32 offset = vmcb->exitinfo1 & AVIC_UNACCEL_ACCESS_OFFSET_MASK;
+    u32 reg = *avic_get_bk_page_entry(v, offset);
+
+    switch ( offset )
+    {
+    case APIC_ID:
+        if ( avic_handle_apic_id_update(v, false) )
+            return 0;
+        break;
+    case APIC_LDR:
+        if ( avic_handle_ldr_update(v) )
+            return 0;
+        break;
+    case APIC_DFR:
+        avic_handle_dfr_update(v);
+        break;
+    default:
+        break;
+    }
+
+    vlapic_reg_write(v, offset, reg);
+
+    return 1;
+}
+
+/*
+ * Handler for #VMEXIT(0x402), taken on a non-accelerated access to
+ * the guest APIC backing page.  EXITINFO1 holds the register offset
+ * and the access type (bit 32: 1 = trap, i.e. completed write;
+ * 0 = fault); EXITINFO2 bits 31:0 hold the vector.
+ */
+void svm_avic_vmexit_do_noaccel(struct cpu_user_regs *regs)
+{
+    struct vcpu *v = current;
+    struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb;
+    /* Reuse the mask introduced above instead of a bare 0xFF0. */
+    u32 offset = vmcb->exitinfo1 & AVIC_UNACCEL_ACCESS_OFFSET_MASK;
+    u32 rw = (vmcb->exitinfo1 >> 32) & 0x1;
+    u32 vector = vmcb->exitinfo2 & 0xFFFFFFFF;
+
+    dprintk(XENLOG_DEBUG,
+           "SVM: %s: offset=%#x, rw=%#x, vector=%#x, vcpu_id=%#x, cpu=%#x\n",
+           __func__, offset, rw, vector, v->vcpu_id, v->processor);
+
+    switch ( offset )
+    {
+    case APIC_ID:
+    case APIC_EOI:
+    case APIC_RRR:
+    case APIC_LDR:
+    case APIC_DFR:
+    case APIC_SPIV:
+    case APIC_ESR:
+    case APIC_ICR:
+    case APIC_LVTT:
+    case APIC_LVTTHMR:
+    case APIC_LVTPC:
+    case APIC_LVT0:
+    case APIC_LVT1:
+    case APIC_LVTERR:
+    case APIC_TMICT:
+    case APIC_TDCR:
+        /* Handling Trap: trap reads should never happen. */
+        BUG_ON(!rw);
+        avic_unaccel_trap_write(v);
+        break;
+    default:
+        /*
+         * Handling Fault: for a read, refresh the backing page from
+         * the vlapic state before re-executing the instruction.
+         */
+        if ( !rw )
+            *avic_get_bk_page_entry(v, offset) = vlapic_read_aligned(
+                                                        vcpu_vlapic(v), offset);
+
+        hvm_mem_access_emulate_one(EMUL_KIND_NORMAL, TRAP_invalid_op,
+                                       HVM_DELIVER_NO_ERROR_CODE);
+    }
+}
diff --git a/xen/arch/x86/hvm/svm/svm.c b/xen/arch/x86/hvm/svm/svm.c
index bcb7df4..409096a 100644
--- a/xen/arch/x86/hvm/svm/svm.c
+++ b/xen/arch/x86/hvm/svm/svm.c
@@ -2710,6 +2710,14 @@  void svm_vmexit_handler(struct cpu_user_regs *regs)
         svm_vmexit_do_pause(regs);
         break;
 
+    case VMEXIT_AVIC_INCOMP_IPI:
+        svm_avic_vmexit_do_incomp_ipi(regs);
+        break;
+
+    case VMEXIT_AVIC_NOACCEL:
+        svm_avic_vmexit_do_noaccel(regs);
+        break;
+
     default:
     unexpected_exit_type:
         gdprintk(XENLOG_ERR, "unexpected VMEXIT: exit reason = %#"PRIx64", "
diff --git a/xen/include/asm-x86/hvm/svm/avic.h b/xen/include/asm-x86/hvm/svm/avic.h
index 9508486..2c501d4 100644
--- a/xen/include/asm-x86/hvm/svm/avic.h
+++ b/xen/include/asm-x86/hvm/svm/avic.h
@@ -37,4 +37,7 @@  bool_t svm_avic_vcpu_enabled(struct vcpu *v);
 void svm_avic_update_vapic_bar(struct vcpu *v,uint64_t data);
 int svm_avic_init_vmcb(struct vcpu *v);
 
+void svm_avic_vmexit_do_incomp_ipi(struct cpu_user_regs *regs);
+void svm_avic_vmexit_do_noaccel(struct cpu_user_regs *regs);
+
 #endif /* _SVM_AVIC_H_ */
diff --git a/xen/include/asm-x86/hvm/svm/vmcb.h b/xen/include/asm-x86/hvm/svm/vmcb.h
index a42c034..23eb86b 100644
--- a/xen/include/asm-x86/hvm/svm/vmcb.h
+++ b/xen/include/asm-x86/hvm/svm/vmcb.h
@@ -302,6 +302,8 @@  enum VMEXIT_EXITCODE
     VMEXIT_MWAIT_CONDITIONAL= 140, /* 0x8c */
     VMEXIT_XSETBV           = 141, /* 0x8d */
     VMEXIT_NPF              = 1024, /* 0x400, nested paging fault */
+    VMEXIT_AVIC_INCOMP_IPI  = 1025, /* 0x401 */
+    VMEXIT_AVIC_NOACCEL     = 1026, /* 0x402 */
     VMEXIT_INVALID          =  -1
 };