
[v7] x86/emulate: Send vm_event from emulate

Message ID 20190703105639.23081-1-aisaila@bitdefender.com (mailing list archive)
State Superseded
Series [v7] x86/emulate: Send vm_event from emulate

Commit Message

Alexandru Stefan ISAILA July 3, 2019, 10:56 a.m. UTC
A/D bit writes (on page walks) can be considered benign by an introspection
agent, so receiving vm_events for them is a pessimization. We try here to
optimize by filtering these events out.
Currently, we are fully emulating the instruction at RIP when the hardware sees
an EPT fault with npfec.kind != npfec_kind_with_gla. This is, however,
incorrect, because the instruction at RIP might legitimately cause an
EPT fault of its own while accessing a _different_ page from the original one,
where A/D were set.
The solution is to perform the whole emulation, while ignoring EPT restrictions
for the walk part, and taking them into account for the "actual" emulating of
the instruction at RIP. When we send out a vm_event, we don't want the emulation
to complete, since in that case we won't be able to veto whatever it is doing.
That would mean that we can't actually prevent any malicious activity, instead
we'd only be able to report on it.
When we see a "send-vm_event" case while emulating, we need to first send the
event out and then stop the emulation (return X86EMUL_RETRY).
After the emulation stops we'll call hvm_vm_event_do_resume() again after the
introspection agent treats the event and resumes the guest. There, the
instruction at RIP will be fully emulated (with the EPT ignored) if the
introspection application allows it, and the guest will continue to run past
the instruction.

We use hvmemul_map_linear_addr() to intercept r/w access and
__hvm_copy() to intercept exec access.

hvm_emulate_send_vm_event() can return false if there was no violation,
if there was an error from monitor_traps() or p2m_get_mem_access().
Returning false if p2m_get_mem_access() is of because this will happen
if it was called with a bad address or if the entry was not found in the
EPT in which case it is unrestricted.
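
For illustration, the check described above boils down to roughly the
following sketch. This is reconstructed from the description and the hunks
quoted in the review below, not the patch's verbatim code; the exact
p2m_get_mem_access() signature and the set of access cases handled here
are assumptions:

    /* Sketch: send a mem_access vm_event if the emulated access at
     * gla/gfn violates the page's p2m_access restrictions. */
    bool hvm_emulate_send_vm_event(unsigned long gla, gfn_t gfn,
                                   uint32_t pfec)
    {
        xenmem_access_t access;
        vm_event_request_t req = {};
        paddr_t gpa = gfn_to_gaddr(gfn) | (gla & ~PAGE_MASK);

        current->arch.vm_event->send_event = false;

        /* Error / no entry found => the page is unrestricted. */
        if ( p2m_get_mem_access(current->domain, gfn, &access) != 0 )
            return false;

        switch ( access )
        {
        case XENMEM_access_x:      /* writes restricted */
        case XENMEM_access_rx:
            if ( pfec & PFEC_write_access )
                req.u.mem_access.flags = MEM_ACCESS_R | MEM_ACCESS_W;
            break;

        case XENMEM_access_rw:     /* fetches restricted */
            if ( pfec & PFEC_insn_fetch )
                req.u.mem_access.flags = MEM_ACCESS_X;
            break;

        default:                   /* remaining cases elided in this sketch */
            break;
        }

        if ( !req.u.mem_access.flags )
            return false;          /* no violation */

        req.reason = VM_EVENT_REASON_MEM_ACCESS;
        req.u.mem_access.gfn = gfn_x(gfn);
        req.u.mem_access.gla = gla;
        req.u.mem_access.flags |= MEM_ACCESS_GLA_VALID;
        req.u.mem_access.offset = gpa & ~PAGE_MASK;

        /* A negative rc means the event could not be sent. */
        return monitor_traps(current, true, &req) >= 0;
    }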

Signed-off-by: Alexandru Isaila <aisaila@bitdefender.com>

---
Changes since V6:
	- Add comment for hvm_emulate_send_vm_event()
	- Use gfn_to_gaddr()
	- Move send_event flag to arch.vm_event
	- Remove send_event param from hvm_emulate_send_vm_event()
	- Remove send_event and pfec check from
hvm_emulate_send_vm_event()
	- Cover all cases and remove default in switch ( access )
	- Move hvm_emulate_send_vm_event() call out of PFEC_write_access
	- Add send_event check before hvm_emulate_send_vm_event() call
	- Set vm_event->send_event flag before every
hvm_emulate_one_vm_event() call and clear it in
hvm_emulate_send_vm_event()
	- Dropped Paul's review.
---
 xen/arch/x86/hvm/emulate.c        | 78 ++++++++++++++++++++++++++++++-
 xen/arch/x86/hvm/hvm.c            |  8 ++++
 xen/arch/x86/hvm/vm_event.c       |  1 +
 xen/arch/x86/mm/mem_access.c      |  1 +
 xen/include/asm-x86/hvm/emulate.h |  4 ++
 xen/include/asm-x86/vm_event.h    |  2 +
 6 files changed, 93 insertions(+), 1 deletion(-)

Comments

Tamas K Lengyel July 11, 2019, 5:13 p.m. UTC | #1
> @@ -629,6 +697,14 @@ static void *hvmemul_map_linear_addr(
>
>              ASSERT(p2mt == p2m_ram_logdirty || !p2m_is_readonly(p2mt));
>          }
> +
> +        if ( curr->arch.vm_event &&
> +            curr->arch.vm_event->send_event &&

Why not fold these checks into hvm_emulate_send_vm_event since..

> +            hvm_emulate_send_vm_event(addr, gfn, pfec) )
> +        {
> +            err = ERR_PTR(~X86EMUL_RETRY);
> +            goto out;
> +        }
>      }
>
>      /* Entire access within a single frame? */
> diff --git a/xen/arch/x86/hvm/hvm.c b/xen/arch/x86/hvm/hvm.c
> index 029eea3b85..783ebc3525 100644
> --- a/xen/arch/x86/hvm/hvm.c
> +++ b/xen/arch/x86/hvm/hvm.c
> @@ -3224,6 +3224,14 @@ static enum hvm_translation_result __hvm_copy(
>              return HVMTRANS_bad_gfn_to_mfn;
>          }
>
> +        if ( unlikely(v->arch.vm_event) &&
> +            v->arch.vm_event->send_event &&

.. you seem to just repeat them here again?

> +            hvm_emulate_send_vm_event(addr, gfn, pfec) )
> +        {
> +            put_page(page);
> +            return HVMTRANS_gfn_paged_out;
> +        }
> +
>          p = (char *)__map_domain_page(page) + (addr & ~PAGE_MASK);
>
>          if ( flags & HVMCOPY_to_guest )
Jan Beulich July 12, 2019, 1:28 a.m. UTC | #2
On 11.07.2019 19:13, Tamas K Lengyel wrote:
>> @@ -629,6 +697,14 @@ static void *hvmemul_map_linear_addr(
>>
>>               ASSERT(p2mt == p2m_ram_logdirty || !p2m_is_readonly(p2mt));
>>           }
>> +
>> +        if ( curr->arch.vm_event &&
>> +            curr->arch.vm_event->send_event &&
> 
> Why not fold these checks into hvm_emulate_send_vm_event since..

I had asked for at least the first of the checks to be pulled
out of the function, for the common case to be affected as
little as possible.

>> --- a/xen/arch/x86/hvm/hvm.c
>> +++ b/xen/arch/x86/hvm/hvm.c
>> @@ -3224,6 +3224,14 @@ static enum hvm_translation_result __hvm_copy(
>>               return HVMTRANS_bad_gfn_to_mfn;
>>           }
>>
>> +        if ( unlikely(v->arch.vm_event) &&
>> +            v->arch.vm_event->send_event &&
> 
> .. you seem to just repeat them here again?

I agree that the duplication makes no sense.

Jan
Alexandru Stefan ISAILA July 15, 2019, 8:52 a.m. UTC | #3
On 12.07.2019 04:28, Jan Beulich wrote:
> On 11.07.2019 19:13, Tamas K Lengyel wrote:
>>> @@ -629,6 +697,14 @@ static void *hvmemul_map_linear_addr(
>>>
>>>                ASSERT(p2mt == p2m_ram_logdirty || !p2m_is_readonly(p2mt));
>>>            }
>>> +
>>> +        if ( curr->arch.vm_event &&
>>> +            curr->arch.vm_event->send_event &&
>>
>> Why not fold these checks into hvm_emulate_send_vm_event since..
> 
> I had asked for at least the first of the checks to be pulled
> out of the function, for the common case to be affected as
> little as possible.
> 
>>> --- a/xen/arch/x86/hvm/hvm.c
>>> +++ b/xen/arch/x86/hvm/hvm.c
>>> @@ -3224,6 +3224,14 @@ static enum hvm_translation_result __hvm_copy(
>>>                return HVMTRANS_bad_gfn_to_mfn;
>>>            }
>>>
>>> +        if ( unlikely(v->arch.vm_event) &&
>>> +            v->arch.vm_event->send_event &&
>>
>> .. you seem to just repeat them here again?
> 
> I agree that the duplication makes no sense.
> 

The first check is on the hvmemul_map_linear_addr() path and the second 
is on the hvmemul_insn_fetch() path. There are 2 distinct ways to reach 
that if, and therefore the check is not duplicated.

Thanks,
Alex
Jan Beulich July 18, 2019, 12:58 p.m. UTC | #4
On 03.07.2019 12:56, Alexandru Stefan ISAILA wrote:
> A/D bit writes (on page walks) can be considered benign by an introspection
> agent, so receiving vm_events for them is a pessimization. We try here to
> optimize by filtering these events out.

But you add the sending of more events - how does "filter out" match
the actual implementation?

> Currently, we are fully emulating the instruction at RIP when the hardware sees
> an EPT fault with npfec.kind != npfec_kind_with_gla. This is, however,
> incorrect, because the instruction at RIP might legitimately cause an
> EPT fault of its own while accessing a _different_ page from the original one,
> where A/D were set.
> The solution is to perform the whole emulation,

Above you said fully emulating such an insn is incorrect. To me the
two statements contradict one another.

> while ignoring EPT restrictions
> for the walk part, and taking them into account for the "actual" emulating of
> the instruction at RIP.

So the "ignore" part here is because the walk doesn't currently send
any events? That's an omission after all, which ultimately wants to
get fixed. This in turn makes me wonder whether there couldn't be
cases where a monitor actually wants to see these violations, too.
After all one may be able to abuse the page walker to set bits in
places you actually care to protect from undue modification.

> When we send out a vm_event, we don't want the emulation
> to complete, since in that case we won't be able to veto whatever it is doing.
> That would mean that we can't actually prevent any malicious activity, instead
> we'd only be able to report on it.
> When we see a "send-vm_event" case while emulating, we need to first send the
> event out and then stop the emulation (return X86EMUL_RETRY).

Perhaps better "suspend" instead of "stop"?

> After the emulation stops we'll call hvm_vm_event_do_resume() again after the
> introspection agent treats the event and resumes the guest. There, the
> instruction at RIP will be fully emulated (with the EPT ignored) if the
> introspection application allows it, and the guest will continue to run past
> the instruction.
> 
> We use hvmemul_map_linear_addr() to intercept r/w access and
> __hvm_copy() to intercept exec access.

Btw I continue to be unhappy about this asymmetry. Furthermore in
the former case you only handle write and rmw accesses, but not
reads afaics. I assume you don't care about reads, but this should
then be made explicit. Furthermore EPT allows read protection, and
there are p2m_access_w, p2m_access_wx, and p2m_access_x, so I guess
ignoring reads can at best be an option picked by the monitor, not
something to be left out of the interface altogether.

> hvm_emulate_send_vm_event() can return false if there was no violation,
> if there was an error from monitor_traps() or p2m_get_mem_access().

As said before - I don't think errors and lack of a violation can
sensibly be treated the same way. Is the implication perhaps that
emulation then will fail later anyway? If so, is such an
assumption taking into consideration possible races?

> Returning false if p2m_get_mem_access() is of because this will happen
> if it was called with a bad address or if the entry was not found in the
> EPT in which case it is unrestricted.

I'm afraid I'm having trouble understanding this. I'm in particular
heavily confused by the "of" in the middle.

> @@ -530,6 +532,71 @@ static int hvmemul_do_mmio_addr(paddr_t mmio_gpa,
>       return hvmemul_do_io_addr(1, mmio_gpa, reps, size, dir, df, ram_gpa);
>   }
>   
> +/*
> + * Send memory access vm_events based on pfec. Returns true if the event was
> + * sent and false for p2m_get_mem_access() error, no violation and event send
> + * error. Depends on arch.vm_event->send_event.

Instead of "depends", do you perhaps mean "assumes the caller to check"?
In which case you may want to ASSERT() this here to document the
requirement?

> + * NOTE: p2m_get_mem_access() can fail for wrong address or if the entry

What is "wrong address" here? IOW how is this different from "entry not
found"?

> @@ -629,6 +697,14 @@ static void *hvmemul_map_linear_addr(
>   
>               ASSERT(p2mt == p2m_ram_logdirty || !p2m_is_readonly(p2mt));
>           }
> +
> +        if ( curr->arch.vm_event &&
> +            curr->arch.vm_event->send_event &&
> +            hvm_emulate_send_vm_event(addr, gfn, pfec) )

Indentation looks off by one here.

> +        {
> +            err = ERR_PTR(~X86EMUL_RETRY);
> +            goto out;
> +        }

Did you notice that there's an immediate exit from the loop only
in case the linear -> physical translation fails? This is
relevant for page fault delivery correctness for accesses
crossing page boundaries. I think you want to use
update_map_err() and drop the "goto out". I can't really make up
my mind on the correct interaction between your new if() and the
one immediately ahead of it. You will want to think this through.

> --- a/xen/arch/x86/hvm/hvm.c
> +++ b/xen/arch/x86/hvm/hvm.c
> @@ -3224,6 +3224,14 @@ static enum hvm_translation_result __hvm_copy(
>               return HVMTRANS_bad_gfn_to_mfn;
>           }
>   
> +        if ( unlikely(v->arch.vm_event) &&
> +            v->arch.vm_event->send_event &&
> +            hvm_emulate_send_vm_event(addr, gfn, pfec) )

Indentation looks wrong again.

> +        {
> +            put_page(page);
> +            return HVMTRANS_gfn_paged_out;

Why "paged out"? If this is an intentional abuse, then you want
to say so in a comment and justify the abuse here or in the
description.

> --- a/xen/arch/x86/hvm/vm_event.c
> +++ b/xen/arch/x86/hvm/vm_event.c
> @@ -86,6 +86,7 @@ void hvm_vm_event_do_resume(struct vcpu *v)
>                     VM_EVENT_FLAG_SET_EMUL_INSN_DATA )
>               kind = EMUL_KIND_SET_CONTEXT_INSN;
>   
> +        v->arch.vm_event->send_event = false;
>           hvm_emulate_one_vm_event(kind, TRAP_invalid_op,
>                                    X86_EVENT_NO_EC);

Is this insertion meaning to use "true" instead, or is the
revision log entry wrong? Or does "set" there not necessarily
mean "set it to true", but just "set it to a deterministic
value" (in which case "initialize" would have been an
unambiguous alternative wording)?

Jan
Alexandru Stefan ISAILA July 19, 2019, 12:34 p.m. UTC | #5
On 18.07.2019 15:58, Jan Beulich wrote:
> On 03.07.2019 12:56, Alexandru Stefan ISAILA wrote:
>> A/D bit writes (on page walks) can be considered benign by an introspection
>> agent, so receiving vm_events for them is a pessimization. We try here to
>> optimize by filtering these events out.
> 
> But you add the sending of more events - how does "filter out" match
> the actual implementation?

The events are sent only if there is a mem access violation therefore we 
are filtering and only sending the events that are interesting to 
introspection.

> 
>> Currently, we are fully emulating the instruction at RIP when the hardware sees
>> an EPT fault with npfec.kind != npfec_kind_with_gla. This is, however,
>> incorrect, because the instruction at RIP might legitimately cause an
>> EPT fault of its own while accessing a _different_ page from the original one,
>> where A/D were set.
>> The solution is to perform the whole emulation,
> 
> Above you said fully emulating such an insn is incorrect. To me the
> two statements contradict one another.
> 
>> while ignoring EPT restrictions
>> for the walk part, and taking them into account for the "actual" emulating of
>> the instruction at RIP.
> 
> So the "ignore" part here is because the walk doesn't currently send
> any events? That's an omission after all, which ultimately wants to
> get fixed. This in turn makes me wonder whether there couldn't be
> cases where a monitor actually wants to see these violations, too.
> After all one may be able to abuse the page walker to set bits in
> places you actually care to protect from undue modification.

There is no need for events from page walk. Further work will have to be 
done when the page walk starts sending events, so that we can toggle that 
new feature on/off.

After this patch the filtering + emulate + send event is controlled by
d->arch.monitor.inguest_pagefault_disabled.

> 
>> When we send out a vm_event, we don't want the emulation
>> to complete, since in that case we won't be able to veto whatever it is doing.
>> That would mean that we can't actually prevent any malicious activity, instead
>> we'd only be able to report on it.
>> When we see a "send-vm_event" case while emulating, we need to first send the
>> event out and then stop the emulation (return X86EMUL_RETRY).
> 
> Perhaps better "suspend" instead of "stop"?

I will change to suspend in the next version.

> 
>> After the emulation stops we'll call hvm_vm_event_do_resume() again after the
>> introspection agent treats the event and resumes the guest. There, the
>> instruction at RIP will be fully emulated (with the EPT ignored) if the
>> introspection application allows it, and the guest will continue to run past
>> the instruction.
>>
>> We use hvmemul_map_linear_addr() to intercept r/w access and
>> __hvm_copy() to intercept exec access.
> 
> Btw I continue to be unhappy about this asymmetry. Furthermore in
> the former case you only handle write and rmw accesses, but not
> reads afaics. I assume you don't care about reads, but this should
> then be made explicit. Furthermore EPT allows read protection, and
> there are p2m_access_w, p2m_access_wx, and p2m_access_x, so I guess
> ignoring reads can at best be an option picked by the monitor, not
> something to be left out of the interface altogether.

That is correct, we are not interested in read events but there is 
another problem, we are checking access and pfec to fill the event flag 
and pfec only has a write flag (PFEC_write_access); in __hvmemul_read() 
pfec only gets PFEC_page_present and there is no way to differentiate 
write from read.

> 
>> hvm_emulate_send_vm_event() can return false if there was no violation,
>> if there was an error from monitor_traps() or p2m_get_mem_access().
> 
> As said before - I don't think errors and lack of a violation can
> sensibly be treated the same way. Is the implication perhaps that
> emulation then will fail later anyway? If so, is such an
> assumption taking into consideration possible races?

The only place that I can see a problem is the error from 
monitor_traps(). That can be checked and accompanied by a warning msg.
Or if you can give me a different idea to go forward with this issue I 
will be glad to review it.

> 
>> Returning false if p2m_get_mem_access() is of because this will happen
>> if it was called with a bad address or if the entry was not found in the
>> EPT in which case it is unrestricted.
> 
> I'm afraid I'm having trouble understanding this. I'm in particular
> heavily confused by the "of" in the middle.

Sorry about the typo, it should be like

"Returning false if p2m_get_mem_access() fails is because the call was 
made with a bad address or if the entry was not found in the
EPT in which case it is unrestricted."

In any case the failure from p2m_get_mem_access() will translate into no 
violation.

> 
>> @@ -530,6 +532,71 @@ static int hvmemul_do_mmio_addr(paddr_t mmio_gpa,
>>        return hvmemul_do_io_addr(1, mmio_gpa, reps, size, dir, df, ram_gpa);
>>    }
>>    
>> +/*
>> + * Send memory access vm_events based on pfec. Returns true if the event was
>> + * sent and false for p2m_get_mem_access() error, no violation and event send
>> + * error. Depends on arch.vm_event->send_event.
> 
> Instead of "depends", do you perhaps mean "assumes the caller to check"?
> In which case you may want to ASSERT() this here to document the
> requirement?
> 
>> + * NOTE: p2m_get_mem_access() can fail for wrong address or if the entry
> 
> What is "wrong address" here? IOW how is this different from "entry not
> found"?
> 
>> @@ -629,6 +697,14 @@ static void *hvmemul_map_linear_addr(
>>    
>>                ASSERT(p2mt == p2m_ram_logdirty || !p2m_is_readonly(p2mt));
>>            }
>> +
>> +        if ( curr->arch.vm_event &&
>> +            curr->arch.vm_event->send_event &&
>> +            hvm_emulate_send_vm_event(addr, gfn, pfec) )
> 
> Indentation looks off by one here.

I saw that just now, I will add the missing spaces here and in the next 
place.

> 
>> +        {
>> +            err = ERR_PTR(~X86EMUL_RETRY);
>> +            goto out;
>> +        }
> 
> Did you notice that there's an immediate exit from the loop only
> in case the linear -> physical translation fails? This is
> relevant for page fault delivery correctness for accesses
> crossing page boundaries. I think you want to use
> update_map_err() and drop the "goto out". I can't really make up

By update_map_err() are you saying to have the err var assigned and then 
drop "goto out"? If so how do I keep the err from my access violation 
without exiting from the loop?

> my mind on the correct interaction between your new if() and the
> one immediately ahead of it. You will want to think this through.
> 
>> --- a/xen/arch/x86/hvm/hvm.c
>> +++ b/xen/arch/x86/hvm/hvm.c
>> @@ -3224,6 +3224,14 @@ static enum hvm_translation_result __hvm_copy(
>>                return HVMTRANS_bad_gfn_to_mfn;
>>            }
>>    
>> +        if ( unlikely(v->arch.vm_event) &&
>> +            v->arch.vm_event->send_event &&
>> +            hvm_emulate_send_vm_event(addr, gfn, pfec) )
> 
> Indentation looks wrong again.
> 
>> +        {
>> +            put_page(page);
>> +            return HVMTRANS_gfn_paged_out;
> 
> Why "paged out"? If this is an intentional abuse, then you want
> to say so in a comment and justify the abuse here or in the
> description.

This is done to clean the paged in before the return.

> 
>> --- a/xen/arch/x86/hvm/vm_event.c
>> +++ b/xen/arch/x86/hvm/vm_event.c
>> @@ -86,6 +86,7 @@ void hvm_vm_event_do_resume(struct vcpu *v)
>>                      VM_EVENT_FLAG_SET_EMUL_INSN_DATA )
>>                kind = EMUL_KIND_SET_CONTEXT_INSN;
>>    
>> +        v->arch.vm_event->send_event = false;
>>            hvm_emulate_one_vm_event(kind, TRAP_invalid_op,
>>                                     X86_EVENT_NO_EC);
> 
> Is this insertion meaning to use "true" instead, or is the
> revision log entry wrong? Or does "set" there not necessarily
> mean "set it to true", but just "set it to a deterministic
> value" (in which case "initialize" would have been an
> unambiguous alternative wording)?

This means to use "true" and send vm_event if there is a need to do so 
in the emulation.



Thanks for the comments,
Alex
Jan Beulich July 19, 2019, 1:18 p.m. UTC | #6
On 19.07.2019 14:34, Alexandru Stefan ISAILA wrote:
> On 18.07.2019 15:58, Jan Beulich wrote:
>> On 03.07.2019 12:56, Alexandru Stefan ISAILA wrote:
>>> A/D bit writes (on page walks) can be considered benign by an introspection
>>> agent, so receiving vm_events for them is a pessimization. We try here to
>>> optimize by filtering these events out.
>>
>> But you add the sending of more events - how does "filter out" match
>> the actual implementation?
> 
> The events are sent only if there is a mem access violation therefore we
> are filtering and only sending the events that are interesting to
> introspection.

Where is it that you prevent any event from being sent? As said,
reading the patch I only see new sending sites to get added.

>>> Currently, we are fully emulating the instruction at RIP when the hardware sees
>>> an EPT fault with npfec.kind != npfec_kind_with_gla. This is, however,
>>> incorrect, because the instruction at RIP might legitimately cause an
>>> EPT fault of its own while accessing a _different_ page from the original one,
>>> where A/D were set.
>>> The solution is to perform the whole emulation,
>>
>> Above you said fully emulating such an insn is incorrect. To me the
>> two statements contradict one another.
>>
>>> while ignoring EPT restrictions
>>> for the walk part, and taking them into account for the "actual" emulating of
>>> the instruction at RIP.
>>
>> So the "ignore" part here is because the walk doesn't currently send
>> any events? That's an omission after all, which ultimately wants to
>> get fixed. This in turn makes me wonder whether there couldn't be
>> cases where a monitor actually wants to see these violations, too.
>> After all one may be able to abuse the page walker to set bits in
>> places you actually care to protect from undue modification.
> 
> There is no need for events from page walk. Further work will have to be
> done when the page walk starts sending events, so that we can toggle that
> new feature on/off.

Please can you move over to thinking in more general terms,
not just what you need for your application. In this case
"There is no need" != "We don't have a need for". And I think
the VM event _interface_ should be arranged in a way that it
already accounts for eventually correct behavior of the page
walk paths.

>>> After the emulation stops we'll call hvm_vm_event_do_resume() again after the
>>> introspection agent treats the event and resumes the guest. There, the
>>> instruction at RIP will be fully emulated (with the EPT ignored) if the
>>> introspection application allows it, and the guest will continue to run past
>>> the instruction.
>>>
>>> We use hvmemul_map_linear_addr() to intercept r/w access and
>>> __hvm_copy() to intercept exec access.
>>
>> Btw I continue to be unhappy about this asymmetry. Furthermore in
>> the former case you only handle write and rmw accesses, but not
>> reads afaics. I assume you don't care about reads, but this should
>> then be made explicit. Furthermore EPT allows read protection, and
>> there are p2m_access_w, p2m_access_wx, and p2m_access_x, so I guess
>> ignoring reads can at best be an option picked by the monitor, not
>> something to be left out of the interface altogether.
> 
> That is correct, we are not interested in read events but there is
> another problem, we are checking access and pfec to fill the event flag
> and pfec only has a write flag (PFEC_write_access); in __hvmemul_read()
> pfec only gets PFEC_page_present and there is no way to differentiate
> write from read.

By the PFEC model, anything that's not a write or insn fetch is a
read. The main anomaly is elsewhere: The write flag is also going
to be set for RMW operations.
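
In code form, the rule stated here is just (a sketch; PFEC_* are Xen's
existing page fault error code constants):

    bool is_fetch = pfec & PFEC_insn_fetch;
    bool is_write = pfec & PFEC_write_access;  /* also set for RMW ops */
    bool is_read  = !is_fetch && !is_write;    /* everything else */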

>>> hvm_emulate_send_vm_event() can return false if there was no violation,
>>> if there was an error from monitor_traps() or p2m_get_mem_access().
>>
>> As said before - I don't think errors and lack of a violation can
>> sensibly be treated the same way. Is the implication perhaps that
>> emulation then will fail later anyway? If so, is such an
>> assumption taking into consideration possible races?
> 
> The only place that I can see a problem is the error from
> monitor_traps(). That can be checked and accompanied by a warning msg.

How would a warning message help?

> Or if you can give me a different idea to go forward with this issue I
> will be glad to review it.

I'm afraid you'll have to first of all give me an idea what the
correct action is in case of such an error. And once you've done
so, I'm pretty sure you'll recognize yourself whether the current
code you have is appropriate (and I'll then know whether I want
to insist on you changing the code).

>>> @@ -629,6 +697,14 @@ static void *hvmemul_map_linear_addr(
>>>     
>>>                 ASSERT(p2mt == p2m_ram_logdirty || !p2m_is_readonly(p2mt));
>>>             }
>>> +
>>> +        if ( curr->arch.vm_event &&
>>> +            curr->arch.vm_event->send_event &&
>>> +            hvm_emulate_send_vm_event(addr, gfn, pfec) )
>>> +        {
>>> +            err = ERR_PTR(~X86EMUL_RETRY);
>>> +            goto out;
>>> +        }
>>
>> Did you notice that there's an immediate exit from the loop only
>> in case the linear -> physical translation fails? This is
>> relevant for page fault delivery correctness for accesses
>> crossing page boundaries. I think you want to use
>> update_map_err() and drop the "goto out". I can't really make up
> 
> By update_map_err() are you saying to have the err var assigned and then
> drop "goto out"? If so how do I keep the err from my access violation
> without exiting from the loop?

Counter question: Why do you _need_ to keep "your" value of err?
If, just as an example, there's going to be a #PF on the other
half of the access, then "your" access violation is of no interest
at all.

>>> --- a/xen/arch/x86/hvm/hvm.c
>>> +++ b/xen/arch/x86/hvm/hvm.c
>>> @@ -3224,6 +3224,14 @@ static enum hvm_translation_result __hvm_copy(
>>>                 return HVMTRANS_bad_gfn_to_mfn;
>>>             }
>>>     
>>> +        if ( unlikely(v->arch.vm_event) &&
>>> +            v->arch.vm_event->send_event &&
>>> +            hvm_emulate_send_vm_event(addr, gfn, pfec) )
>>
>> Indentation looks wrong again.
>>
>>> +        {
>>> +            put_page(page);
>>> +            return HVMTRANS_gfn_paged_out;
>>
>> Why "paged out"? If this is an intentional abuse, then you want
>> to say so in a comment and justify the abuse here or in the
>> description.
> 
> This is done to clean the paged in before the return.

What is "the paged in" here?

>>> --- a/xen/arch/x86/hvm/vm_event.c
>>> +++ b/xen/arch/x86/hvm/vm_event.c
>>> @@ -86,6 +86,7 @@ void hvm_vm_event_do_resume(struct vcpu *v)
>>>                       VM_EVENT_FLAG_SET_EMUL_INSN_DATA )
>>>                 kind = EMUL_KIND_SET_CONTEXT_INSN;
>>>     
>>> +        v->arch.vm_event->send_event = false;
>>>             hvm_emulate_one_vm_event(kind, TRAP_invalid_op,
>>>                                      X86_EVENT_NO_EC);
>>
>> Is this insertion meaning to use "true" instead, or is the
>> revision log entry wrong? Or does "set" there not necessarily
>> mean "set it to true", but just "set it to a deterministic
>> value" (in which case "initialize" would have been an
>> unambiguous alternative wording)?
> 
> This means to use "true" and send vm_event if there is a need to do so
> in the emulation.

??? (There's very clearly "false" being written above.)

Jan
Razvan Cojocaru July 19, 2019, 1:30 p.m. UTC | #7
On 7/19/19 4:18 PM, Jan Beulich wrote:
> On 19.07.2019 14:34, Alexandru Stefan ISAILA wrote:
>> On 18.07.2019 15:58, Jan Beulich wrote:
>>> On 03.07.2019 12:56, Alexandru Stefan ISAILA wrote:
>>>> A/D bit writes (on page walks) can be considered benign by an introspection
>>>> agent, so receiving vm_events for them is a pessimization. We try here to
>>>> optimize by filtering these events out.
>>>
>>> But you add the sending of more events - how does "filter out" match
>>> the actual implementation?
>>
>> The events are sent only if there is a mem access violation therefore we
>> are filtering and only sending the events that are interesting to
>> introspection.
> 
> Where is it that you prevent any event from being sent? As said,
> reading the patch I only see new sending sites to get added.

If we don't emulate, we would receive the page-walk-generated events 
_and_ the touching-the-page-the-instruction-is-touching events.

In comparison to the "hardware" case, the case where we emulate the 
instruction with the page walk ignoring the EPT results in less events, 
hence the prevention of some events being sent out.


Thanks,
Razvan
Jan Beulich July 19, 2019, 1:38 p.m. UTC | #8
On 19.07.2019 15:30, Razvan Cojocaru wrote:
> On 7/19/19 4:18 PM, Jan Beulich wrote:
>> On 19.07.2019 14:34, Alexandru Stefan ISAILA wrote:
>>> On 18.07.2019 15:58, Jan Beulich wrote:
>>>> On 03.07.2019 12:56, Alexandru Stefan ISAILA wrote:
>>>>> A/D bit writes (on page walks) can be considered benign by an introspection
>>>>> agent, so receiving vm_events for them is a pessimization. We try here to
>>>>> optimize by filtering these events out.
>>>>
>>>> But you add the sending of more events - how does "filter out" match
>>>> the actual implementation?
>>>
>>> The events are sent only if there is a mem access violation therefore we
>>> are filtering and only sending the events that are interesting to
>>> introspection.
>>
>> Where is it that you prevent any event from being sent? As said,
>> reading the patch I only see new sending sites to get added.
> 
> If we don't emulate, we would receive the page-walk-generated events
> _and_ the touching-the-page-the-instruction-is-touching events.

Since the patch here alters emulation paths only, how do you know
whether to emulate? In order to not receive undue events it would
seem to me that you'd first have to intercept the guest on insns
of interest ... Overall I think that the patch description, while
it has improved, is still lacking sufficient information for a
person like me (not knowing much about your monitor tools) to be
able to sensibly review this (which includes understanding the
precise scenario you want to improve).

Jan
Razvan Cojocaru July 19, 2019, 2:23 p.m. UTC | #9
On 7/19/19 4:38 PM, Jan Beulich wrote:
> On 19.07.2019 15:30, Razvan Cojocaru wrote:
>> On 7/19/19 4:18 PM, Jan Beulich wrote:
>>> On 19.07.2019 14:34, Alexandru Stefan ISAILA wrote:
>>>> On 18.07.2019 15:58, Jan Beulich wrote:
>>>>> On 03.07.2019 12:56, Alexandru Stefan ISAILA wrote:
>>>>>> A/D bit writes (on page walks) can be considered benign by an introspection
>>>>>> agent, so receiving vm_events for them is a pessimization. We try here to
>>>>>> optimize by filtering these events out.
>>>>>
>>>>> But you add the sending of more events - how does "filter out" match
>>>>> the actual implementation?
>>>>
>>>> The events are sent only if there is a mem access violation
>>>> are filtering and only sending the events that are interesting to
>>>> introspection.
>>>
>>> Where is it that you prevent any event from being sent? As said,
>>> reading the patch I only see new sending sites to get added.
>>
>> If we don't emulate, we would receive the page-walk-generated events
>> _and_ the touching-the-page-the-instruction-is-touching events.
> 
> Since the patch here alters emulation paths only, how do you know
> whether to emulate? In order to not receive undue events it would
> seem to me that you'd first have to intercept the guest on insns
> of interest ... Overall I think that the patch description, while
> it has improved, is still lacking sufficient information for a
> person like me (not knowing much about your monitor tools) to be
> able to sensibly review this (which includes understanding the
> precise scenario you want to improve).

If the hardware exits because of an EPT fault caused by a page walk, we 
end up in p2m_mem_access_check(), at which point we need to decide if we 
want to send out a vm_event or not.

If we were to send out this vm_event, and it would then be magically 
treated so that we get to actually run the instruction at RIP, said 
instruction might also hit a protected page and provoke a vm_event.

Now, if npfec.kind != npfec_kind_with_gla, then we're in the page walk 
case, and so in this case only, and only if 
d->arch.monitor.inguest_pagefault_disabled is true, we would choose to 
do this emulation trick: emulate _the_page_walk_ while ignoring the EPT, 
but don't ignore the EPT for the emulation of the actual instruction.

So where in the first case we would have 2 EPT events, in the second we 
only have one (or if the instruction at RIP does not trigger an EPT 
event, we would have 1 event in the first case, and none in the second).
Hence the filtering mentioned.

So to answer your question: "how do you know whether to emulate", we do 
so only if npfec.kind != npfec_kind_with_gla && 
d->arch.monitor.inguest_pagefault_disabled.

I hope this clears it up somewhat.
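
Schematically, that decision amounts to something like the following in
p2m_mem_access_check() (a sketch of the intent, not necessarily the
patch's exact code or placement):

    /* EPT fault caused by the guest's own page walk, and the monitor asked
     * not to see page-fault-induced events: emulate instead of sending the
     * walk's vm_event. */
    if ( npfec.kind != npfec_kind_with_gla &&
         d->arch.monitor.inguest_pagefault_disabled )
    {
        v->arch.vm_event->send_event = true;  /* re-checked while emulating */
        hvm_emulate_one_vm_event(EMUL_KIND_NORMAL, TRAP_invalid_op,
                                 X86_EVENT_NO_EC);
        return true;                          /* no event for the walk itself */
    }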


Thanks,
Razvan
Alexandru Stefan ISAILA July 22, 2019, 7:51 a.m. UTC | #10
On 19.07.2019 16:18, Jan Beulich wrote:
> On 19.07.2019 14:34, Alexandru Stefan ISAILA wrote:
>> On 18.07.2019 15:58, Jan Beulich wrote:
>>> On 03.07.2019 12:56, Alexandru Stefan ISAILA wrote:
>>>> A/D bit writes (on page walks) can be considered benign by an introspection
>>>> agent, so receiving vm_events for them is a pessimization. We try here to
>>>> optimize by filtering these events out.
>>>
>>> But you add the sending of more events - how does "filter out" match
>>> the actual implementation?
>>
>> The events are sent only if there is a mem access violation therefore we
>> are filtering and only sending the events that are interesting to
>> introspection.
> 
> Where is it that you prevent any event from being sent? As said,
> reading the patch I only see new sending sites to get added.
> 
>>>> Currently, we are fully emulating the instruction at RIP when the hardware sees
>>>> an EPT fault with npfec.kind != npfec_kind_with_gla. This is, however,
>>>> incorrect, because the instruction at RIP might legitimately cause an
>>>> EPT fault of its own while accessing a _different_ page from the original one,
>>>> where A/D were set.
>>>> The solution is to perform the whole emulation,
>>>
>>> Above you said fully emulating such an insn is incorrect. To me the
>>> two statements contradict one another.
>>>
>>>> while ignoring EPT restrictions
>>>> for the walk part, and taking them into account for the "actual" emulating of
>>>> the instruction at RIP.
>>>
>>> So the "ignore" part here is because the walk doesn't currently send
>>> any events? That's an omission after all, which ultimately wants to
>>> get fixed. This in turn makes me wonder whether there couldn't be
>>> cases where a monitor actually wants to see these violations, too.
>>> After all one may be able to abuse the page walker to set bits in
>>> places you actually care to protect from undue modification.
>>
>> There is no need for events from page walk. Further work will have to be
>> done when the page walk starts sending events, so that we can toggle that
>> new feature on/off.
> 
> Please can you move over to thinking in more general terms,
> not just what you need for your application. In this case
> "There is no need" != "We don't have a need for". And I think
> the VM event _interface_ should be arranged in a way that it
> already accounts for eventually correct behavior of the page
> walk paths.
> 
>>>> After the emulation stops we'll call hvm_vm_event_do_resume() again after the
>>>> introspection agent treats the event and resumes the guest. There, the
>>>> instruction at RIP will be fully emulated (with the EPT ignored) if the
>>>> introspection application allows it, and the guest will continue to run past
>>>> the instruction.
>>>>
>>>> We use hvmemul_map_linear_addr() to intercept r/w access and
>>>> __hvm_copy() to intercept exec access.
>>>
>>> Btw I continue to be unhappy about this asymmetry. Furthermore in
>>> the former case you only handle write and rmw accesses, but not
>>> reads afaics. I assume you don't care about reads, but this should
>>> then be made explicit. Furthermore EPT allows read protection, and
>>> there are p2m_access_w, p2m_access_wx, and p2m_access_x, so I guess
>>> ignoring reads can at best be an option picked by the monitor, not
>>> something to be left out of the interface altogether.
>>
>> That is correct, we are not interested in read events but there is
>> another problem, we are checking access and pfec to fill the event flag
>> and pfec only has a write flag (PFEC_write_access); in __hvmemul_read()
>> pfec only gets PFEC_page_present and there is no way to differentiate
>> write from read.
> 
> By the PFEC model, anything that's not a write or insn fetch is a
> read. The main anomaly is elsewhere: The write flag is also going
> to be set for RMW operations.
> 
>>>> hvm_emulate_send_vm_event() can return false if there was no violation,
>>>> if there was an error from monitor_traps() or p2m_get_mem_access().
>>>
>>> As said before - I don't think errors and lack of a violation can
>>> sensibly be treated the same way. Is the implication perhaps that
>>> emulation then will fail later anyway? If so, is such an
>>> assumption taking into consideration possible races?
>>
>> The only place that I can see a problem is the error from
>> monitor_traps(). That can be checked and accompanied by a warning msg.
> 
> How would a warning message help?
> 
>> Or if you can give me a different idea to go forward with this issue I
>> will be glad to review it.
> 
> I'm afraid you'll have to first of all give me an idea what the
> correct action is in case of such an error. And once you've done
> so, I'm pretty sure you'll recognize yourself whether the current
> code you have is appropriate (and I'll then know whether I want
> to insist on you changing the code).
> 
>>>> @@ -629,6 +697,14 @@ static void *hvmemul_map_linear_addr(
>>>>      
>>>>                  ASSERT(p2mt == p2m_ram_logdirty || !p2m_is_readonly(p2mt));
>>>>              }
>>>> +
>>>> +        if ( curr->arch.vm_event &&
>>>> +            curr->arch.vm_event->send_event &&
>>>> +            hvm_emulate_send_vm_event(addr, gfn, pfec) )
>>>> +        {
>>>> +            err = ERR_PTR(~X86EMUL_RETRY);
>>>> +            goto out;
>>>> +        }
>>>
>>> Did you notice that there's an immediate exit from the loop only
>>> in case the linear -> physical translation fails? This is
>>> relevant for page fault delivery correctness for accesses
>>> crossing page boundaries. I think you want to use
>>> update_map_err() and drop the "goto out". I can't really make up
>>
>> By update_map_err() are you saying to have the err var assigned and then
>> drop "goto out"? If so how do I keep the err from my access violation
>> without exiting from the loop?
> 
> Counter question: Why do you _need_ to keep "your" value of err?
> If, just as an example, there's going to be a #PF on the other
> half of the access, then "your" access violation is of no interest
> at all.

You are right, there is no need to keep the "goto out" here. It was just 
an optimization, in the sense that there is no need to do further steps, 
but I can drop the "goto out" and the code will work the same.

> 
>>>> --- a/xen/arch/x86/hvm/hvm.c
>>>> +++ b/xen/arch/x86/hvm/hvm.c
>>>> @@ -3224,6 +3224,14 @@ static enum hvm_translation_result __hvm_copy(
>>>>                  return HVMTRANS_bad_gfn_to_mfn;
>>>>              }
>>>>      
>>>> +        if ( unlikely(v->arch.vm_event) &&
>>>> +            v->arch.vm_event->send_event &&
>>>> +            hvm_emulate_send_vm_event(addr, gfn, pfec) )
>>>
>>> Indentation looks wrong again.
>>>
>>>> +        {
>>>> +            put_page(page);
>>>> +            return HVMTRANS_gfn_paged_out;
>>>
>>> Why "paged out"? If this is an intentional abuse, then you want
>>> to say so in a comment and justify the abuse here or in the
>>> description.
>>
>> This is done to clean the paged in before the return.
> 
> What is "the paged in" here?
> 
>>>> --- a/xen/arch/x86/hvm/vm_event.c
>>>> +++ b/xen/arch/x86/hvm/vm_event.c
>>>> @@ -86,6 +86,7 @@ void hvm_vm_event_do_resume(struct vcpu *v)
>>>>                        VM_EVENT_FLAG_SET_EMUL_INSN_DATA )
>>>>                  kind = EMUL_KIND_SET_CONTEXT_INSN;
>>>>      
>>>> +        v->arch.vm_event->send_event = false;
>>>>              hvm_emulate_one_vm_event(kind, TRAP_invalid_op,
>>>>                                       X86_EVENT_NO_EC);
>>>
>>> Is this insertion meaning to use "true" instead, or is the
>>> revision log entry wrong? Or does "set" there not necessarily
>>> mean "set it to true", but just "set it to a deterministic
>>> value" (in which case "initialize" would have been an
>>> unambiguous alternative wording)?
>>
>> This means to use "true" and send vm_event if there is a need to do so
>> in the emulation.
> 
> ??? (There's very clearly "false" being written above.)

Sorry, my bad, the false there was kept from earlier versions. There is 
no need to have it here because it is set to false in 
hvm_emulate_send_vm_event().


Alex
Alexandru Stefan ISAILA July 23, 2019, 8:17 a.m. UTC | #11
On 19.07.2019 16:18, Jan Beulich wrote:
> On 19.07.2019 14:34, Alexandru Stefan ISAILA wrote:
>> On 18.07.2019 15:58, Jan Beulich wrote:
>>> On 03.07.2019 12:56, Alexandru Stefan ISAILA wrote:

>>>> Currently, we are fully emulating the instruction at RIP when the hardware sees
>>>> an EPT fault with npfec.kind != npfec_kind_with_gla. This is, however,
>>>> incorrect, because the instruction at RIP might legitimately cause an
>>>> EPT fault of its own while accessing a _different_ page from the original one,
>>>> where A/D were set.
>>>> The solution is to perform the whole emulation,
>>>
>>> Above you said fully emulating such an insn is incorrect. To me the
>>> two statements contradict one another.
>>>
>>>> while ignoring EPT restrictions
>>>> for the walk part, and taking them into account for the "actual" emulating of
>>>> the instruction at RIP.
>>>
>>> So the "ignore" part here is because the walk doesn't currently send
>>> any events? That's an omission after all, which ultimately wants to
>>> get fixed. This in turn makes me wonder whether there couldn't be
>>> cases where a monitor actually wants to see these violations, too.
>>> After all one may be able to abuse the page walker to set bits in
>>> places you actually care to protect from undue modification.
>>
>> There is no need for events from page walk. Further work will have to be
>> done when the page walk starts sending events, so that we can toggle that
>> new feature on/off.
> 
> Please can you move over to thinking in more general terms,
> not just what you need for your application. In this case
> "There is no need" != "We don't have a need for". And I think
> the VM event _interface_ should be arranged in a way that it
> already accounts for eventually correct behavior of the page
> walk paths.
> 

I'm not sure how future code for sending events from the page walk will 
look, but I will try to make this patch have some checks in place so that 
it will work the same.

>>>> After the emulation stops we'll call hvm_vm_event_do_resume() again after the
>>>> introspection agent treats the event and resumes the guest. There, the
>>>> instruction at RIP will be fully emulated (with the EPT ignored) if the
>>>> introspection application allows it, and the guest will continue to run past
>>>> the instruction.
>>>>
>>>> We use hvmemul_map_linear_addr() to intercept r/w access and
>>>> __hvm_copy() to intercept exec access.
>>>
>>> Btw I continue to be unhappy about this asymmetry. Furthermore in
>>> the former case you only handle write and rmw accesses, but not
>>> reads afaics. I assume you don't care about reads, but this should
>>> then be made explicit. Furthermore EPT allows read protection, and
>>> there are p2m_access_w, p2m_access_wx, and p2m_access_x, so I guess
>>> ignoring reads can at best be an option picked by the monitor, not
>>> something to be left out of the interface altogether.
>>
>> That is correct, we are not interested in read events but there is
>> another problem, we are checking access and pfec to fill the event flag
>> and pfec only has a write flag (PFEC_write_access); in __hvmemul_read()
>> pfec only gets PFEC_page_present and there is no way to differentiate
>> write from read.
> 
> By the PFEC model, anything that's not a write or insn fetch is a
> read. The main anomaly is elsewhere: The write flag is also going
> to be set for RMW operations.
> 
>>>> hvm_emulate_send_vm_event() can return false if there was no violation,
>>>> if there was an error from monitor_traps() or p2m_get_mem_access().
>>>
>>> As said before - I don't think errors and lack of a violation can
>>> sensibly be treated the same way. Is the implication perhaps that
>>> emulation then will fail later anyway? If so, is such an
>>> assumption taking into consideration possible races?
>>
>> The only place that I can see a problem is the error from
>> monitor_traps(). That can be checked and accompanied by a warning msg.
> 
> How would a warning message help?
> 
>> Or if you can give me a different idea to go forward with this issue I
>> will be glad to review it.
> 
> I'm afraid you'll have to first of all give me an idea what the
> correct action is in case of such an error. And once you've done
> so, I'm pretty sure you'll recognize yourself whether the current
> code you have is appropriate (and I'll then know whether I want
> to insist on you changing the code).
> 

So I think that the return value of hvm_emulate_send_vm_event() should 
not depend on the return value of monitor_traps(). By the time 
monitor_traps() is called we are sure that there is a violation, and 
emulation should stop regardless of whether the event was sent or not. 
Given that, the last return should be true.
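
Concretely, the tail of hvm_emulate_send_vm_event() would then read
something like (sketch, reusing the req and curr from the function body):

    /* By this point a violation is known to exist, so emulation has to be
     * suspended whether or not the event could be delivered; the return
     * value of monitor_traps() is deliberately not propagated. */
    monitor_traps(curr, true, &req);

    return true;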

>>>> --- a/xen/arch/x86/hvm/hvm.c
>>>> +++ b/xen/arch/x86/hvm/hvm.c
>>>> @@ -3224,6 +3224,14 @@ static enum hvm_translation_result __hvm_copy(
>>>>                  return HVMTRANS_bad_gfn_to_mfn;
>>>>              }
>>>>      
>>>> +        if ( unlikely(v->arch.vm_event) &&
>>>> +            v->arch.vm_event->send_event &&
>>>> +            hvm_emulate_send_vm_event(addr, gfn, pfec) )
>>>
>>> Indentation looks wrong again.
>>>
>>>> +        {
>>>> +            put_page(page);
>>>> +            return HVMTRANS_gfn_paged_out;
>>>
>>> Why "paged out"? If this is an intentional abuse, then you want
>>> to say so in a comment and justify the abuse here or in the
>>> description.

Yes, this is intentional, so that linear_read() will return X86EMUL_RETRY.
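
For context, linear_read() already maps the translation result back to an
emulator return code roughly as follows (condensed illustration; the
helper name is invented for this sketch):

    static int xlat_result(enum hvm_translation_result res)
    {
        switch ( res )
        {
        case HVMTRANS_okay:
            return X86EMUL_OKAY;
        case HVMTRANS_gfn_paged_out:   /* also what the new check returns */
        case HVMTRANS_gfn_shared:
            return X86EMUL_RETRY;
        default:
            return X86EMUL_UNHANDLEABLE;
        }
    }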

Thanks for the review,
Alex
Alexandru Stefan ISAILA July 29, 2019, 8:12 a.m. UTC | #12
On 19.07.2019 17:23, Razvan Cojocaru wrote:
> On 7/19/19 4:38 PM, Jan Beulich wrote:
>> On 19.07.2019 15:30, Razvan Cojocaru wrote:
>>> On 7/19/19 4:18 PM, Jan Beulich wrote:
>>>> On 19.07.2019 14:34, Alexandru Stefan ISAILA wrote:
>>>>> On 18.07.2019 15:58, Jan Beulich wrote:
>>>>>> On 03.07.2019 12:56, Alexandru Stefan ISAILA wrote:
>>>>>>> A/D bit writes (on page walks) can be considered benign by an 
>>>>>>> introspection
>>>>>>> agent, so receiving vm_events for them is a pessimization. We try 
>>>>>>> here to
>>>>>>> optimize by filtering these events out.
>>>>>>
>>>>>> But you add the sending of more events - how does "filter out" match
>>>>>> the actual implementation?
>>>>>
>>>>> The events are sent only if there is a mem access violation 
>>>>> therefore we
>>>>> are filtering and only sending the events that are interesting to
>>>>> introspection.
>>>>
>>>> Where is it that you prevent any event from being sent? As said,
>>>> reading the patch I only see new sending sites to get added.
>>>
>>> If we don't emulate, we would receive the page-walk-generated events
>>> _and_ the touching-the-page-the-instruction-is-touching events.
>>
>> Since the patch here alters emulation paths only, how do you know
>> whether to emulate? In order to not receive undue events it would
>> seem to me that you'd first have to intercept the guest on insns
>> of interest ... Overall I think that the patch description, while
>> it has improved, is still lacking sufficient information for a
>> person like me (not knowing much about your monitor tools) to be
>> able to sensibly review this (which includes understanding the
>> precise scenario you want to improve).
> 
> If the hardware exits because of an EPT fault caused by a page walk, we 
> end up in p2m_mem_access_check(), at which point we need to decide if we 
> want to send out a vm_event or not.
> 
> If we were to send out this vm_event, and it would then be magically 
> treated so that we get to actually run the instruction at RIP, said 
> instruction might also hit a protected page and provoke a vm_event.
> 
> Now, if npfec.kind != npfec_kind_with_gla, then we're in the page walk 
> case, and so in this case only, and only if 
> d->arch.monitor.inguest_pagefault_disabled is true, we would choose to 
> do this emulation trick: emulate _the_page_walk_ while ignoring the EPT, 
> but don't ignore the EPT for the emulation of the actual instruction.
> 
> So where in the first case we would have 2 EPT events, in the second we 
> only have one (or if the instruction at RIP does not trigger an EPT 
> event, we would have 1 event in the first case, and none in the second).
> Hence the filtering mentioned.
> 
> So to answer your question: "how do you know whether to emulate", we do 
> so only if npfec.kind != npfec_kind_with_gla && 
> d->arch.monitor.inguest_pagefault_disabled.
> 
> I hope this clears it up somewhat.
> 

To summarize the changes needed for the next version, apart from the 
code changes, is the description good or do I have to add something else?

Thanks,
Alex
Jan Beulich July 29, 2019, 11:30 a.m. UTC | #13
On 29.07.2019 10:12, Alexandru Stefan ISAILA wrote:
> 
> 
> On 19.07.2019 17:23, Razvan Cojocaru wrote:
>> On 7/19/19 4:38 PM, Jan Beulich wrote:
>>> On 19.07.2019 15:30, Razvan Cojocaru wrote:
>>>> On 7/19/19 4:18 PM, Jan Beulich wrote:
>>>>> On 19.07.2019 14:34, Alexandru Stefan ISAILA wrote:
>>>>>> On 18.07.2019 15:58, Jan Beulich wrote:
>>>>>>> On 03.07.2019 12:56, Alexandru Stefan ISAILA wrote:
>>>>>>>> A/D bit writes (on page walks) can be considered benign by an
>>>>>>>> introspection
>>>>>>>> agent, so receiving vm_events for them is a pessimization. We try
>>>>>>>> here to
>>>>>>>> optimize by filtering these events out.
>>>>>>>
>>>>>>> But you add the sending of more events - how does "filter out" match
>>>>>>> the actual implementation?
>>>>>>
>>>>>> The events are sent only if there is a mem access violation
>>>>>> therefore we
>>>>>> are filtering and only sending the events that are interesting to
>>>>>> introspection.
>>>>>
>>>>> Where is it that you prevent any event from being sent? As said,
>>>>> reading the patch I only see new sending sites to get added.
>>>>
>>>> If we don't emulate, we would receive the page-walk-generated events
>>>> _and_ the touching-the-page-the-instruction-is-touching events.
>>>
>>> Since the patch here alters emulation paths only, how do you know
>>> whether to emulate? In order to not receive undue events it would
>>> seem to me that you'd first have to intercept the guest on insns
>>> of interest ... Overall I think that the patch description, while
>>> it has improved, is still lacking sufficient information for a
>>> person like me (not knowing much about your monitor tools) to be
>>> able to sensibly review this (which includes understanding the
>>> precise scenario you want to improve).
>>
>> If the hardware exits because of an EPT fault caused by a page walk, we
>> end up in p2m_mem_access_check(), at which point we need to decide if we
>> want to send out a vm_event or not.
>>
>> If we were to send out this vm_event, and it would then be magically
>> treated so that we get to actually run the instruction at RIP, said
>> instruction might also hit a protected page and provoke a vm_event.
>>
>> Now, if npfec.kind != npfec_kind_with_gla, then we're in the page walk
>> case, and so in this case only, and only if
>> d->arch.monitor.inguest_pagefault_disabled is true, we would choose to
>> do this emulation trick: emulate _the_page_walk_ while ignoring the EPT,
>> but don't ignore the EPT for the emulation of the actual instruction.
>>
>> So where in the first case we would have 2 EPT events, in the second we
>> only have one (or if the instruction at RIP does not trigger an EPT
>> event, we would have 1 event in the first case, and none in the second).
>> Hence the filtering mentioned.
>>
>> So to answer your question: "how do you know whether to emulate", we do
>> so only if npfec.kind != npfec_kind_with_gla &&
>> d->arch.monitor.inguest_pagefault_disabled.
>>
>> I hope this clears it up somewhat.
>>
> 
> To summarize the changes needed for the next version, apart from the
> code changes, is the description good or do I have to add something else?

As said in a prior reply, Razvan's explanation has helped. I don't think
though that it's suitable as a patch description without some adjustments.
I further seem to recall that I had asked for a concrete example to be
laid out in the description, highlighting what exactly in the overall flow
your patch means to change.

Jan
Alexandru Stefan ISAILA July 30, 2019, 12:21 p.m. UTC | #14
>>>>> @@ -629,6 +697,14 @@ static void *hvmemul_map_linear_addr(
>>>>>       
>>>>>                   ASSERT(p2mt == p2m_ram_logdirty || !p2m_is_readonly(p2mt));
>>>>>               }
>>>>> +
>>>>> +        if ( curr->arch.vm_event &&
>>>>> +            curr->arch.vm_event->send_event &&
>>>>> +            hvm_emulate_send_vm_event(addr, gfn, pfec) )
>>>>> +        {
>>>>> +            err = ERR_PTR(~X86EMUL_RETRY);
>>>>> +            goto out;
>>>>> +        }
>>>>
>>>> Did you notice that there's an immediate exit from the loop only
>>>> in case the linear -> physical translation fails? This is
>>>> relevant for page fault delivery correctness for accesses
>>>> crossing page boundaries. I think you want to use
>>>> update_map_err() and drop the "goto out". I can't really make up
>>>
>>> By update_map_err() are you saying to have the err var assigned and then
>>> drop "goto out"? If so how do I keep the err from my access violation
>>> without exiting from the loop?
>>
>> Counter question: Why do you _need_ to keep "your" value of err?
>> If, just as an example, there's going to be a #PF on the other
>> half of the access, then "your" access violation is of no interest
>> at all.
> 
> You are right, there is no need to keep the "goto out" here. It was just
> an optimization, in the sense that there is no need to do further steps,
> but I can drop the "goto out" and the code will work the same.
> 

There is a problem with dropping the "goto out". If everything goes fine 
then it will return the mapping and I don't want that. This can be 
stopped by checking if ( err ) after the loop and it is not null then 
goto out. And going with this idea I can init *err = NULL and drop the 
err = NULL from hvmemul_map_linear_addr(). Is this ok for the next version?
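
Schematically, the restructuring under discussion would look like this
(a sketch only; update_map_err() is the helper Jan refers to, with its
error-merging semantics assumed here):

    void *err = NULL;

    for ( i = 0; i < nr_frames; i++ )
    {
        /* ... per-frame translation and p2m checks ... */

        if ( curr->arch.vm_event && curr->arch.vm_event->send_event &&
             hvm_emulate_send_vm_event(addr, gfn, pfec) )
            /* Record the error but keep iterating: a later frame may
             * still raise something more important, e.g. a #PF. */
            err = update_map_err(err, ERR_PTR(~X86EMUL_RETRY));
    }

    if ( err )   /* set only if some frame failed */
        goto out;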

Regards,
Alex
Jan Beulich July 30, 2019, 1:27 p.m. UTC | #15
On 30.07.2019 14:21, Alexandru Stefan ISAILA wrote:
> 
>>>>>> @@ -629,6 +697,14 @@ static void *hvmemul_map_linear_addr(
>>>>>>        
>>>>>>                    ASSERT(p2mt == p2m_ram_logdirty || !p2m_is_readonly(p2mt));
>>>>>>                }
>>>>>> +
>>>>>> +        if ( curr->arch.vm_event &&
>>>>>> +            curr->arch.vm_event->send_event &&
>>>>>> +            hvm_emulate_send_vm_event(addr, gfn, pfec) )
>>>>>> +        {
>>>>>> +            err = ERR_PTR(~X86EMUL_RETRY);
>>>>>> +            goto out;
>>>>>> +        }
>>>>>
>>>>> Did you notice that there's an immediate exit from the loop only
>>>>> in case the linear -> physical translation fails? This is
>>>>> relevant for page fault delivery correctness for accesses
>>>>> crossing page boundaries. I think you want to use
>>>>> update_map_err() and drop the "goto out". I can't really make up
>>>>
>>>> By update_map_err() are you saying to have the err var assigned and then
>>>> drop "goto out"? If so how do I keep the err from my access violation
>>>> without exiting from the loop?
>>>
>>> Counter question: Why do you _need_ to keep "your" value of err?
>>> If, just as an example, there's going to be a #PF on the other
>>> half of the access, then "your" access violation is of no interest
>>> at all.
>>
>> You are right, there is no need to keep the "goto out" here. It was just
>> for optimization in the idea that there is no need to do further steps
>> but I can drop the "goto out" and the code will work the same.
>>
> 
> There is a problem with dropping the "goto out". If everything goes fine
> then it will return the mapping, and I don't want that. This can be
> avoided by checking if ( err ) after the loop and, if it is not NULL,
> doing the goto out. Going with this idea, I can initialize err to NULL
> at its declaration and drop the later err = NULL assignments from
> hvmemul_map_linear_addr(). Is this ok for the next version?

I'd prefer to see the code to decide. If you want this settled before
sending the next full version, then please send at least the resulting
patch hunk(s).

Jan
Alexandru Stefan ISAILA July 30, 2019, 2:12 p.m. UTC | #16
On 30.07.2019 16:27, Jan Beulich wrote:
> On 30.07.2019 14:21, Alexandru Stefan ISAILA wrote:
>>
>>>>>>> @@ -629,6 +697,14 @@ static void *hvmemul_map_linear_addr(
>>>>>>>         
>>>>>>>                     ASSERT(p2mt == p2m_ram_logdirty || !p2m_is_readonly(p2mt));
>>>>>>>                 }
>>>>>>> +
>>>>>>> +        if ( curr->arch.vm_event &&
>>>>>>> +            curr->arch.vm_event->send_event &&
>>>>>>> +            hvm_emulate_send_vm_event(addr, gfn, pfec) )
>>>>>>> +        {
>>>>>>> +            err = ERR_PTR(~X86EMUL_RETRY);
>>>>>>> +            goto out;
>>>>>>> +        }
>>>>>>
>>>>>> Did you notice that there's an immediate exit from the loop only
>>>>>> in case the linear -> physical translation fails? This is
>>>>>> relevant for page fault delivery correctness for accesses
>>>>>> crossing page boundaries. I think you want to use
>>>>>> update_map_err() and drop the "goto out". I can't really make up
>>>>>
>>>>> By update_map_err() are you saying to have the err var assigned and then
>>>>> drop "goto out"? If so how do I keep the err from my access violation
>>>>> without exiting from the loop?
>>>>
>>>> Counter question: Why do you _need_ to keep "your" value of err?
>>>> If, just as an example, there's going to be a #PF on the other
>>>> half of the access, then "your" access violation is of no interest
>>>> at all.
>>>
>>> You are right, there is no need to keep the "goto out" here. It was just
>>> for optimization in the idea that there is no need to do further steps
>>> but I can drop the "goto out" and the code will work the same.
>>>
>>
>> There is a problem with dropping the "goto out". If everything goes fine
>> then it will return the mapping, and I don't want that. This can be
>> avoided by checking if ( err ) after the loop and, if it is not NULL,
>> doing the goto out. Going with this idea, I can initialize err to NULL
>> at its declaration and drop the later err = NULL assignments from
>> hvmemul_map_linear_addr(). Is this ok for the next version?
> 
> I'd prefer to see the code to decide. If you want this settled before
> sending the next full version, then please send at least the resulting
> patch hunk(s).
> 

Here is a diff for hvmemul_map_linear_addr():


diff --git a/xen/arch/x86/hvm/emulate.c b/xen/arch/x86/hvm/emulate.c
index d75d3e6fd6..49dbfa730c 100644
--- a/xen/arch/x86/hvm/emulate.c
+++ b/xen/arch/x86/hvm/emulate.c
@@ -543,10 +543,11 @@ static void *hvmemul_map_linear_addr(
      struct hvm_emulate_ctxt *hvmemul_ctxt)
  {
      struct vcpu *curr = current;
-    void *err, *mapping;
+    void *err = NULL, *mapping;
      unsigned int nr_frames = ((linear + bytes - !!bytes) >> PAGE_SHIFT) -
          (linear >> PAGE_SHIFT) + 1;
      unsigned int i;
+    gfn_t gfn;

      /*
       * mfn points to the next free slot.  All used slots have a page reference
@@ -585,7 +586,7 @@ static void *hvmemul_map_linear_addr(
          ASSERT(mfn_x(*mfn) == 0);

          res = hvm_translate_get_page(curr, addr, true, pfec,
-                                     &pfinfo, &page, NULL, &p2mt);
+                                     &pfinfo, &page, &gfn, &p2mt);

          switch ( res )
          {
@@ -599,7 +600,6 @@ static void *hvmemul_map_linear_addr(
              goto out;

          case HVMTRANS_bad_gfn_to_mfn:
-            err = NULL;
              goto out;

          case HVMTRANS_gfn_paged_out:
@@ -622,14 +622,22 @@ static void *hvmemul_map_linear_addr(
              }

              if ( p2mt == p2m_ioreq_server )
-            {
-                err = NULL;
                  goto out;
-            }

              ASSERT(p2mt == p2m_ram_logdirty || !p2m_is_readonly(p2mt));
+
+            if ( curr->arch.vm_event &&
+                 curr->arch.vm_event->send_event &&
+                 hvm_emulate_send_vm_event(addr, gfn, pfec) )
+                err = ERR_PTR(~X86EMUL_RETRY);
          }
      }
+    /* Check if any vm_event was sent */
+    if ( err )
+        goto out;

      /* Entire access within a single frame? */
      if ( nr_frames == 1 )


Alex
Jan Beulich July 30, 2019, 2:54 p.m. UTC | #17
On 30.07.2019 16:12, Alexandru Stefan ISAILA wrote:
> 
> 
> On 30.07.2019 16:27, Jan Beulich wrote:
>> On 30.07.2019 14:21, Alexandru Stefan ISAILA wrote:
>>>
>>>>>>>> @@ -629,6 +697,14 @@ static void *hvmemul_map_linear_addr(
>>>>>>>>          
>>>>>>>>                      ASSERT(p2mt == p2m_ram_logdirty || !p2m_is_readonly(p2mt));
>>>>>>>>                  }
>>>>>>>> +
>>>>>>>> +        if ( curr->arch.vm_event &&
>>>>>>>> +            curr->arch.vm_event->send_event &&
>>>>>>>> +            hvm_emulate_send_vm_event(addr, gfn, pfec) )
>>>>>>>> +        {
>>>>>>>> +            err = ERR_PTR(~X86EMUL_RETRY);
>>>>>>>> +            goto out;
>>>>>>>> +        }
>>>>>>>
>>>>>>> Did you notice that there's an immediate exit from the loop only
>>>>>>> in case the linear -> physical translation fails? This is
>>>>>>> relevant for page fault delivery correctness for accesses
>>>>>>> crossing page boundaries. I think you want to use
>>>>>>> update_map_err() and drop the "goto out". I can't really make up
>>>>>>
>>>>>> By update_map_err() are you saying to have the err var assigned and then
>>>>>> drop "goto out"? If so how do I keep the err from my access violation
>>>>>> without exiting from the loop?
>>>>>
>>>>> Counter question: Why do you _need_ to keep "your" value of err?
>>>>> If, just as an example, there's going to be a #PF on the other
>>>>> half of the access, then "your" access violation is of no interest
>>>>> at all.
>>>>
>>>> You are right, there is no need to keep the "goto out" here. It was just
>>>> for optimization in the idea that there is no need to do further steps
>>>> but I can drop the "goto out" and the code will work the same.
>>>>
>>>
>>> There is a problem with dropping the "goto out". If everything goes fine
>>> then it will return the mapping, and I don't want that. This can be
>>> avoided by checking if ( err ) after the loop and, if it is not NULL,
>>> doing the goto out. Going with this idea, I can initialize err to NULL
>>> at its declaration and drop the later err = NULL assignments from
>>> hvmemul_map_linear_addr(). Is this ok for the next version?
>>
>> I'd prefer to see the code to decide. If you want this settled before
>> sending the next full version, then please send at least the resulting
>> patch hunk(s).
>>
> 
> Here is a diff for hvmemul_map_linear_addr():
> 
> --- a/xen/arch/x86/hvm/emulate.c
> +++ b/xen/arch/x86/hvm/emulate.c
> @@ -543,10 +543,11 @@ static void *hvmemul_map_linear_addr(
>        struct hvm_emulate_ctxt *hvmemul_ctxt)
>    {
>        struct vcpu *curr = current;
> -    void *err, *mapping;
> +    void *err = NULL, *mapping;
>        unsigned int nr_frames = ((linear + bytes - !!bytes) >> PAGE_SHIFT) -
>            (linear >> PAGE_SHIFT) + 1;
>        unsigned int i;
> +    gfn_t gfn;
> 
>        /*
>         * mfn points to the next free slot.  All used slots have a page reference
> @@ -585,7 +586,7 @@ static void *hvmemul_map_linear_addr(
>            ASSERT(mfn_x(*mfn) == 0);
> 
>            res = hvm_translate_get_page(curr, addr, true, pfec,
> -                                     &pfinfo, &page, NULL, &p2mt);
> +                                     &pfinfo, &page, &gfn, &p2mt);
> 
>            switch ( res )
>            {
> @@ -599,7 +600,6 @@ static void *hvmemul_map_linear_addr(
>                goto out;
> 
>            case HVMTRANS_bad_gfn_to_mfn:
> -            err = NULL;
>                goto out;
> 
>            case HVMTRANS_gfn_paged_out:
> @@ -622,14 +622,22 @@ static void *hvmemul_map_linear_addr(
>                }
> 
>                if ( p2mt == p2m_ioreq_server )
> -            {
> -                err = NULL;
>                    goto out;
> -            }
> 
>                ASSERT(p2mt == p2m_ram_logdirty || !p2m_is_readonly(p2mt));
> +
> +            if ( curr->arch.vm_event &&
> +                 curr->arch.vm_event->send_event &&
> +                 hvm_emulate_send_vm_event(addr, gfn, pfec) )
> +                err = ERR_PTR(~X86EMUL_RETRY);
>            }
>        }
> +    /* Check if any vm_event was sent */
> +    if ( err )
> +        goto out;
> 
>        /* Entire access within a single frame? */
>        if ( nr_frames == 1 )

First of all I have to apologize: In earlier replies I referred
to update_map_err(). I notice only now that this is a still
pending change of mine, which Andrew continues to object to,
while I continue to think it (in one form or another) is needed:
https://lists.xenproject.org/archives/html/xen-devel/2018-09/msg01250.html

Given the unpatched code, I think your change is correct, but
quite possibly your earlier variant was, too. But since the
unpatched code is imo wrong, I'd prefer if the VM event side
change was put on top of the fixed code, in order to not further
complicate the actual fix (which we may also want to backport).
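
For context, the idea of that pending change in rough sketch form only
(the precedence rule below is invented for illustration; the real logic
is in the patch linked above):

static void *update_map_err(void *err, void *new_err)
{
    /*
     * Instead of leaving the mapping loop on the first failure, each
     * iteration merges its outcome into err, so that e.g. a #PF raised
     * by the second half of a page-straddling access can take
     * precedence over a less specific failure on the first half.
     */
    if ( !err || new_err == ERR_PTR(~X86EMUL_EXCEPTION) )
        return new_err;

    return err;
}

The loop body would then do err = update_map_err(err, ...) instead of
an immediate "goto out".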

Andrew, as to that old pending patch, I'm afraid I haven't been
convinced in the slightest by your argumentation, regardless of
the actual behavior of the XTF test you've created. There are
two fundamental points you've not addressed during the earlier
discussion:
1) For a guest, behavior should be entirely transparent as far as
2nd level translation goes, unless the _only_ issue results from
it. That's because on bare hardware there simply is no 2nd level
translation.
2) Somewhat related, consider the case of the guest handling the
#PF on the second half of the access by a means which makes the
reason for the 2nd stage "fault" go away, or not recur. In that
case we've wrongly (i.e. at least needlessly) dealt with the 2nd
stage "fault".

I am, btw, not convinced that the behavior as you've observed it
is actually "correct" in the sense of "sensible". But the sub-
thread where I've brought up the behavior of LOCKed accesses has
been left dangling, as much as the other one, ending with me
stating that with the patch in place we'll have less "surprising"
behavior.

Jan
Alexandru Stefan ISAILA July 30, 2019, 3:28 p.m. UTC | #18
On 30.07.2019 17:54, Jan Beulich wrote:
> On 30.07.2019 16:12, Alexandru Stefan ISAILA wrote:
>>
>>
>> On 30.07.2019 16:27, Jan Beulich wrote:
>>> On 30.07.2019 14:21, Alexandru Stefan ISAILA wrote:
>>>>
>>>>>>>>> @@ -629,6 +697,14 @@ static void *hvmemul_map_linear_addr(
>>>>>>>>>           
>>>>>>>>>                       ASSERT(p2mt == p2m_ram_logdirty || !p2m_is_readonly(p2mt));
>>>>>>>>>                   }
>>>>>>>>> +
>>>>>>>>> +        if ( curr->arch.vm_event &&
>>>>>>>>> +            curr->arch.vm_event->send_event &&
>>>>>>>>> +            hvm_emulate_send_vm_event(addr, gfn, pfec) )
>>>>>>>>> +        {
>>>>>>>>> +            err = ERR_PTR(~X86EMUL_RETRY);
>>>>>>>>> +            goto out;
>>>>>>>>> +        }
>>>>>>>>
>>>>>>>> Did you notice that there's an immediate exit from the loop only
>>>>>>>> in case the linear -> physical translation fails? This is
>>>>>>>> relevant for page fault delivery correctness for accesses
>>>>>>>> crossing page boundaries. I think you want to use
>>>>>>>> update_map_err() and drop the "goto out". I can't really make up
>>>>>>>
>>>>>>> By update_map_err() are you saying to have the err var assigned and then
>>>>>>> drop "goto out"? If so how do I keep the err from my access violation
>>>>>>> without exiting from the loop?
>>>>>>
>>>>>> Counter question: Why do you _need_ to keep "your" value of err?
>>>>>> If, just as an example, there's going to be a #PF on the other
>>>>>> half of the access, then "your" access violation is of no interest
>>>>>> at all.
>>>>>
>>>>> You are right, there is no need to keep the "goto out" here. It was just
>>>>> for optimization in the idea that there is no need to do further steps
>>>>> but I can drop the "goto out" and the code will work the same.
>>>>>
>>>>
>>>> There is a problem with dropping the "goto out". If everything goes fine
>>>> then it will return the mapping, and I don't want that. This can be
>>>> avoided by checking if ( err ) after the loop and, if it is not NULL,
>>>> doing the goto out. Going with this idea, I can initialize err to NULL
>>>> at its declaration and drop the later err = NULL assignments from
>>>> hvmemul_map_linear_addr(). Is this ok for the next version?
>>>
>>> I'd prefer to see the code to decide. If you want this settled before
>>> sending the next full version, then please send at least the resulting
>>> patch hunk(s).
>>>
>>
>> Here is a diff for hvmemul_map_linear_addr():
>>
>> --- a/xen/arch/x86/hvm/emulate.c
>> +++ b/xen/arch/x86/hvm/emulate.c
>> @@ -543,10 +543,11 @@ static void *hvmemul_map_linear_addr(
>>         struct hvm_emulate_ctxt *hvmemul_ctxt)
>>     {
>>         struct vcpu *curr = current;
>> -    void *err, *mapping;
>> +    void *err = NULL, *mapping;
>>         unsigned int nr_frames = ((linear + bytes - !!bytes) >> PAGE_SHIFT) -
>>             (linear >> PAGE_SHIFT) + 1;
>>         unsigned int i;
>> +    gfn_t gfn;
>>
>>         /*
>>          * mfn points to the next free slot.  All used slots have a page reference
>> @@ -585,7 +586,7 @@ static void *hvmemul_map_linear_addr(
>>             ASSERT(mfn_x(*mfn) == 0);
>>
>>             res = hvm_translate_get_page(curr, addr, true, pfec,
>> -                                     &pfinfo, &page, NULL, &p2mt);
>> +                                     &pfinfo, &page, &gfn, &p2mt);
>>
>>             switch ( res )
>>             {
>> @@ -599,7 +600,6 @@ static void *hvmemul_map_linear_addr(
>>                 goto out;
>>
>>             case HVMTRANS_bad_gfn_to_mfn:
>> -            err = NULL;
>>                 goto out;
>>
>>             case HVMTRANS_gfn_paged_out:
>> @@ -622,14 +622,22 @@ static void *hvmemul_map_linear_addr(
>>                 }
>>
>>                 if ( p2mt == p2m_ioreq_server )
>> -            {
>> -                err = NULL;
>>                     goto out;
>> -            }
>>
>>                 ASSERT(p2mt == p2m_ram_logdirty || !p2m_is_readonly(p2mt));
>> +
>> +            if ( curr->arch.vm_event &&
>> +                 curr->arch.vm_event->send_event &&
>> +                 hvm_emulate_send_vm_event(addr, gfn, pfec) )
>> +                err = ERR_PTR(~X86EMUL_RETRY);
>>             }
>>         }
>> +    /* Check if any vm_event was sent */
>> +    if ( err )
>> +        goto out;
>>
>>         /* Entire access within a single frame? */
>>         if ( nr_frames == 1 )
> 
> First of all I have to apologize: In earlier replies I referred
> to update_map_err(). I notice only now that this is a still
> pending change of mine, which Andrew continues to object to,
> while I continue to think it (in one form or another) is needed:
> https://lists.xenproject.org/archives/html/xen-devel/2018-09/msg01250.html
> 
> Given the unpatched code, I think your change is correct, but
> quite possibly your earlier variant was, too. But since the
> unpatched code is imo wrong, I'd prefer if the VM event side
> change was put on top of the fixed code, in order to not further
> complicate the actual fix (which we may also want to backport).

Thanks for the clarification. I will have to wait for this patch to go
in and then submit another version; the new function will work with my
new error value.

Alex
Andrew Cooper Aug. 20, 2019, 8:11 p.m. UTC | #19
On 30/07/2019 15:54, Jan Beulich wrote:
>> @@ -622,14 +622,22 @@ static void *hvmemul_map_linear_addr(
>>                }
>>
>>                if ( p2mt == p2m_ioreq_server )
>> -            {
>> -                err = NULL;
>>                    goto out;
>> -            }
>>
>>                ASSERT(p2mt == p2m_ram_logdirty || !p2m_is_readonly(p2mt));
>> +
>> +            if ( curr->arch.vm_event &&
>> +                 curr->arch.vm_event->send_event &&
>> +                 hvm_emulate_send_vm_event(addr, gfn, pfec) )
>> +                err = ERR_PTR(~X86EMUL_RETRY);
>>            }
>>        }
>> +    /* Check if any vm_event was sent */
>> +    if ( err )
>> +        goto out;
>>
>>        /* Entire access within a single frame? */
>>        if ( nr_frames == 1 )
> First of all I have to apologize: In earlier replies I referred
> to update_map_err(). I notice only now that this is a still
> pending change of mine, which Andrew continues to object to,
> while I continue to think it (in one form or another) is needed:
> https://lists.xenproject.org/archives/html/xen-devel/2018-09/msg01250.html
>
> Given the unpatched code, I think your change is correct, but
> quite possibly your earlier variant was, too. But since the
> unpatched code is imo wrong, I'd prefer if the VM event side
> change was put on top of the fixed code, in order to not further
> complicate the actual fix (which we may also want to backport).
>
> Andrew, as to that old pending patch, I'm afraid I haven't been
> convinced in the slightest by your argumentation, regardless of
> the actual behavior of the XTF test you've created.

So what?  You want your change taken anyway despite evidence that it is
wrong?

>  There are
> two fundamental points you've not addressed during the earlier
> discussion:
> 1) For a guest, behavior should be entirely transparent as far as
> 2nd level translation goes, unless the _only_ issue results from
> it. That's because on bare hardware there simply is no 2nd level
> translation.
> 2) Somewhat related, consider the case of the guest handling the
> #PF on the second half of the access by a means which makes the
> reason for the 2nd stage "fault" go away, or not recur. In that
> case we've wrongly (i.e. at least needlessly) dealt with the 2nd
> stage "fault".

For both of these, do you actually have an example where you believe
Xen's logic currently goes wrong?  All I see, looking through the
threads, is unsubstantiated claims that the current behaviour is wrong.

> I am, btw, not convinced that the behavior as you've observed it
> is actually "correct" in the sense of "sensible".

You're entitled to the belief that this isn't sensible if you wish.

It doesn't make it relevant to the argument.  Relevant arguments would
be identifying a bug in my XTF test, a counterexample where the CPUs
do the opposite thing, or a passage in a spec which makes a statement
supporting your claim.

As far as I am concerned, it is perfectly sensible and logical
behaviour.  To complete an instruction which straddles a page boundary,
it is necessary to have both translations available in the TLB, which
requires two EPT-walks to have already completed correctly.

SDM Vol 3 28.2.3.3 is very clear on the matter.  All translations to the
ultimate physical addresses get established first (i.e. the TLB fills
complete) before any access rights get considered.  This means that
ordering of #PF and EPT misconfig/violation is complicated by their dual
nature for failures.
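
To make that ordering concrete, here is a minimal model of a write
straddling two pages (invented names, plain C, not Xen code):

#include <stdbool.h>

struct xlat {
    bool present;   /* translation (TLB fill) succeeds */
    bool writable;  /* access rights permit the write */
};

static bool straddling_write_ok(const struct xlat pg[2])
{
    unsigned int i;

    /* Phase 1: establish both translations first. */
    for ( i = 0; i < 2; i++ )
        if ( !pg[i].present )
            return false; /* translation fault on page i */

    /* Phase 2: only now are access rights considered. */
    for ( i = 0; i < 2; i++ )
        if ( !pg[i].writable )
            return false; /* access-rights fault on page i */

    return true;
}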

In reality, I think the current code in Xen will get the priority of
second and first stage access right faults inverted, but it's a damn sight
closer to how the CPU behaves than the proposed patch, which would get
first stage access rights mixed up with second stage translation faults.

Having looked over everything yet again, I stand by my very first
conclusion of that change being incorrect.

~Andrew
Jan Beulich Aug. 27, 2019, 8:26 a.m. UTC | #20
On 20.08.2019 22:11, Andrew Cooper wrote:
> On 30/07/2019 15:54, Jan Beulich wrote:
>>> @@ -622,14 +622,22 @@ static void *hvmemul_map_linear_addr(
>>>                 }
>>>
>>>                 if ( p2mt == p2m_ioreq_server )
>>> -            {
>>> -                err = NULL;
>>>                     goto out;
>>> -            }
>>>
>>>                 ASSERT(p2mt == p2m_ram_logdirty || !p2m_is_readonly(p2mt));
>>> +
>>> +            if ( curr->arch.vm_event &&
>>> +                 curr->arch.vm_event->send_event &&
>>> +                 hvm_emulate_send_vm_event(addr, gfn, pfec) )
>>> +                err = ERR_PTR(~X86EMUL_RETRY);
>>>             }
>>>         }
>>> +    /* Check if any vm_event was sent */
>>> +    if ( err )
>>> +        goto out;
>>>
>>>         /* Entire access within a single frame? */
>>>         if ( nr_frames == 1 )
>> First of all I have to apologize: In earlier replies I referred
>> to update_map_err(). I notice only now that this is a still
>> pending change of mine, which Andrew continues to object to,
>> while I continue to think it (in one form or another) is needed:
>> https://lists.xenproject.org/archives/html/xen-devel/2018-09/msg01250.html
>>
>> Given the unpatched code, I think your change is correct, but
>> quite possibly your earlier variant was, too. But since the
>> unpatched code is imo wrong, I'd prefer if the VM event side
>> change was put on top of the fixed code, in order to not further
>> complicate the actual fix (which we may also want to backport).
>>
>> Andrew, as to that old pending patch, I'm afraid I haven't been
>> convinced in the slightest by your argumentation, regardless of
>> the actual behavior of the XTF test you've created.
> 
> So what?  You want your change taken anyway despite evidence that it is
> wrong?
> 
>>   There are
>> two fundamental points you've not addressed during the earlier
>> discussion:
>> 1) For a guest, behavior should be entirely transparent as far as
>> 2nd level translation goes, unless the _only_ issue results from
>> it. That's because on bare hardware there simply is no 2nd level
>> translation.
>> 2) Somewhat related, consider the case of the guest handling the
>> #PF on the second half of the access by a means which makes the
>> reason for the 2nd stage "fault" go away, or not recur. In that
>> case we've wrongly (i.e. at least needlessly) dealt with the 2nd
>> stage "fault".
> 
> For both of these, do you actually have an example where you believe
> Xen's logic currently goes wrong?  All I see, looking through the
> threads, is unsubstantiated claims that the current behaviour is wrong.

Hmm, I thought we're both still recalling the case this started from:
ballooned-out page handling kicking in when the guest expects a page
fault (based on its own page tables).

>> I am, btw, not convinced that the behavior as you've observed it
>> is actually "correct" in the sense of "sensible".
> 
> You're entitled to the belief that this isn't sensible if you wish.
> 
> It doesn't make it relevant to the argument.  Relevant arguments would
> be identifying a bug in my XTF test, a counterexample where the CPUs
> do the opposite thing, or a passage in a spec which makes a statement
> supporting your claim.
> 
> As far as I am concerned, it is perfectly sensible and logical
> behaviour.  To complete an instruction which straddles a page boundary,
> it is necessary to have both translations available in the TLB, which
> requires two EPT-walks to have already completed correctly.
> 
> SDM Vol 3 28.2.3.3 is very clear on the matter.  All translations to the
> ultimate physical addresses get established first (i.e. the TLB fills
> complete) before any access rights get considered.  This means that
> ordering of #PF and EPT misconfig/violation is complicated by their dual
> nature for failures.
> 
> In reality, I think the current code in Xen will get the priority of
> second and first stage access right faults inverted, but it's a damn sight
> closer to how the CPU behaves than the proposed patch, which would get
> first stage access rights mixed up with second stage translation faults.

I consider your position as perfectly valid to take. It's just that, as
in so many other cases, it's not the only valid one (imo). You judge
from observed behavior, which is fine. You don't, however, address my
argument that, from the guest's pov, there is no 2nd stage translation
at all: the change made results in the behavior expected if there were
no 2nd stage translation. And it is my view of virtualization that the
goal should be to provide guest-visible behavior matching the
unvirtualized case as much as possible.

Jan
Alexandru Stefan ISAILA Sept. 2, 2019, 2:36 p.m. UTC | #21
On 27.08.2019 11:26, Jan Beulich wrote:
> On 20.08.2019 22:11, Andrew Cooper wrote:
>> On 30/07/2019 15:54, Jan Beulich wrote:
>>>> @@ -622,14 +622,22 @@ static void *hvmemul_map_linear_addr(
>>>>                 }
>>>>
>>>>                 if ( p2mt == p2m_ioreq_server )
>>>> -            {
>>>> -                err = NULL;
>>>>                     goto out;
>>>> -            }
>>>>
>>>>                 ASSERT(p2mt == p2m_ram_logdirty || !p2m_is_readonly(p2mt));
>>>> +
>>>> +            if ( curr->arch.vm_event &&
>>>> +                 curr->arch.vm_event->send_event &&
>>>> +                 hvm_emulate_send_vm_event(addr, gfn, pfec) )
>>>> +                err = ERR_PTR(~X86EMUL_RETRY);
>>>>             }
>>>>         }
>>>> +    /* Check if any vm_event was sent */
>>>> +    if ( err )
>>>> +        goto out;
>>>>
>>>>         /* Entire access within a single frame? */
>>>>         if ( nr_frames == 1 )
>>> First of all I have to apologize: In earlier replies I referred
>>> to update_map_err(). I notice only now that this is a still
>>> pending change of mine, which Andrew continues to object to,
>>> while I continue to think it (in one form or another) is needed:
>>> https://lists.xenproject.org/archives/html/xen-devel/2018-09/msg01250.html 
>>>
>>>
>>> Given the unpatched code, I think your change is correct, but
>>> quite possibly your earlier variant was, too. But since the
>>> unpatched code is imo wrong, I'd prefer if the VM event side
>>> change was put on top of the fixed code, in order to not further
>>> complicate the actual fix (which we may also want to backport).
>>>
>>> Andrew, as to that old pending patch, I'm afraid I haven't been
>>> convinced in the slightest by your argumentation, regardless of
>>> the actual behavior of the XTF test you've created.
>>
>> So what?  You want your change taken anyway despite evidence that it is
>> wrong?
>>
>>>   There are
>>> two fundamental points you've not addressed during the earlier
>>> discussion:
>>> 1) For a guest, behavior should be entirely transparent as far as
>>> 2nd level translation goes, unless the _only_ issue results from
>>> it. That's because on bare hardware there simply is no 2nd level
>>> translation.
>>> 2) Somewhat related, consider the case of the guest handling the
>>> #PF on the second half of the access by a means which makes the
>>> reason for the 2nd stage "fault" go away, or not recur. In that
>>> case we've wrongly (i.e. at least needlessly) dealt with the 2nd
>>> stage "fault".
>>
>> For both of these, do you actually have an example where you believe
>> Xen's logic currently goes wrong?  All I see, looking through the
>> threads, is unsubstantiated claims that the current behaviour is wrong.
> 
> Hmm, I thought we're both still recalling the case this started from:
> ballooned-out page handling kicking in when the guest expects a page
> fault (based on its own page tables).
> 
>>> I am, btw, not convinced that the behavior as you've observed it
>>> is actually "correct" in the sense of "sensible".
>>
>> You're entitled to the belief that this isn't sensible if you wish.
>>
>> It doesn't make it relevant to the argument.  Relevant arguments would
>> be identifying a bug in my XTF test, a counterexample where the CPUs
>> do the opposite thing, or a passage in a spec which makes a statement
>> supporting your claim.
>>
>> As far as I am concerned, it is perfectly sensible and logical
>> behaviour.  To complete an instruction which straddles a page boundary,
>> it is necessary to have both translations available in the TLB, which
>> requires two EPT-walks to have already completed correctly.
>>
>> SDM Vol 3 28.2.3.3 is very clear on the matter.  All translations to the
>> ultimate physical addresses get established first (i.e. the TLB fills
>> complete) before any access rights get considered.  This means that
>> ordering of #PF and EPT misconfig/violation is complicated by their dual
>> nature for failures.
>>
>> In reality, I think the current code in Xen will get the priority of
>> second and first stage access right faults inverted, but it's a damn sight
>> closer to how the CPU behaves than the proposed patch, which would get
>> first stage access rights mixed up with second stage translation faults.
> 
> I consider your position as perfectly valid to take. It's just that, as
> in so many other cases, it's not the only valid one (imo). You judge
> from observed behavior, which is fine. You don't, however, address my
> argument that, from the guest's pov, there is no 2nd stage translation
> at all: the change made results in the behavior expected if there were
> no 2nd stage translation. And it is my view of virtualization that the
> goal should be to provide guest-visible behavior matching the
> unvirtualized case as much as possible.
> 

Hi Jan, Andrew,

Is there a way we can move forward with this issue?

Regards,
Alex
Jan Beulich Sept. 2, 2019, 2:59 p.m. UTC | #22
On 02.09.2019 16:36, Alexandru Stefan ISAILA wrote:
> Is there a way we can move forward with this issue?

Unless Andrew changes his mind, all I can suggest is that you avoid
making your change dependent upon mine. If I (again) end up reviewing
it, I'll have to keep in mind to judge it against plain master, not my
own tree.

Jan
diff mbox series

Patch

diff --git a/xen/arch/x86/hvm/emulate.c b/xen/arch/x86/hvm/emulate.c
index 8659c89862..cb47e430af 100644
--- a/xen/arch/x86/hvm/emulate.c
+++ b/xen/arch/x86/hvm/emulate.c
@@ -12,9 +12,11 @@ 
 #include <xen/init.h>
 #include <xen/lib.h>
 #include <xen/sched.h>
+#include <xen/monitor.h>
 #include <xen/paging.h>
 #include <xen/trace.h>
 #include <xen/vm_event.h>
+#include <asm/altp2m.h>
 #include <asm/event.h>
 #include <asm/i387.h>
 #include <asm/xstate.h>
@@ -530,6 +532,71 @@  static int hvmemul_do_mmio_addr(paddr_t mmio_gpa,
     return hvmemul_do_io_addr(1, mmio_gpa, reps, size, dir, df, ram_gpa);
 }
 
+/*
+ * Send memory access vm_events based on pfec. Returns true if the event was
+ * sent, and false on p2m_get_mem_access() error, when there is no violation,
+ * or when sending the event fails. Depends on arch.vm_event->send_event.
+ *
+ * NOTE: p2m_get_mem_access() can fail for a bad address or if the entry
+ * was not found in the EPT (in which case access to it is unrestricted, so
+ * no violations can occur). In both cases it is fine to continue the
+ * emulation.
+ */
+bool hvm_emulate_send_vm_event(unsigned long gla, gfn_t gfn,
+                               uint32_t pfec)
+{
+    xenmem_access_t access;
+    vm_event_request_t req = {};
+    paddr_t gpa = (gfn_to_gaddr(gfn) | (gla & ~PAGE_MASK));
+
+    current->arch.vm_event->send_event = false;
+
+    if ( p2m_get_mem_access(current->domain, gfn, &access,
+                            altp2m_vcpu_idx(current)) != 0 )
+        return false;
+
+    switch ( access )
+    {
+    case XENMEM_access_x:
+    case XENMEM_access_rx:
+        if ( pfec & PFEC_write_access )
+            req.u.mem_access.flags = MEM_ACCESS_R | MEM_ACCESS_W;
+        break;
+
+    case XENMEM_access_w:
+    case XENMEM_access_rw:
+        if ( pfec & PFEC_insn_fetch )
+            req.u.mem_access.flags = MEM_ACCESS_X;
+        break;
+
+    case XENMEM_access_r:
+    case XENMEM_access_n:
+        if ( pfec & PFEC_write_access )
+            req.u.mem_access.flags |= MEM_ACCESS_R | MEM_ACCESS_W;
+        if ( pfec & PFEC_insn_fetch )
+            req.u.mem_access.flags |= MEM_ACCESS_X;
+        break;
+
+    case XENMEM_access_wx:
+    case XENMEM_access_rwx:
+    case XENMEM_access_rx2rw:
+    case XENMEM_access_n2rwx:
+    case XENMEM_access_default:
+        break;
+    }
+
+    if ( !req.u.mem_access.flags )
+        return false; /* no violation */
+
+    req.reason = VM_EVENT_REASON_MEM_ACCESS;
+    req.u.mem_access.gfn = gfn_x(gfn);
+    req.u.mem_access.flags |= MEM_ACCESS_FAULT_WITH_GLA | MEM_ACCESS_GLA_VALID;
+    req.u.mem_access.gla = gla;
+    req.u.mem_access.offset = gpa & ~PAGE_MASK;
+
+    return monitor_traps(current, true, &req) >= 0;
+}
+
 /*
  * Map the frame(s) covering an individual linear access, for writeable
  * access.  May return NULL for MMIO, or ERR_PTR(~X86EMUL_*) for other errors
@@ -547,6 +614,7 @@  static void *hvmemul_map_linear_addr(
     unsigned int nr_frames = ((linear + bytes - !!bytes) >> PAGE_SHIFT) -
         (linear >> PAGE_SHIFT) + 1;
     unsigned int i;
+    gfn_t gfn;
 
     /*
      * mfn points to the next free slot.  All used slots have a page reference
@@ -585,7 +653,7 @@  static void *hvmemul_map_linear_addr(
         ASSERT(mfn_x(*mfn) == 0);
 
         res = hvm_translate_get_page(curr, addr, true, pfec,
-                                     &pfinfo, &page, NULL, &p2mt);
+                                     &pfinfo, &page, &gfn, &p2mt);
 
         switch ( res )
         {
@@ -629,6 +697,14 @@  static void *hvmemul_map_linear_addr(
 
             ASSERT(p2mt == p2m_ram_logdirty || !p2m_is_readonly(p2mt));
         }
+
+        if ( curr->arch.vm_event &&
+            curr->arch.vm_event->send_event &&
+            hvm_emulate_send_vm_event(addr, gfn, pfec) )
+        {
+            err = ERR_PTR(~X86EMUL_RETRY);
+            goto out;
+        }
     }
 
     /* Entire access within a single frame? */
diff --git a/xen/arch/x86/hvm/hvm.c b/xen/arch/x86/hvm/hvm.c
index 029eea3b85..783ebc3525 100644
--- a/xen/arch/x86/hvm/hvm.c
+++ b/xen/arch/x86/hvm/hvm.c
@@ -3224,6 +3224,14 @@  static enum hvm_translation_result __hvm_copy(
             return HVMTRANS_bad_gfn_to_mfn;
         }
 
+        if ( unlikely(v->arch.vm_event) &&
+            v->arch.vm_event->send_event &&
+            hvm_emulate_send_vm_event(addr, gfn, pfec) )
+        {
+            put_page(page);
+            return HVMTRANS_gfn_paged_out;
+        }
+
         p = (char *)__map_domain_page(page) + (addr & ~PAGE_MASK);
 
         if ( flags & HVMCOPY_to_guest )
diff --git a/xen/arch/x86/hvm/vm_event.c b/xen/arch/x86/hvm/vm_event.c
index 121de23071..dede21e1c9 100644
--- a/xen/arch/x86/hvm/vm_event.c
+++ b/xen/arch/x86/hvm/vm_event.c
@@ -86,6 +86,7 @@  void hvm_vm_event_do_resume(struct vcpu *v)
                   VM_EVENT_FLAG_SET_EMUL_INSN_DATA )
             kind = EMUL_KIND_SET_CONTEXT_INSN;
 
+        v->arch.vm_event->send_event = false;
         hvm_emulate_one_vm_event(kind, TRAP_invalid_op,
                                  X86_EVENT_NO_EC);
 
diff --git a/xen/arch/x86/mm/mem_access.c b/xen/arch/x86/mm/mem_access.c
index 0144f92b98..c0faa57db1 100644
--- a/xen/arch/x86/mm/mem_access.c
+++ b/xen/arch/x86/mm/mem_access.c
@@ -214,6 +214,7 @@  bool p2m_mem_access_check(paddr_t gpa, unsigned long gla,
          d->arch.monitor.inguest_pagefault_disabled &&
          npfec.kind != npfec_kind_with_gla ) /* don't send a mem_event */
     {
+        v->arch.vm_event->send_event = true;
         hvm_emulate_one_vm_event(EMUL_KIND_NORMAL, TRAP_invalid_op, X86_EVENT_NO_EC);
 
         return true;
diff --git a/xen/include/asm-x86/hvm/emulate.h b/xen/include/asm-x86/hvm/emulate.h
index b39a1a0331..3682efd90b 100644
--- a/xen/include/asm-x86/hvm/emulate.h
+++ b/xen/include/asm-x86/hvm/emulate.h
@@ -80,6 +80,10 @@  struct segment_register *hvmemul_get_seg_reg(
     enum x86_segment seg,
     struct hvm_emulate_ctxt *hvmemul_ctxt);
 int hvm_emulate_one_mmio(unsigned long mfn, unsigned long gla);
+bool hvm_emulate_send_vm_event(
+    unsigned long gla,
+    gfn_t gfn,
+    uint32_t pfec);
 
 static inline bool handle_mmio(void)
 {
diff --git a/xen/include/asm-x86/vm_event.h b/xen/include/asm-x86/vm_event.h
index 23e655710b..66db9e1e25 100644
--- a/xen/include/asm-x86/vm_event.h
+++ b/xen/include/asm-x86/vm_event.h
@@ -36,6 +36,8 @@  struct arch_vm_event {
     bool set_gprs;
     /* A sync vm_event has been sent and we're not done handling it. */
     bool sync_event;
+    /* Send mem access events from emulator */
+    bool send_event;
 };
 
 int vm_event_init_domain(struct domain *d);
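
For reference, the send_event handshake which the hunks above introduce,
condensed into a toy model (invented names, illustrative only -- this is
not Xen code and elides the actual emulation):

#include <stdbool.h>
#include <stdio.h>

struct toy_vm_event { bool send_event; };

/* hvm_emulate_send_vm_event() analogue: consumes the flag first, so at
 * most one event is sent per emulation attempt. */
static bool toy_send_vm_event(struct toy_vm_event *ve, bool violation)
{
    ve->send_event = false;
    return violation; /* true => stop emulation with X86EMUL_RETRY */
}

/* p2m_mem_access_check() analogue for the A/D-bit page-walk case: arm
 * the flag, then emulate. */
static void toy_pagewalk_fault(struct toy_vm_event *ve)
{
    ve->send_event = true;
    /* ... the emulation's mapping code calls toy_send_vm_event() ... */
}

/* hvm_vm_event_do_resume() analogue: clear the flag before re-emulating
 * after the introspection agent replies, so the second pass cannot send
 * another event. */
static void toy_resume(struct toy_vm_event *ve)
{
    ve->send_event = false;
    /* ... re-emulate the instruction at RIP ... */
}

int main(void)
{
    struct toy_vm_event ve = { false };

    toy_pagewalk_fault(&ve);
    if ( ve.send_event && toy_send_vm_event(&ve, true) )
        printf("event sent, emulation stopped\n");
    toy_resume(&ve);
    return 0;
}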