diff mbox

[1/2] KVM: fix cache stale memslot info with correct mmio generation number

Message ID 1407999713-3726-1-git-send-email-xiaoguangrong@linux.vnet.ibm.com (mailing list archive)
State New, archived
Headers show

Commit Message

Xiao Guangrong Aug. 14, 2014, 7:01 a.m. UTC
We may cache the current mmio generation number and stale memslot info
into spte, like this scenario?

       CPU 0                                  CPU 1
page fault:                                add a new memslot
read memslot and detecting its a mmio access
                                           update memslots
                                           update generation number
read generation number
cache the gpa and current gen number into spte

So, if guest accesses the gpa later, it will generate a incorrect
mmio exit

This patch fixes it by updating the generation number after
synchronize_srcu_expedited() that makes sure the generation
number updated only if memslots update is finished

Cc: stable@vger.kernel.org
Cc: David Matlack <dmatlack@google.com>
Signed-off-by: Xiao Guangrong <xiaoguangrong@linux.vnet.ibm.com>
---
 virt/kvm/kvm_main.c | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

Comments

Xiao Guangrong Aug. 14, 2014, 7:06 a.m. UTC | #1
Sorry, the title is not clear enough.

This is the v2 which fixes the issue pointed out by David:
" the generation number actually decreases."

Please review.

On 08/14/2014 03:01 PM, Xiao Guangrong wrote:
> We may cache the current mmio generation number and stale memslot info
> into spte, like this scenario?
> 
>        CPU 0                                  CPU 1
> page fault:                                add a new memslot
> read memslot and detecting its a mmio access
>                                            update memslots
>                                            update generation number
> read generation number
> cache the gpa and current gen number into spte
> 
> So, if guest accesses the gpa later, it will generate a incorrect
> mmio exit
> 
> This patch fixes it by updating the generation number after
> synchronize_srcu_expedited() that makes sure the generation
> number updated only if memslots update is finished
> 
> Cc: stable@vger.kernel.org
> Cc: David Matlack <dmatlack@google.com>
> Signed-off-by: Xiao Guangrong <xiaoguangrong@linux.vnet.ibm.com>
> ---
>  virt/kvm/kvm_main.c | 12 ++++++------
>  1 file changed, 6 insertions(+), 6 deletions(-)
> 
> diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
> index 33712fb..bb40df3 100644
> --- a/virt/kvm/kvm_main.c
> +++ b/virt/kvm/kvm_main.c
> @@ -96,7 +96,7 @@ static void hardware_disable_all(void);
> 
>  static void kvm_io_bus_destroy(struct kvm_io_bus *bus);
>  static void update_memslots(struct kvm_memslots *slots,
> -			    struct kvm_memory_slot *new, u64 last_generation);
> +			    struct kvm_memory_slot *new);
> 
>  static void kvm_release_pfn_dirty(pfn_t pfn);
>  static void mark_page_dirty_in_slot(struct kvm *kvm,
> @@ -687,8 +687,7 @@ static void sort_memslots(struct kvm_memslots *slots)
>  }
> 
>  static void update_memslots(struct kvm_memslots *slots,
> -			    struct kvm_memory_slot *new,
> -			    u64 last_generation)
> +			    struct kvm_memory_slot *new)
>  {
>  	if (new) {
>  		int id = new->id;
> @@ -699,8 +698,6 @@ static void update_memslots(struct kvm_memslots *slots,
>  		if (new->npages != npages)
>  			sort_memslots(slots);
>  	}
> -
> -	slots->generation = last_generation + 1;
>  }
> 
>  static int check_memory_region_flags(struct kvm_userspace_memory_region *mem)
> @@ -722,9 +719,12 @@ static struct kvm_memslots *install_new_memslots(struct kvm *kvm,
>  {
>  	struct kvm_memslots *old_memslots = kvm->memslots;
> 
> -	update_memslots(slots, new, kvm->memslots->generation);
> +	/* ensure generation number is always increased. */
> +	slots->generation = old_memslots->generation;
> +	update_memslots(slots, new);
>  	rcu_assign_pointer(kvm->memslots, slots);
>  	synchronize_srcu_expedited(&kvm->srcu);
> +	slots->generation++;
> 
>  	kvm_arch_memslots_updated(kvm);
> 

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Paolo Bonzini Aug. 18, 2014, 1:57 p.m. UTC | #2
Il 14/08/2014 09:01, Xiao Guangrong ha scritto:
> -	update_memslots(slots, new, kvm->memslots->generation);
> +	/* ensure generation number is always increased. */
> +	slots->generation = old_memslots->generation;
> +	update_memslots(slots, new);
>  	rcu_assign_pointer(kvm->memslots, slots);
>  	synchronize_srcu_expedited(&kvm->srcu);
> +	slots->generation++;

I don't trust my brain enough to review this patch.

kvm_current_mmio_generation seems like a very bad (race-prone) API.  One
patch I trust myself reviewing would change a bunch of functions in
kvm_main.c to take a memslots struct.  This would make it easy to
respect the hard and fast rule of not dereferencing the same pointer
twice.  But it would be a tedious change.

Another alternative could be to use the low bit to mark an in-progress
change, and skip the caching if the low bit is set.  Similar to a
seqcount (except if read_seqcount_retry fails, we just punt and not
retry anything), you could use it even though the memory barriers
provided by write_seqcount_begin/end are not too useful in this case.

Paolo
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Xiao Guangrong Aug. 18, 2014, 4:35 p.m. UTC | #3
Hi Paolo,

Thank you to review the patch!

On Aug 18, 2014, at 9:57 PM, Paolo Bonzini <pbonzini@redhat.com> wrote:

> Il 14/08/2014 09:01, Xiao Guangrong ha scritto:
>> -	update_memslots(slots, new, kvm->memslots->generation);
>> +	/* ensure generation number is always increased. */
>> +	slots->generation = old_memslots->generation;
>> +	update_memslots(slots, new);
>> 	rcu_assign_pointer(kvm->memslots, slots);
>> 	synchronize_srcu_expedited(&kvm->srcu);
>> +	slots->generation++;
> 
> I don't trust my brain enough to review this patch.

Sorry to make you confused. I should expain it more clearly.

What this patch tried to fix is:  kvm will generate wrong mmio-exit forever
if no luck event cleans mmio spte. (eg. if no memory pressure or  no
context-sync on that spte.)

Note, it is hard to do precise sync between kvm_vm_ioctl_set_memory_region
and mmio-exit - that means userspace is able to get mmio-exit even if
kvm_vm_ioctl_set_memory_region have finished, for example, kvm identifies
a mmio access before issuing the ioctl and injects mmio-exit to userspace after
ioctl return. So checking if mmio-exit is a real mmio access in userspace is
needed anyway.

> kvm_current_mmio_generation seems like a very bad (race-prone) API.  One
> patch I trust myself reviewing would change a bunch of functions in
> kvm_main.c to take a memslots struct.  This would make it easy to
> respect the hard and fast rule of not dereferencing the same pointer
> twice.  But it would be a tedious change.

kvm_set_memory_region is the only place updating memslot and
kvm_current_mmio_generation accesses memslot by rcu-dereference,
i do not know why other places need to take into account.

I think this patch is auditable, page-fault is always called by holding
srcu-lock so that a page fault can’t go across synchronize_srcu_expedited.
Only these cases can happen:

1)  page fault occurs before synchronize_srcu_expedited.
    In this case, vcpu will generate mmio-exit for the memslot being registered
    by the ioctl. That’s ok since the ioctl have not finished.

2) page fault occurs after synchronize_srcu_expedited and during
   increasing generation-number.
   In this case, userspace may get wrong mmio-exit (that happen if handing
   page-fault is slower that the ioctl), that’s ok too since userspace need do
  the check anyway like i said above.

3) page fault occurs after generation-number update
   that’s definitely correct. :)

> Another alternative could be to use the low bit to mark an in-progress
> change, and skip the caching if the low bit is set.  Similar to a
> seqcount (except if read_seqcount_retry fails, we just punt and not
> retry anything), you could use it even though the memory barriers
> provided by write_seqcount_begin/end are not too useful in this case.

I do not know how the bit works, page fault will cache the memslot before
the bit set and cache the generation-number after the bit set.

Maybe i missed your idea, could you please detail it?

--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
David Matlack Aug. 18, 2014, 6:20 p.m. UTC | #4
On Mon, Aug 18, 2014 at 9:35 AM, Xiao Guangrong
<xiaoguangrong.eric@gmail.com> wrote:
>
> Hi Paolo,
>
> Thank you to review the patch!
>
> On Aug 18, 2014, at 9:57 PM, Paolo Bonzini <pbonzini@redhat.com> wrote:
>
>> Il 14/08/2014 09:01, Xiao Guangrong ha scritto:
>>> -    update_memslots(slots, new, kvm->memslots->generation);
>>> +    /* ensure generation number is always increased. */
>>> +    slots->generation = old_memslots->generation;
>>> +    update_memslots(slots, new);
>>>      rcu_assign_pointer(kvm->memslots, slots);
>>>      synchronize_srcu_expedited(&kvm->srcu);
>>> +    slots->generation++;
>>
>> I don't trust my brain enough to review this patch.

Xiao, I thought about your approach a lot and I can't think of a bad race
that isn't already possible due to the fact that kvm allows memslot
mutation to race with vm exits. That being said, it's hard to reason about
all the other "clients" of memslots and what weirdness (or badness) will
be caused by updating generation after srcu_synch. I like Paolo's two
approaches because they fix the bug without any side-effects.

> Sorry to make you confused. I should expain it more clearly.
>
> What this patch tried to fix is:  kvm will generate wrong mmio-exit forever
> if no luck event cleans mmio spte. (eg. if no memory pressure or  no
> context-sync on that spte.)
>
> Note, it is hard to do precise sync between kvm_vm_ioctl_set_memory_region
> and mmio-exit - that means userspace is able to get mmio-exit even if
> kvm_vm_ioctl_set_memory_region have finished, for example, kvm identifies
> a mmio access before issuing the ioctl and injects mmio-exit to userspace after
> ioctl return. So checking if mmio-exit is a real mmio access in userspace is
> needed anyway.
>
>> kvm_current_mmio_generation seems like a very bad (race-prone) API.  One
>> patch I trust myself reviewing would change a bunch of functions in
>> kvm_main.c to take a memslots struct.  This would make it easy to
>> respect the hard and fast rule of not dereferencing the same pointer
>> twice.  But it would be a tedious change.
>
> kvm_set_memory_region is the only place updating memslot and
> kvm_current_mmio_generation accesses memslot by rcu-dereference,
> i do not know why other places need to take into account.

If you rcu_dereference() more than once, you can't trust previous
decisions based on rcu_dereference()'s. If the mmio cache code only
did one rcu_dereference() per vm exit, this bug would be gone.

> I think this patch is auditable, page-fault is always called by holding
> srcu-lock so that a page fault can’t go across synchronize_srcu_expedited.
> Only these cases can happen:
>
> 1)  page fault occurs before synchronize_srcu_expedited.
>     In this case, vcpu will generate mmio-exit for the memslot being registered
>     by the ioctl. That’s ok since the ioctl have not finished.
>
> 2) page fault occurs after synchronize_srcu_expedited and during
>    increasing generation-number.
>    In this case, userspace may get wrong mmio-exit (that happen if handing
>    page-fault is slower that the ioctl), that’s ok too since userspace need do
>   the check anyway like i said above.
>
> 3) page fault occurs after generation-number update
>    that’s definitely correct. :)
>
>> Another alternative could be to use the low bit to mark an in-progress
>> change, and skip the caching if the low bit is set.  Similar to a
>> seqcount (except if read_seqcount_retry fails, we just punt and not
>> retry anything), you could use it even though the memory barriers
>> provided by write_seqcount_begin/end are not too useful in this case.

I like this approach best. It would have the least code changes and provide
the same guarantees.

> I do not know how the bit works, page fault will cache the memslot before
> the bit set and cache the generation-number after the bit set.
>
> Maybe i missed your idea, could you please detail it?

In vcpu_cache_mmio_info() if generation is odd, just don't do the caching
because memslots were changed while we were running and we just assume the
worst case.
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Paolo Bonzini Aug. 18, 2014, 6:47 p.m. UTC | #5
Il 18/08/2014 18:35, Xiao Guangrong ha scritto:
> 
> Hi Paolo,
> 
> Thank you to review the patch!
> 
> On Aug 18, 2014, at 9:57 PM, Paolo Bonzini <pbonzini@redhat.com> wrote:
> 
>> Il 14/08/2014 09:01, Xiao Guangrong ha scritto:
>>> -	update_memslots(slots, new, kvm->memslots->generation);
>>> +	/* ensure generation number is always increased. */
>>> +	slots->generation = old_memslots->generation;
>>> +	update_memslots(slots, new);
>>> 	rcu_assign_pointer(kvm->memslots, slots);
>>> 	synchronize_srcu_expedited(&kvm->srcu);
>>> +	slots->generation++;
>>
>> I don't trust my brain enough to review this patch.
> 
> Sorry to make you confused. I should expain it more clearly.

Don't worry, it's not your fault. :)

>> kvm_current_mmio_generation seems like a very bad (race-prone) API.  One
>> patch I trust myself reviewing would change a bunch of functions in
>> kvm_main.c to take a memslots struct.  This would make it easy to
>> respect the hard and fast rule of not dereferencing the same pointer
>> twice.  But it would be a tedious change.
> 
> kvm_set_memory_region is the only place updating memslot and
> kvm_current_mmio_generation accesses memslot by rcu-dereference,
> i do not know why other places need to take into account.

The race occurs because gfn_to_pfn_many_atomic or some other function
has already used kvm_memslots().  Calling kvm_memslots() twice is the
root cause the bug.

> I think this patch is auditable, page-fault is always called by holding
> srcu-lock so that a page fault can’t go across synchronize_srcu_expedited.
> Only these cases can happen:
> 
> 1)  page fault occurs before synchronize_srcu_expedited.
>     In this case, vcpu will generate mmio-exit for the memslot being registered
>     by the ioctl. That’s ok since the ioctl have not finished.
> 
> 2) page fault occurs after synchronize_srcu_expedited and during
>    increasing generation-number.
>    In this case, userspace may get wrong mmio-exit (that happen if handing
>    page-fault is slower that the ioctl), that’s ok too since userspace need do
>   the check anyway like i said above.
> 
> 3) page fault occurs after generation-number update
>    that’s definitely correct. :)
> 
>> Another alternative could be to use the low bit to mark an in-progress
>> change, and skip the caching if the low bit is set.  Similar to a
>> seqcount (except if read_seqcount_retry fails, we just punt and not
>> retry anything), you could use it even though the memory barriers
>> provided by write_seqcount_begin/end are not too useful in this case.
> 
> I do not know how the bit works, page fault will cache the memslot before
> the bit set and cache the generation-number after the bit set.
> 
> Maybe i missed your idea, could you please detail it?

Something like this:

-	update_memslots(slots, new, kvm->memslots->generation);
+	/* ensure generation number is always increased. */
+	slots->generation = old_memslots->generation + 1;
+	update_memslots(slots, new);
	rcu_assign_pointer(kvm->memslots, slots);
	synchronize_srcu_expedited(&kvm->srcu);
+	slots->generation++;

Then case 1 and 2 will just have a cache miss.

The "low bit" is really just because each slot update does 2 generation
increases.

Paolo
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
diff mbox

Patch

diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 33712fb..bb40df3 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -96,7 +96,7 @@  static void hardware_disable_all(void);
 
 static void kvm_io_bus_destroy(struct kvm_io_bus *bus);
 static void update_memslots(struct kvm_memslots *slots,
-			    struct kvm_memory_slot *new, u64 last_generation);
+			    struct kvm_memory_slot *new);
 
 static void kvm_release_pfn_dirty(pfn_t pfn);
 static void mark_page_dirty_in_slot(struct kvm *kvm,
@@ -687,8 +687,7 @@  static void sort_memslots(struct kvm_memslots *slots)
 }
 
 static void update_memslots(struct kvm_memslots *slots,
-			    struct kvm_memory_slot *new,
-			    u64 last_generation)
+			    struct kvm_memory_slot *new)
 {
 	if (new) {
 		int id = new->id;
@@ -699,8 +698,6 @@  static void update_memslots(struct kvm_memslots *slots,
 		if (new->npages != npages)
 			sort_memslots(slots);
 	}
-
-	slots->generation = last_generation + 1;
 }
 
 static int check_memory_region_flags(struct kvm_userspace_memory_region *mem)
@@ -722,9 +719,12 @@  static struct kvm_memslots *install_new_memslots(struct kvm *kvm,
 {
 	struct kvm_memslots *old_memslots = kvm->memslots;
 
-	update_memslots(slots, new, kvm->memslots->generation);
+	/* ensure generation number is always increased. */
+	slots->generation = old_memslots->generation;
+	update_memslots(slots, new);
 	rcu_assign_pointer(kvm->memslots, slots);
 	synchronize_srcu_expedited(&kvm->srcu);
+	slots->generation++;
 
 	kvm_arch_memslots_updated(kvm);