
[1/3] x86/HVM: fix forwarding of internally cached requests

Message ID 56F3DD8802000078000E007F@prv-mh.provo.novell.com (mailing list archive)
State New, archived

Commit Message

Jan Beulich March 24, 2016, 11:28 a.m. UTC
Forwarding entire batches to the device model when an individual
iteration of them got rejected by internal device emulation handlers
with X86EMUL_UNHANDLEABLE is wrong: The device model would then handle
all iterations, without the internal handler getting to see any past
the one it returned failure for. This causes misbehavior in at least
the MSI-X and VGA code, which want to see all such requests for
internal tracking/caching purposes. But note that this does not apply
to buffered I/O requests.

This in turn means that the condition in hvm_process_io_intercept() of
when to crash the domain was wrong: Since X86EMUL_UNHANDLEABLE can
validly be returned by the individual device handlers, we mustn't
blindly crash the domain if such occurs on other than the initial
iteration. Instead we need to distinguish hvm_copy_*_guest_phys()
failures from device specific ones, and then the former need to always
be fatal to the domain (i.e. also on the first iteration), since
otherwise we again would end up forwarding a request to qemu which the
internal handler didn't get to see.

Also commit 4faffc41d ("x86/hvm: limit reps to avoid the need to handle
retry") went too far in removing code from hvm_process_io_intercept():
When there were successfully handled iterations, the function should
continue to return success with a clipped repeat count.

Signed-off-by: Jan Beulich <jbeulich@suse.com>
Cc: Chang Jianzhong <changjzh@gmail.com>
---
I assume this also addresses the issue which
http://lists.xenproject.org/archives/html/xen-devel/2016-01/msg03189.html 
attempted to deal with in a not really acceptable way.

--- a/xen/arch/x86/hvm/emulate.c
+++ b/xen/arch/x86/hvm/emulate.c
@@ -95,7 +95,7 @@ static const struct hvm_io_handler null_
 };
 
 static int hvmemul_do_io(
-    bool_t is_mmio, paddr_t addr, unsigned long reps, unsigned int size,
+    bool_t is_mmio, paddr_t addr, unsigned long *reps, unsigned int size,
     uint8_t dir, bool_t df, bool_t data_is_addr, uintptr_t data)
 {
     struct vcpu *curr = current;
@@ -104,7 +104,7 @@ static int hvmemul_do_io(
         .type = is_mmio ? IOREQ_TYPE_COPY : IOREQ_TYPE_PIO,
         .addr = addr,
         .size = size,
-        .count = reps,
+        .count = *reps,
         .dir = dir,
         .df = df,
         .data = data,
@@ -136,7 +136,7 @@ static int hvmemul_do_io(
         if ( (p.type != is_mmio ? IOREQ_TYPE_COPY : IOREQ_TYPE_PIO) ||
              (p.addr != addr) ||
              (p.size != size) ||
-             (p.count != reps) ||
+             (p.count != *reps) ||
              (p.dir != dir) ||
              (p.df != df) ||
              (p.data_is_ptr != data_is_addr) )
@@ -214,7 +214,7 @@ static int hvmemul_do_io_buffer(
 
     BUG_ON(buffer == NULL);
 
-    rc = hvmemul_do_io(is_mmio, addr, *reps, size, dir, df, 0,
+    rc = hvmemul_do_io(is_mmio, addr, reps, size, dir, df, 0,
                        (uintptr_t)buffer);
     if ( rc == X86EMUL_UNHANDLEABLE && dir == IOREQ_READ )
         memset(buffer, 0xff, size);
@@ -305,13 +305,13 @@ static int hvmemul_do_io_addr(
         count = 1;
     }
 
-    rc = hvmemul_do_io(is_mmio, addr, count, size, dir, df, 1,
+    rc = hvmemul_do_io(is_mmio, addr, &count, size, dir, df, 1,
                        ram_gpa);
+
     if ( rc == X86EMUL_OKAY )
-    {
         v->arch.hvm_vcpu.hvm_io.mmio_retry = (count < *reps);
-        *reps = count;
-    }
+
+    *reps = count;
 
  out:
     while ( nr_pages )
--- a/xen/arch/x86/hvm/intercept.c
+++ b/xen/arch/x86/hvm/intercept.c
@@ -148,7 +148,7 @@ int hvm_process_io_intercept(const struc
                     ASSERT_UNREACHABLE();
                     /* fall through */
                 default:
-                    rc = X86EMUL_UNHANDLEABLE;
+                    rc = -1; /* != any X86EMUL_* value. */
                     break;
                 }
                 if ( rc != X86EMUL_OKAY )
@@ -178,7 +178,7 @@ int hvm_process_io_intercept(const struc
                     ASSERT_UNREACHABLE();
                     /* fall through */
                 default:
-                    rc = X86EMUL_UNHANDLEABLE;
+                    rc = -1; /* != any X86EMUL_* value. */
                     break;
                 }
                 if ( rc != X86EMUL_OKAY )
@@ -196,8 +196,22 @@ int hvm_process_io_intercept(const struc
         }
     }
 
-    if ( i != 0 && rc == X86EMUL_UNHANDLEABLE )
+    if ( unlikely(rc < 0) )
         domain_crash(current->domain);
+    else if ( i )
+    {
+        p->count = i;
+        rc = X86EMUL_OKAY;
+    }
+    else if ( rc == X86EMUL_UNHANDLEABLE )
+    {
+        /*
+         * Don't forward entire batches to the device model: This would
+         * prevent the internal handlers to see subsequent iterations of
+         * the request.
+         */
+        p->count = 1;
+    }
 
     return rc;
 }

Comments

Paul Durrant March 24, 2016, 11:49 a.m. UTC | #1
> -----Original Message-----
> From: Jan Beulich [mailto:JBeulich@suse.com]
> Sent: 24 March 2016 11:29
> To: xen-devel
> Cc: Andrew Cooper; Paul Durrant; Chang Jianzhong; Keir (Xen.org)
> Subject: [PATCH 1/3] x86/HVM: fix forwarding of internally cached requests
> 
> Forwarding entire batches to the device model when an individual
> iteration of them got rejected by internal device emulation handlers
> with X86EMUL_UNHANDLEABLE is wrong: The device model would then handle
> all iterations, without the internal handler getting to see any past
> the one it returned failure for. This causes misbehavior in at least
> the MSI-X and VGA code, which want to see all such requests for
> internal tracking/caching purposes. But note that this does not apply
> to buffered I/O requests.
> 
> This in turn means that the condition in hvm_process_io_intercept() of
> when to crash the domain was wrong: Since X86EMUL_UNHANDLEABLE can
> validly be returned by the individual device handlers, we mustn't
> blindly crash the domain if such occurs on other than the initial
> iteration. Instead we need to distinguish hvm_copy_*_guest_phys()
> failures from device specific ones, and then the former need to always
> be fatal to the domain (i.e. also on the first iteration), since
> otherwise we again would end up forwarding a request to qemu which the
> internal handler didn't get to see.
> 
> Also commit 4faffc41d ("x86/hvm: limit reps to avoid the need to handle
> retry") went too far in removing code from hvm_process_io_intercept():
> When there were successfully handled iterations, the function should
> continue to return success with a clipped repeat count.
> 
> Signed-off-by: Jan Beulich <jbeulich@suse.com>
> Cc: Chang Jianzhong <changjzh@gmail.com>
> ---
> I assume this also addresses the issue which
> http://lists.xenproject.org/archives/html/xen-devel/2016-01/msg03189.html
> attempted to deal with in a not really acceptable way.
> 
> --- a/xen/arch/x86/hvm/emulate.c
> +++ b/xen/arch/x86/hvm/emulate.c
> @@ -95,7 +95,7 @@ static const struct hvm_io_handler null_
>  };
> 
>  static int hvmemul_do_io(
> -    bool_t is_mmio, paddr_t addr, unsigned long reps, unsigned int size,
> +    bool_t is_mmio, paddr_t addr, unsigned long *reps, unsigned int size,
>      uint8_t dir, bool_t df, bool_t data_is_addr, uintptr_t data)
>  {
>      struct vcpu *curr = current;
> @@ -104,7 +104,7 @@ static int hvmemul_do_io(
>          .type = is_mmio ? IOREQ_TYPE_COPY : IOREQ_TYPE_PIO,
>          .addr = addr,
>          .size = size,
> -        .count = reps,
> +        .count = *reps,
>          .dir = dir,
>          .df = df,
>          .data = data,
> @@ -136,7 +136,7 @@ static int hvmemul_do_io(
>          if ( (p.type != is_mmio ? IOREQ_TYPE_COPY : IOREQ_TYPE_PIO) ||
>               (p.addr != addr) ||
>               (p.size != size) ||
> -             (p.count != reps) ||
> +             (p.count != *reps) ||
>               (p.dir != dir) ||
>               (p.df != df) ||
>               (p.data_is_ptr != data_is_addr) )
> @@ -214,7 +214,7 @@ static int hvmemul_do_io_buffer(
> 
>      BUG_ON(buffer == NULL);
> 
> -    rc = hvmemul_do_io(is_mmio, addr, *reps, size, dir, df, 0,
> +    rc = hvmemul_do_io(is_mmio, addr, reps, size, dir, df, 0,
>                         (uintptr_t)buffer);
>      if ( rc == X86EMUL_UNHANDLEABLE && dir == IOREQ_READ )
>          memset(buffer, 0xff, size);
> @@ -305,13 +305,13 @@ static int hvmemul_do_io_addr(
>          count = 1;
>      }
> 
> -    rc = hvmemul_do_io(is_mmio, addr, count, size, dir, df, 1,
> +    rc = hvmemul_do_io(is_mmio, addr, &count, size, dir, df, 1,
>                         ram_gpa);
> +
>      if ( rc == X86EMUL_OKAY )
> -    {
>          v->arch.hvm_vcpu.hvm_io.mmio_retry = (count < *reps);
> -        *reps = count;
> -    }
> +
> +    *reps = count;
> 
>   out:
>      while ( nr_pages )
> --- a/xen/arch/x86/hvm/intercept.c
> +++ b/xen/arch/x86/hvm/intercept.c
> @@ -148,7 +148,7 @@ int hvm_process_io_intercept(const struc
>                      ASSERT_UNREACHABLE();
>                      /* fall through */
>                  default:
> -                    rc = X86EMUL_UNHANDLEABLE;
> +                    rc = -1; /* != any X86EMUL_* value. */

Rather than the need for magic values, couldn't you just goto a domain_crash label at the tail of the function?
Also, since domain_crash() isn't synchronous, I think you should replace the -1 value with some valid X86EMUL_ value before returning from the function.
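
Roughly, such a restructuring might look like the following (the label name,
its placement at the tail of hvm_process_io_intercept(), and the value
returned after the crash are assumptions for illustration, not taken from any
posted version of the patch):

                default:
                    goto crash; /* copy failure - cannot come from a handler */
                }
    [...]
    return rc;

 crash:
    domain_crash(current->domain);
    /* domain_crash() isn't synchronous, so still return a valid value. */
    return X86EMUL_UNHANDLEABLE;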

>                      break;
>                  }
>                  if ( rc != X86EMUL_OKAY )
> @@ -178,7 +178,7 @@ int hvm_process_io_intercept(const struc
>                      ASSERT_UNREACHABLE();
>                      /* fall through */
>                  default:
> -                    rc = X86EMUL_UNHANDLEABLE;
> +                    rc = -1; /* != any X86EMUL_* value. */
>                      break;
>                  }
>                  if ( rc != X86EMUL_OKAY )
> @@ -196,8 +196,22 @@ int hvm_process_io_intercept(const struc
>          }
>      }
> 
> -    if ( i != 0 && rc == X86EMUL_UNHANDLEABLE )
> +    if ( unlikely(rc < 0) )
>          domain_crash(current->domain);
> +    else if ( i )
> +    {
> +        p->count = i;
> +        rc = X86EMUL_OKAY;
> +    }
> +    else if ( rc == X86EMUL_UNHANDLEABLE )
> +    {
> +        /*
> +         * Don't forward entire batches to the device model: This would
> +         * prevent the internal handlers to see subsequent iterations of
> +         * the request.
> +         */
> +        p->count = 1;

I guess this is ok. If stdvga is not caching then the accept function would have failed so you won't get here, and if it sends the buffered ioreq then you still don't get here because it returns X86EMUL_OKAY.

  Paul

> +    }
> 
>      return rc;
>  }
>
Jan Beulich March 24, 2016, 12:01 p.m. UTC | #2
>>> On 24.03.16 at 12:49, <Paul.Durrant@citrix.com> wrote:
>> From: Jan Beulich [mailto:JBeulich@suse.com]
>> Sent: 24 March 2016 11:29
>> --- a/xen/arch/x86/hvm/intercept.c
>> +++ b/xen/arch/x86/hvm/intercept.c
>> @@ -148,7 +148,7 @@ int hvm_process_io_intercept(const struc
>>                      ASSERT_UNREACHABLE();
>>                      /* fall through */
>>                  default:
>> -                    rc = X86EMUL_UNHANDLEABLE;
>> +                    rc = -1; /* != any X86EMUL_* value. */
> 
> Rather than the need for magic values, couldn't you just goto a domain_crash 
> label at the tail of the function?
> Also, since domain_crash() isn't synchronous, I think you should replace the 
> -1 value with some valid X86EMUL_ value before returning from the function.

Good point, but I guess I'd rather move the domain_crash() right here
and retain the X86EMUL_UNHANDLEABLE then. What do you think?

Jan
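
For illustration, a minimal sketch of that alternative - crashing right at the
point of the copy failure while keeping a valid X86EMUL_* return value (the
exact shape of an eventual v2 isn't implied here):

                    ASSERT_UNREACHABLE();
                    /* fall through */
                default:
                    domain_crash(current->domain);
                    /* Not synchronous, so hand back a valid X86EMUL_* value. */
                    rc = X86EMUL_UNHANDLEABLE;
                    break;

with the tail of hvm_process_io_intercept() then no longer needing to look for
an out-of-band rc value.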
Paul Durrant March 24, 2016, 12:11 p.m. UTC | #3
> -----Original Message-----
> From: Jan Beulich [mailto:JBeulich@suse.com]
> Sent: 24 March 2016 12:02
> To: Paul Durrant
> Cc: Andrew Cooper; Chang Jianzhong; xen-devel; Keir (Xen.org)
> Subject: RE: [PATCH 1/3] x86/HVM: fix forwarding of internally cached
> requests
> 
> >>> On 24.03.16 at 12:49, <Paul.Durrant@citrix.com> wrote:
> >> From: Jan Beulich [mailto:JBeulich@suse.com]
> >> Sent: 24 March 2016 11:29
> >> --- a/xen/arch/x86/hvm/intercept.c
> >> +++ b/xen/arch/x86/hvm/intercept.c
> >> @@ -148,7 +148,7 @@ int hvm_process_io_intercept(const struc
> >>                      ASSERT_UNREACHABLE();
> >>                      /* fall through */
> >>                  default:
> >> -                    rc = X86EMUL_UNHANDLEABLE;
> >> +                    rc = -1; /* != any X86EMUL_* value. */
> >
> > Rather than the need for magic values, couldn't you just goto a domain_crash
> > label at the tail of the function?
> > Also, since domain_crash() isn't synchronous, I think you should replace the
> > -1 value with some valid X86EMUL_ value before returning from the function.
> 
> Good point, but I guess I'd rather move the domain_crash() right here
> and retain the X86EMUL_UNHANDLEABLE then. What do you think?
> 

Yes, that's probably better since IIRC domain_crash() will spit out the line number it's invoked from.

  Paul

> Jan
Jan Beulich March 24, 2016, 12:52 p.m. UTC | #4
>>> On 24.03.16 at 12:49, <Paul.Durrant@citrix.com> wrote:
>> From: Jan Beulich [mailto:JBeulich@suse.com]
>> Sent: 24 March 2016 11:29
>> @@ -196,8 +196,22 @@ int hvm_process_io_intercept(const struc
>>          }
>>      }
>> 
>> -    if ( i != 0 && rc == X86EMUL_UNHANDLEABLE )
>> +    if ( unlikely(rc < 0) )
>>          domain_crash(current->domain);
>> +    else if ( i )
>> +    {
>> +        p->count = i;
>> +        rc = X86EMUL_OKAY;
>> +    }
>> +    else if ( rc == X86EMUL_UNHANDLEABLE )
>> +    {
>> +        /*
>> +         * Don't forward entire batches to the device model: This would
>> +         * prevent the internal handlers to see subsequent iterations of
>> +         * the request.
>> +         */
>> +        p->count = 1;
> 
> I guess this is ok. If stdvga is not caching then the accept function would 
> have failed so you won't get here, and if it sends the buffered ioreq then you
> still don't get here because it returns X86EMUL_OKAY.

Good that you thought of this - I had forgotten that stdvga's
MMIO handling now takes this same code path rather than a
fully separate one. I guess I'll steal some of the wording above
for the v2 commit message.

Jan
Paul Durrant March 24, 2016, 12:58 p.m. UTC | #5
> -----Original Message-----
> From: Jan Beulich [mailto:JBeulich@suse.com]
> Sent: 24 March 2016 12:52
> To: Paul Durrant
> Cc: Andrew Cooper; Chang Jianzhong; xen-devel; Keir (Xen.org)
> Subject: RE: [PATCH 1/3] x86/HVM: fix forwarding of internally cached
> requests
> 
> >>> On 24.03.16 at 12:49, <Paul.Durrant@citrix.com> wrote:
> >> From: Jan Beulich [mailto:JBeulich@suse.com]
> >> Sent: 24 March 2016 11:29
> >> @@ -196,8 +196,22 @@ int hvm_process_io_intercept(const struc
> >>          }
> >>      }
> >>
> >> -    if ( i != 0 && rc == X86EMUL_UNHANDLEABLE )
> >> +    if ( unlikely(rc < 0) )
> >>          domain_crash(current->domain);
> >> +    else if ( i )
> >> +    {
> >> +        p->count = i;
> >> +        rc = X86EMUL_OKAY;
> >> +    }
> >> +    else if ( rc == X86EMUL_UNHANDLEABLE )
> >> +    {
> >> +        /*
> >> +         * Don't forward entire batches to the device model: This would
> >> +         * prevent the internal handlers to see subsequent iterations of
> >> +         * the request.
> >> +         */
> >> +        p->count = 1;
> >
> > I guess this is ok. If stdvga is not caching then the accept function would
> > have failed so you won't get here, and if it sends the buffered ioreq then you
> > still don't get here because it returns X86EMUL_OKAY.
> 
> Good that you thought of this - I had forgotten that stdvga's
> MMIO handling now takes this same code path rather than a
> fully separate one.

Yes, that's why I was a little worried before I saw the code :-)

> I guess I'll steal some of the wording above
> for the v2 commit message.
> 

Sure.

  Paul

> Jan