diff mbox series

[7/7] x86/pv: Rewrite %dr6 handling

Message ID 20230915203628.837732-8-andrew.cooper3@citrix.com (mailing list archive)
State New, archived
Headers show
Series x86/pv: #DB vs %dr6 fixes, part 2 | expand

Commit Message

Andrew Cooper Sept. 15, 2023, 8:36 p.m. UTC
All #DB exceptions result in an update of %dr6, but this isn't handled
properly by Xen for any guest type.

Remove all ad-hoc dr6 handling, leaving it to pv_inject_event() in most cases
and using the new x86_merge_dr6() helper.

In do_debug(), swap the dr6 to pending_dbg in order to operate entirely with
positive polarity.  Among other things, this helps spot RTM/BLD in the
diagnostic message.

Drop the unconditional v->arch.dr6 adjustment.  pv_inject_event() performs the
adjustment in the common case, but retain the prior behaviour if a debugger is
attached.

Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com>
---
CC: Jan Beulich <JBeulich@suse.com>
CC: Roger Pau Monné <roger.pau@citrix.com>
CC: Wei Liu <wl@xen.org>
CC: Jinoh Kang <jinoh.kang.kr@gmail.com>

v2:
 * Split pieces out into an earlier patch.
 * Extend do_debug() to use pending_dbg entirely, rather than have the same
   variable used with different polarity at different times.
---
 xen/arch/x86/pv/emul-priv-op.c  |  5 +----
 xen/arch/x86/pv/emulate.c       |  9 +++++++--
 xen/arch/x86/pv/ro-page-fault.c |  4 ++--
 xen/arch/x86/pv/traps.c         | 17 +++++++++++++----
 xen/arch/x86/traps.c            | 23 +++++++++++------------
 5 files changed, 34 insertions(+), 24 deletions(-)

Comments

Jinoh Kang Sept. 16, 2023, 7:50 a.m. UTC | #1
On 9/16/23 05:36, Andrew Cooper wrote:
> All #DB exceptions result in an update of %dr6, but this isn't handled
> properly by Xen for any guest type.
> 
> Remove all ad-hoc dr6 handling, leaving it to pv_inject_event() in most cases
> and using the new x86_merge_dr6() helper.
> 
> In do_debug(), swap the dr6 to pending_dbg in order to operate entirely with
> positive polarity.  Among other things, this helps spot RTM/BLD in the
> diagnostic message.
> 
> Drop the unconditional v->arch.dr6 adjustment.  pv_inject_event() performs the
> adjustment in the common case, but retain the prior behaviour if a debugger is
> attached.
> 
> Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com>
> ---
> CC: Jan Beulich <JBeulich@suse.com>
> CC: Roger Pau Monné <roger.pau@citrix.com>
> CC: Wei Liu <wl@xen.org>
> CC: Jinoh Kang <jinoh.kang.kr@gmail.com>
> 
> v2:
>  * Split pieces out into an earlier patch.
>  * Extend do_debug() to use pending_dbg entirely, rather than have the same
>    variable used with different polarity at different times.
> ---
>  xen/arch/x86/pv/emul-priv-op.c  |  5 +----
>  xen/arch/x86/pv/emulate.c       |  9 +++++++--
>  xen/arch/x86/pv/ro-page-fault.c |  4 ++--
>  xen/arch/x86/pv/traps.c         | 17 +++++++++++++----
>  xen/arch/x86/traps.c            | 23 +++++++++++------------
>  5 files changed, 34 insertions(+), 24 deletions(-)
> 
> diff --git a/xen/arch/x86/pv/emul-priv-op.c b/xen/arch/x86/pv/emul-priv-op.c
> index 2f73ed0682e1..fe2011f28e34 100644
> --- a/xen/arch/x86/pv/emul-priv-op.c
> +++ b/xen/arch/x86/pv/emul-priv-op.c
> @@ -1364,10 +1364,7 @@ int pv_emulate_privileged_op(struct cpu_user_regs *regs)
>          ASSERT(!curr->arch.pv.trap_bounce.flags);
>  
>          if ( ctxt.ctxt.retire.pending_dbg )
> -        {
> -            curr->arch.dr6 |= ctxt.ctxt.retire.pending_dbg | DR_STATUS_RESERVED_ONE;
> -            pv_inject_hw_exception(X86_EXC_DB, X86_EVENT_NO_EC);
> -        }
> +            pv_inject_DB(ctxt.ctxt.retire.pending_dbg);
>  
>          /* fall through */
>      case X86EMUL_RETRY:
> diff --git a/xen/arch/x86/pv/emulate.c b/xen/arch/x86/pv/emulate.c
> index e7a1c0a2cc4f..29425a0a00ec 100644
> --- a/xen/arch/x86/pv/emulate.c
> +++ b/xen/arch/x86/pv/emulate.c
> @@ -71,10 +71,15 @@ void pv_emul_instruction_done(struct cpu_user_regs *regs, unsigned long rip)
>  {
>      regs->rip = rip;
>      regs->eflags &= ~X86_EFLAGS_RF;
> +
>      if ( regs->eflags & X86_EFLAGS_TF )
>      {
> -        current->arch.dr6 |= DR_STEP | DR_STATUS_RESERVED_ONE;
> -        pv_inject_hw_exception(X86_EXC_DB, X86_EVENT_NO_EC);
> +        /*
> +         * TODO, this should generally use TF from the start of the
> +         * instruction.  It's only a latent bug for now, as this path isn't
> +         * used for any instruction which modifies eflags.
> +         */
> +        pv_inject_DB(X86_DR6_BS);
>      }
>  }
>  
> diff --git a/xen/arch/x86/pv/ro-page-fault.c b/xen/arch/x86/pv/ro-page-fault.c
> index cad28ef928ad..f6bb33556e72 100644
> --- a/xen/arch/x86/pv/ro-page-fault.c
> +++ b/xen/arch/x86/pv/ro-page-fault.c
> @@ -389,8 +389,8 @@ int pv_ro_page_fault(unsigned long addr, struct cpu_user_regs *regs)
>  
>          /* Fallthrough */
>      case X86EMUL_OKAY:
> -        if ( ctxt.retire.singlestep )
> -            pv_inject_hw_exception(X86_EXC_DB, X86_EVENT_NO_EC);
> +        if ( ctxt.retire.pending_dbg )
> +            pv_inject_DB(ctxt.retire.pending_dbg);
>  
>          /* Fallthrough */
>      case X86EMUL_RETRY:
> diff --git a/xen/arch/x86/pv/traps.c b/xen/arch/x86/pv/traps.c
> index 74f333da7e1c..553b04bca956 100644
> --- a/xen/arch/x86/pv/traps.c
> +++ b/xen/arch/x86/pv/traps.c
> @@ -13,6 +13,7 @@
>  #include <xen/softirq.h>
>  
>  #include <asm/pv/trace.h>
> +#include <asm/debugreg.h>
>  #include <asm/shared.h>
>  #include <asm/traps.h>
>  #include <irq_vectors.h>
> @@ -50,9 +51,9 @@ void pv_inject_event(const struct x86_event *event)
>      tb->cs    = ti->cs;
>      tb->eip   = ti->address;
>  
> -    if ( event->type == X86_EVENTTYPE_HW_EXCEPTION &&
> -         vector == X86_EXC_PF )
> +    switch ( vector | -(event->type == X86_EVENTTYPE_SW_INTERRUPT) )
>      {
> +    case X86_EXC_PF:
>          curr->arch.pv.ctrlreg[2] = event->cr2;
>          arch_set_cr2(curr, event->cr2);
>  
> @@ -62,9 +63,17 @@ void pv_inject_event(const struct x86_event *event)
>              error_code |= PFEC_user_mode;
>  
>          trace_pv_page_fault(event->cr2, error_code);
> -    }
> -    else
> +        break;
> +
> +    case X86_EXC_DB:
> +        curr->arch.dr6 = x86_merge_dr6(curr->domain->arch.cpu_policy,
> +                                       curr->arch.dr6, event->pending_dbg);
> +        /* Fallthrough */
> +
> +    default:
>          trace_pv_trap(vector, regs->rip, use_error_code, error_code);
> +        break;
> +    }
>  
>      if ( use_error_code )
>      {
> diff --git a/xen/arch/x86/traps.c b/xen/arch/x86/traps.c
> index dead728ce329..447edc827b3a 100644
> --- a/xen/arch/x86/traps.c
> +++ b/xen/arch/x86/traps.c
> @@ -1887,11 +1887,11 @@ void do_device_not_available(struct cpu_user_regs *regs)
>  /* SAF-1-safe */
>  void do_debug(struct cpu_user_regs *regs)
>  {
> -    unsigned long dr6;
> +    unsigned long pending_dbg;
>      struct vcpu *v = current;
>  
> -    /* Stash dr6 as early as possible. */
> -    dr6 = read_debugreg(6);
> +    /* Stash dr6 as early as possible, operating with positive polarity. */
> +    pending_dbg = read_debugreg(6) ^ X86_DR6_DEFAULT;

We don't reset DR6 after reading it, and there is no guarantee that the PV guest
will reset it either, so it doesn't match PENDING_DBG exactly IIRC.

On the other hand, nothing will probably care about its double-accumulating
quirk except perhaps monitor users.

Do we want to document that, shadow DR6 for PV (which seems a little overkill
if we don't trap DR6 access from PV already), or just live with that?

>  
>      /*
>       * At the time of writing (March 2018), on the subject of %dr6:
> @@ -1963,13 +1963,13 @@ void do_debug(struct cpu_user_regs *regs)
>           * If however we do, safety measures need to be enacted.  Use a big
>           * hammer and clear all debug settings.
>           */
> -        if ( dr6 & (DR_TRAP3 | DR_TRAP2 | DR_TRAP1 | DR_TRAP0) )
> +        if ( pending_dbg & X86_DR6_BP_MASK )
>          {
>              unsigned int bp, dr7 = read_debugreg(7);
>  
>              for ( bp = 0; bp < 4; ++bp )
>              {
> -                if ( (dr6 & (1u << bp)) && /* Breakpoint triggered? */
> +                if ( (pending_dbg & (1u << bp)) && /* Breakpoint triggered? */
>                       (dr7 & (3u << (bp * DR_ENABLE_SIZE))) && /* Enabled? */
>                       ((dr7 & (3u << ((bp * DR_CONTROL_SIZE) + /* Insn? */
>                                       DR_CONTROL_SHIFT))) == DR_RW_EXECUTE) )
> @@ -1990,24 +1990,23 @@ void do_debug(struct cpu_user_regs *regs)
>           * so ensure the message is ratelimited.
>           */
>          gprintk(XENLOG_WARNING,
> -                "Hit #DB in Xen context: %04x:%p [%ps], stk %04x:%p, dr6 %lx\n",
> +                "Hit #DB in Xen context: %04x:%p [%ps], stk %04x:%p, pending_dbg %lx\n",
>                  regs->cs, _p(regs->rip), _p(regs->rip),
> -                regs->ss, _p(regs->rsp), dr6);
> +                regs->ss, _p(regs->rsp), pending_dbg);
>  
>          return;
>      }
>  
> -    /* Save debug status register where guest OS can peek at it */
> -    v->arch.dr6 |= (dr6 & ~X86_DR6_DEFAULT);
> -    v->arch.dr6 &= (dr6 | ~X86_DR6_DEFAULT);
> -
>      if ( guest_kernel_mode(v, regs) && v->domain->debugger_attached )
>      {
> +        /* Save debug status register where gdbsx can peek at it */
> +        v->arch.dr6 = x86_merge_dr6(v->domain->arch.cpu_policy,
> +                                    v->arch.dr6, pending_dbg);
>          domain_pause_for_debugger();
>          return;
>      }
>  
> -    pv_inject_hw_exception(X86_EXC_DB, X86_EVENT_NO_EC);
> +    pv_inject_DB(pending_dbg);
>  }
>  
>  /* SAF-1-safe */
Andrew Cooper Sept. 16, 2023, 12:56 p.m. UTC | #2
On 16/09/2023 8:50 am, Jinoh Kang wrote:
> On 9/16/23 05:36, Andrew Cooper wrote:
>> diff --git a/xen/arch/x86/traps.c b/xen/arch/x86/traps.c
>> index dead728ce329..447edc827b3a 100644
>> --- a/xen/arch/x86/traps.c
>> +++ b/xen/arch/x86/traps.c
>> @@ -1887,11 +1887,11 @@ void do_device_not_available(struct cpu_user_regs *regs)
>>  /* SAF-1-safe */
>>  void do_debug(struct cpu_user_regs *regs)
>>  {
>> -    unsigned long dr6;
>> +    unsigned long pending_dbg;
>>      struct vcpu *v = current;
>>  
>> -    /* Stash dr6 as early as possible. */
>> -    dr6 = read_debugreg(6);
>> +    /* Stash dr6 as early as possible, operating with positive polarity. */
>> +    pending_dbg = read_debugreg(6) ^ X86_DR6_DEFAULT;
> We don't reset DR6 after reading it, and there is no guarantee that the PV guest
> will reset it either, so it doesn't match PENDING_DBG exactly IIRC.
>
> On the other hand, nothing will probably care about its double-accumulating
> quirk except perhaps monitor users.
>
> Do we want to document that, shadow DR6 for PV (which seems a little overkill
> if we don't trap DR6 access from PV already), or just live with that?

Different DR6's.

This is Xen responding to a real #DB (most likely from a PV guest, but
maybe from debugging activity in Xen itself), and in ...

>>  
>>      /*
>>       * At the time of writing (March 2018), on the subject of %dr6:

... this piece of context missing from the patch, Xen always writes
X86_DR6_DEFAULT back in order to clear the sticky bits.

This behaviour hasn't changed.  Xen always sees a "clean" DR6 on every
new #DB.

For the PV guest, what matters is ...

>> @@ -1963,13 +1963,13 @@ void do_debug(struct cpu_user_regs *regs)
>>           * If however we do, safety measures need to be enacted.  Use a big
>>           * hammer and clear all debug settings.
>>           */
>> -        if ( dr6 & (DR_TRAP3 | DR_TRAP2 | DR_TRAP1 | DR_TRAP0) )
>> +        if ( pending_dbg & X86_DR6_BP_MASK )
>>          {
>>              unsigned int bp, dr7 = read_debugreg(7);
>>  
>>              for ( bp = 0; bp < 4; ++bp )
>>              {
>> -                if ( (dr6 & (1u << bp)) && /* Breakpoint triggered? */
>> +                if ( (pending_dbg & (1u << bp)) && /* Breakpoint triggered? */
>>                       (dr7 & (3u << (bp * DR_ENABLE_SIZE))) && /* Enabled? */
>>                       ((dr7 & (3u << ((bp * DR_CONTROL_SIZE) + /* Insn? */
>>                                       DR_CONTROL_SHIFT))) == DR_RW_EXECUTE) )
>> @@ -1990,24 +1990,23 @@ void do_debug(struct cpu_user_regs *regs)
>>           * so ensure the message is ratelimited.
>>           */
>>          gprintk(XENLOG_WARNING,
>> -                "Hit #DB in Xen context: %04x:%p [%ps], stk %04x:%p, dr6 %lx\n",
>> +                "Hit #DB in Xen context: %04x:%p [%ps], stk %04x:%p, pending_dbg %lx\n",
>>                  regs->cs, _p(regs->rip), _p(regs->rip),
>> -                regs->ss, _p(regs->rsp), dr6);
>> +                regs->ss, _p(regs->rsp), pending_dbg);
>>  
>>          return;
>>      }
>>  
>> -    /* Save debug status register where guest OS can peek at it */
>> -    v->arch.dr6 |= (dr6 & ~X86_DR6_DEFAULT);
>> -    v->arch.dr6 &= (dr6 | ~X86_DR6_DEFAULT);
>> -
>>      if ( guest_kernel_mode(v, regs) && v->domain->debugger_attached )
>>      {
>> +        /* Save debug status register where gdbsx can peek at it */
>> +        v->arch.dr6 = x86_merge_dr6(v->domain->arch.cpu_policy,
>> +                                    v->arch.dr6, pending_dbg);
>>          domain_pause_for_debugger();
>>          return;
>>      }
>>  
>> -    pv_inject_hw_exception(X86_EXC_DB, X86_EVENT_NO_EC);
>> +    pv_inject_DB(pending_dbg);

... this, which will merge all new pending bits into the guest's DR6.

If the guest chooses not to clean out DR6 each time, then it will see
content accumulate.

To your earlier point of shadowing, we already have to do that.  DR6 is
just one of many registers we need to context switch for the vCPU.

PV guests, being ring-deprivieged, trap into Xen for every DR access,
and Xen handles every one of their #DB exceptions.  This is one reason
why I split the work into several parts.  PV guests are easier to get
sorted properly, and patch 1,2,5,6 are all common improvements relevant
to HVM too.

~Andrew
Jinoh Kang Sept. 16, 2023, 1:10 p.m. UTC | #3
On 9/16/23 21:56, Andrew Cooper wrote:
>> We don't reset DR6 after reading it, and there is no guarantee that the PV guest
>> will reset it either, so it doesn't match PENDING_DBG exactly IIRC.
>>
>> On the other hand, nothing will probably care about its double-accumulating
>> quirk except perhaps monitor users.
>>
>> Do we want to document that, shadow DR6 for PV (which seems a little overkill
>> if we don't trap DR6 access from PV already), or just live with that?
> 
> Different DR6's.
> 
> This is Xen responding to a real #DB (most likely from a PV guest, but
> maybe from debugging activity in Xen itself), and in ...
> 
>>>  
>>>      /*
>>>       * At the time of writing (March 2018), on the subject of %dr6:
> 
> ... this piece of context missing from the patch, Xen always writes
> X86_DR6_DEFAULT back in order to clear the sticky bits.

Oh, that'll do.

How come have I not seen this?  Maybe I was in dire need of morning coffee.
I assumed absence just from a missing context...

> 
> This behaviour hasn't changed.  Xen always sees a "clean" DR6 on every
> new #DB.
> 
> For the PV guest, what matters is ...
> 

(snip)

>>> -    /* Save debug status register where guest OS can peek at it */
>>> -    v->arch.dr6 |= (dr6 & ~X86_DR6_DEFAULT);
>>> -    v->arch.dr6 &= (dr6 | ~X86_DR6_DEFAULT);
>>> -
>>>      if ( guest_kernel_mode(v, regs) && v->domain->debugger_attached )
>>>      {
>>> +        /* Save debug status register where gdbsx can peek at it */
>>> +        v->arch.dr6 = x86_merge_dr6(v->domain->arch.cpu_policy,
>>> +                                    v->arch.dr6, pending_dbg);
>>>          domain_pause_for_debugger();
>>>          return;
>>>      }
>>>  
>>> -    pv_inject_hw_exception(X86_EXC_DB, X86_EVENT_NO_EC);
>>> +    pv_inject_DB(pending_dbg);
> 
> ... this, which will merge all new pending bits into the guest's DR6.
> 
> If the guest chooses not to clean out DR6 each time, then it will see
> content accumulate.
> 
> To your earlier point of shadowing, we already have to do that.  DR6 is
> just one of many registers we need to context switch for the vCPU.
> 
> PV guests, being ring-deprivieged, trap into Xen for every DR access,
> and Xen handles every one of their #DB exceptions.  This is one reason
> why I split the work into several parts.  PV guests are easier to get
> sorted properly, and patch 1,2,5,6 are all common improvements relevant
> to HVM too.

That confirms my knowledge.  Honestly if I got such a foolish remark
I would question the reviewer's understanding of PV mode (not that you did
anything wrong).  Sorry for wasting your time.

> 
> ~Andrew
Andrew Cooper Sept. 16, 2023, 1:39 p.m. UTC | #4
On 16/09/2023 2:10 pm, Jinoh Kang wrote:
> On 9/16/23 21:56, Andrew Cooper wrote:
>>>> -    /* Save debug status register where guest OS can peek at it */
>>>> -    v->arch.dr6 |= (dr6 & ~X86_DR6_DEFAULT);
>>>> -    v->arch.dr6 &= (dr6 | ~X86_DR6_DEFAULT);
>>>> -
>>>>      if ( guest_kernel_mode(v, regs) && v->domain->debugger_attached )
>>>>      {
>>>> +        /* Save debug status register where gdbsx can peek at it */
>>>> +        v->arch.dr6 = x86_merge_dr6(v->domain->arch.cpu_policy,
>>>> +                                    v->arch.dr6, pending_dbg);
>>>>          domain_pause_for_debugger();
>>>>          return;
>>>>      }
>>>>  
>>>> -    pv_inject_hw_exception(X86_EXC_DB, X86_EVENT_NO_EC);
>>>> +    pv_inject_DB(pending_dbg);
>> ... this, which will merge all new pending bits into the guest's DR6.
>>
>> If the guest chooses not to clean out DR6 each time, then it will see
>> content accumulate.
>>
>> To your earlier point of shadowing, we already have to do that.  DR6 is
>> just one of many registers we need to context switch for the vCPU.
>>
>> PV guests, being ring-deprivieged, trap into Xen for every DR access,
>> and Xen handles every one of their #DB exceptions.  This is one reason
>> why I split the work into several parts.  PV guests are easier to get
>> sorted properly, and patch 1,2,5,6 are all common improvements relevant
>> to HVM too.
> That confirms my knowledge.  Honestly if I got such a foolish remark
> I would question the reviewer's understanding of PV mode (not that you did
> anything wrong).  Sorry for wasting your time.

Not at all.

This is exceptionally tricky, and as you can see from the v1 series,
don't assume that I know it all perfectly.

The specifics of PV guests are unique to Xen, and there are 0 people who
understand it fully.  For example, doing the sanity testing for this v2
series, I rediscovered the completely undocumented properly that a PV
guests IOPL is separate from an architectural ideal of IOPL, defaults to
0 which has the meaning "guest kernel takes a #GP for access to IO ports".

Please do continue to ask questions like this if you're not sure.  It is
not a waste of time cross-checking that the logic is correct.

~Andrew
Jan Beulich Sept. 18, 2023, 11:46 a.m. UTC | #5
On 15.09.2023 22:36, Andrew Cooper wrote:
> All #DB exceptions result in an update of %dr6, but this isn't handled
> properly by Xen for any guest type.
> 
> Remove all ad-hoc dr6 handling, leaving it to pv_inject_event() in most cases
> and using the new x86_merge_dr6() helper.
> 
> In do_debug(), swap the dr6 to pending_dbg in order to operate entirely with
> positive polarity.  Among other things, this helps spot RTM/BLD in the
> diagnostic message.
> 
> Drop the unconditional v->arch.dr6 adjustment.  pv_inject_event() performs the
> adjustment in the common case, but retain the prior behaviour if a debugger is
> attached.
> 
> Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com>

Reviewed-by: Jan Beulich <jbeulich@suse.com>

One minor aspect to consider:

> @@ -1990,24 +1990,23 @@ void do_debug(struct cpu_user_regs *regs)
>           * so ensure the message is ratelimited.
>           */
>          gprintk(XENLOG_WARNING,
> -                "Hit #DB in Xen context: %04x:%p [%ps], stk %04x:%p, dr6 %lx\n",
> +                "Hit #DB in Xen context: %04x:%p [%ps], stk %04x:%p, pending_dbg %lx\n",

Would you mind shorting to just "pending"?

Jan
diff mbox series

Patch

diff --git a/xen/arch/x86/pv/emul-priv-op.c b/xen/arch/x86/pv/emul-priv-op.c
index 2f73ed0682e1..fe2011f28e34 100644
--- a/xen/arch/x86/pv/emul-priv-op.c
+++ b/xen/arch/x86/pv/emul-priv-op.c
@@ -1364,10 +1364,7 @@  int pv_emulate_privileged_op(struct cpu_user_regs *regs)
         ASSERT(!curr->arch.pv.trap_bounce.flags);
 
         if ( ctxt.ctxt.retire.pending_dbg )
-        {
-            curr->arch.dr6 |= ctxt.ctxt.retire.pending_dbg | DR_STATUS_RESERVED_ONE;
-            pv_inject_hw_exception(X86_EXC_DB, X86_EVENT_NO_EC);
-        }
+            pv_inject_DB(ctxt.ctxt.retire.pending_dbg);
 
         /* fall through */
     case X86EMUL_RETRY:
diff --git a/xen/arch/x86/pv/emulate.c b/xen/arch/x86/pv/emulate.c
index e7a1c0a2cc4f..29425a0a00ec 100644
--- a/xen/arch/x86/pv/emulate.c
+++ b/xen/arch/x86/pv/emulate.c
@@ -71,10 +71,15 @@  void pv_emul_instruction_done(struct cpu_user_regs *regs, unsigned long rip)
 {
     regs->rip = rip;
     regs->eflags &= ~X86_EFLAGS_RF;
+
     if ( regs->eflags & X86_EFLAGS_TF )
     {
-        current->arch.dr6 |= DR_STEP | DR_STATUS_RESERVED_ONE;
-        pv_inject_hw_exception(X86_EXC_DB, X86_EVENT_NO_EC);
+        /*
+         * TODO, this should generally use TF from the start of the
+         * instruction.  It's only a latent bug for now, as this path isn't
+         * used for any instruction which modifies eflags.
+         */
+        pv_inject_DB(X86_DR6_BS);
     }
 }
 
diff --git a/xen/arch/x86/pv/ro-page-fault.c b/xen/arch/x86/pv/ro-page-fault.c
index cad28ef928ad..f6bb33556e72 100644
--- a/xen/arch/x86/pv/ro-page-fault.c
+++ b/xen/arch/x86/pv/ro-page-fault.c
@@ -389,8 +389,8 @@  int pv_ro_page_fault(unsigned long addr, struct cpu_user_regs *regs)
 
         /* Fallthrough */
     case X86EMUL_OKAY:
-        if ( ctxt.retire.singlestep )
-            pv_inject_hw_exception(X86_EXC_DB, X86_EVENT_NO_EC);
+        if ( ctxt.retire.pending_dbg )
+            pv_inject_DB(ctxt.retire.pending_dbg);
 
         /* Fallthrough */
     case X86EMUL_RETRY:
diff --git a/xen/arch/x86/pv/traps.c b/xen/arch/x86/pv/traps.c
index 74f333da7e1c..553b04bca956 100644
--- a/xen/arch/x86/pv/traps.c
+++ b/xen/arch/x86/pv/traps.c
@@ -13,6 +13,7 @@ 
 #include <xen/softirq.h>
 
 #include <asm/pv/trace.h>
+#include <asm/debugreg.h>
 #include <asm/shared.h>
 #include <asm/traps.h>
 #include <irq_vectors.h>
@@ -50,9 +51,9 @@  void pv_inject_event(const struct x86_event *event)
     tb->cs    = ti->cs;
     tb->eip   = ti->address;
 
-    if ( event->type == X86_EVENTTYPE_HW_EXCEPTION &&
-         vector == X86_EXC_PF )
+    switch ( vector | -(event->type == X86_EVENTTYPE_SW_INTERRUPT) )
     {
+    case X86_EXC_PF:
         curr->arch.pv.ctrlreg[2] = event->cr2;
         arch_set_cr2(curr, event->cr2);
 
@@ -62,9 +63,17 @@  void pv_inject_event(const struct x86_event *event)
             error_code |= PFEC_user_mode;
 
         trace_pv_page_fault(event->cr2, error_code);
-    }
-    else
+        break;
+
+    case X86_EXC_DB:
+        curr->arch.dr6 = x86_merge_dr6(curr->domain->arch.cpu_policy,
+                                       curr->arch.dr6, event->pending_dbg);
+        /* Fallthrough */
+
+    default:
         trace_pv_trap(vector, regs->rip, use_error_code, error_code);
+        break;
+    }
 
     if ( use_error_code )
     {
diff --git a/xen/arch/x86/traps.c b/xen/arch/x86/traps.c
index dead728ce329..447edc827b3a 100644
--- a/xen/arch/x86/traps.c
+++ b/xen/arch/x86/traps.c
@@ -1887,11 +1887,11 @@  void do_device_not_available(struct cpu_user_regs *regs)
 /* SAF-1-safe */
 void do_debug(struct cpu_user_regs *regs)
 {
-    unsigned long dr6;
+    unsigned long pending_dbg;
     struct vcpu *v = current;
 
-    /* Stash dr6 as early as possible. */
-    dr6 = read_debugreg(6);
+    /* Stash dr6 as early as possible, operating with positive polarity. */
+    pending_dbg = read_debugreg(6) ^ X86_DR6_DEFAULT;
 
     /*
      * At the time of writing (March 2018), on the subject of %dr6:
@@ -1963,13 +1963,13 @@  void do_debug(struct cpu_user_regs *regs)
          * If however we do, safety measures need to be enacted.  Use a big
          * hammer and clear all debug settings.
          */
-        if ( dr6 & (DR_TRAP3 | DR_TRAP2 | DR_TRAP1 | DR_TRAP0) )
+        if ( pending_dbg & X86_DR6_BP_MASK )
         {
             unsigned int bp, dr7 = read_debugreg(7);
 
             for ( bp = 0; bp < 4; ++bp )
             {
-                if ( (dr6 & (1u << bp)) && /* Breakpoint triggered? */
+                if ( (pending_dbg & (1u << bp)) && /* Breakpoint triggered? */
                      (dr7 & (3u << (bp * DR_ENABLE_SIZE))) && /* Enabled? */
                      ((dr7 & (3u << ((bp * DR_CONTROL_SIZE) + /* Insn? */
                                      DR_CONTROL_SHIFT))) == DR_RW_EXECUTE) )
@@ -1990,24 +1990,23 @@  void do_debug(struct cpu_user_regs *regs)
          * so ensure the message is ratelimited.
          */
         gprintk(XENLOG_WARNING,
-                "Hit #DB in Xen context: %04x:%p [%ps], stk %04x:%p, dr6 %lx\n",
+                "Hit #DB in Xen context: %04x:%p [%ps], stk %04x:%p, pending_dbg %lx\n",
                 regs->cs, _p(regs->rip), _p(regs->rip),
-                regs->ss, _p(regs->rsp), dr6);
+                regs->ss, _p(regs->rsp), pending_dbg);
 
         return;
     }
 
-    /* Save debug status register where guest OS can peek at it */
-    v->arch.dr6 |= (dr6 & ~X86_DR6_DEFAULT);
-    v->arch.dr6 &= (dr6 | ~X86_DR6_DEFAULT);
-
     if ( guest_kernel_mode(v, regs) && v->domain->debugger_attached )
     {
+        /* Save debug status register where gdbsx can peek at it */
+        v->arch.dr6 = x86_merge_dr6(v->domain->arch.cpu_policy,
+                                    v->arch.dr6, pending_dbg);
         domain_pause_for_debugger();
         return;
     }
 
-    pv_inject_hw_exception(X86_EXC_DB, X86_EVENT_NO_EC);
+    pv_inject_DB(pending_dbg);
 }
 
 /* SAF-1-safe */