diff mbox series

[v8,07/12] x86emul: support FNSTENV and FNSAVE

Message ID 9a2afbb1-af92-2c7d-9fde-d8d8e4563a2a@suse.com (mailing list archive)
State Superseded
Headers show
Series x86emul: further work | expand

Commit Message

Jan Beulich May 5, 2020, 8:15 a.m. UTC
To avoid introducing another boolean into emulator state, the
rex_prefix field gets (ab)used to convey the real/VM86 vs protected mode
info (affecting structure layout, albeit not size) to x86_emul_blk().

Signed-off-by: Jan Beulich <jbeulich@suse.com>
---
TBD: The full 16-bit padding fields in the 32-bit structures get filled
     with all ones by modern CPUs (i.e. other than the comment says for
     FIP and FDP). We may want to mirror this as well (for the real mode
     variant), even if those fields' contents are unspecified.
---
v7: New.

Comments

Jan Beulich May 5, 2020, 12:36 p.m. UTC | #1
On 05.05.2020 10:15, Jan Beulich wrote:
> @@ -11542,6 +11611,12 @@ int x86_emul_blk(
>      switch ( state->blk )
>      {
>          bool zf;
> +        struct {
> +            struct x87_env32 env;
> +            struct {
> +               uint8_t bytes[10];
> +            } freg[8];
> +        } fpstate;

This also needs #ifndef X86EMUL_NO_FPU around it for !HVM builds
to work.

Jan
Andrew Cooper May 8, 2020, 5:58 p.m. UTC | #2
On 05/05/2020 09:15, Jan Beulich wrote:
> To avoid introducing another boolean into emulator state, the
> rex_prefix field gets (ab)used to convey the real/VM86 vs protected mode
> info (affecting structure layout, albeit not size) to x86_emul_blk().
>
> Signed-off-by: Jan Beulich <jbeulich@suse.com>
> ---
> TBD: The full 16-bit padding fields in the 32-bit structures get filled
>      with all ones by modern CPUs (i.e. other than the comment says for

You really mean "unlike" here, rather than "other".  They do not have
the same meaning in this context.

(I think you're also missing a "what", which I'm guessing is just an
oversight.)

>      FIP and FDP). We may want to mirror this as well (for the real mode
>      variant), even if those fields' contents are unspecified.

This is surprising behaviour, but I expect it dates back to external x87
processors and default MMIO behaviour.

If it is entirely consistent, it match be nice to match.  OTOH, the
manuals are very clear that it is reserved, which I think gives us the
liberty to use the easier implementation.

> ---
> v7: New.
>
> --- a/xen/arch/x86/x86_emulate/x86_emulate.c
> +++ b/xen/arch/x86/x86_emulate/x86_emulate.c
> @@ -897,6 +900,50 @@ struct x86_emulate_state {
>  #define PTR_POISON NULL /* 32-bit builds are for user-space, so NULL is OK. */
>  #endif
>  
> +#ifndef X86EMUL_NO_FPU
> +struct x87_env16 {
> +    uint16_t fcw;
> +    uint16_t fsw;
> +    uint16_t ftw;
> +    union {
> +        struct {
> +            uint16_t fip_lo;
> +            uint16_t fop:11, :1, fip_hi:4;
> +            uint16_t fdp_lo;
> +            uint16_t :12, fdp_hi:4;
> +        } real;
> +        struct {
> +            uint16_t fip;
> +            uint16_t fcs;
> +            uint16_t fdp;
> +            uint16_t fds;
> +        } prot;
> +    } mode;
> +};
> +
> +struct x87_env32 {
> +    uint32_t fcw:16, :16;
> +    uint32_t fsw:16, :16;
> +    uint32_t ftw:16, :16;

uint16_t fcw, :16;
uint16_t fsw, :16;
uint16_t ftw, :16;

which reduces the number of 16 bit bitfields.

> +    union {
> +        struct {
> +            /* some CPUs/FPUs also store the full FIP here */
> +            uint32_t fip_lo:16, :16;
> +            uint32_t fop:11, :1, fip_hi:16, :4;
> +            /* some CPUs/FPUs also store the full FDP here */
> +            uint32_t fdp_lo:16, :16;
> +            uint32_t :12, fdp_hi:16, :4;

Annoyingly, two of these lines can't use uint16_t.  I'm torn as to
whether to suggest converting the other two which can.

> +        } real;
> +        struct {
> +            uint32_t fip;
> +            uint32_t fcs:16, fop:11, :5;
> +            uint32_t fdp;
> +            uint32_t fds:16, :16;

These two can be converted safely.

> @@ -4912,9 +4959,19 @@ x86_emulate(
>                      goto done;
>                  emulate_fpu_insn_memsrc(b, modrm_reg & 7, src.val);
>                  break;
> -            case 6: /* fnstenv - TODO */
> +            case 6: /* fnstenv */
> +                fail_if(!ops->blk);
> +                state->blk = blk_fst;
> +                /* REX is meaningless for this insn by this point. */
> +                rex_prefix = in_protmode(ctxt, ops);

Code like this is why I have such a strong objection to obfuscating macros.

It reads as if you're updating a local variable, alongside a comment
explaining that it is meaningless.

It is critically important for clarity that the comment state that
you're (ab)using the field to pass information into ->blk(), and I'd go
so far as suggesting

/*state->*/rex_prefix = in_protmode(ctxt, ops);

to reinforce the point that rex_prefix isn't a local variable, seeing
the obfuscation prevents a real state->rex_prefix from working.

> +                if ( (rc = ops->blk(ea.mem.seg, ea.mem.off, NULL,
> +                                    op_bytes > 2 ? sizeof(struct x87_env32)
> +                                                 : sizeof(struct x87_env16),
> +                                    &_regs.eflags,
> +                                    state, ctxt)) != X86EMUL_OKAY )
> +                    goto done;
>                  state->fpu_ctrl = true;
> -                goto unimplemented_insn;
> +                break;
>              case 7: /* fnstcw m2byte */
>                  state->fpu_ctrl = true;
>              fpu_memdst16:
> @@ -5068,9 +5125,21 @@ x86_emulate(
>                  emulate_fpu_insn_memdst(b, modrm_reg & 7, dst.val);
>                  break;
>              case 4: /* frstor - TODO */
> -            case 6: /* fnsave - TODO */
>                  state->fpu_ctrl = true;
>                  goto unimplemented_insn;
> +            case 6: /* fnsave */
> +                fail_if(!ops->blk);
> +                state->blk = blk_fst;
> +                /* REX is meaningless for this insn by this point. */
> +                rex_prefix = in_protmode(ctxt, ops);
> +                if ( (rc = ops->blk(ea.mem.seg, ea.mem.off, NULL,
> +                                    op_bytes > 2 ? sizeof(struct x87_env32) + 80
> +                                                 : sizeof(struct x87_env16) + 80,
> +                                    &_regs.eflags,
> +                                    state, ctxt)) != X86EMUL_OKAY )
> +                    goto done;
> +                state->fpu_ctrl = true;
> +                break;
>              case 7: /* fnstsw m2byte */
>                  state->fpu_ctrl = true;
>                  goto fpu_memdst16;
> @@ -11542,6 +11611,12 @@ int x86_emul_blk(
>      switch ( state->blk )
>      {
>          bool zf;
> +        struct {
> +            struct x87_env32 env;
> +            struct {
> +               uint8_t bytes[10];
> +            } freg[8];
> +        } fpstate;
>  
>          /*
>           * Throughout this switch(), memory clobbers are used to compensate
> @@ -11571,6 +11646,93 @@ int x86_emul_blk(
>              *eflags |= X86_EFLAGS_ZF;
>          break;
>  
> +#ifndef X86EMUL_NO_FPU
> +
> +    case blk_fst:
> +        ASSERT(!data);
> +
> +        if ( bytes > sizeof(fpstate.env) )
> +            asm ( "fnsave %0" : "=m" (fpstate) );
> +        else
> +            asm ( "fnstenv %0" : "=m" (fpstate.env) );

We have 4 possible sizes to deal with here - the 16 and 32bit formats of
prot vs real/vm86 modes, and it is not clear (code wise) why
sizeof(fpstate.env) is a suitable boundary.

Given that these are legacy instructions and not a hotpath in the
slightest, it is possibly faster (by removing the branch) and definitely
more obvious to use fnsave unconditionally, and derive all of the
smaller layouts that way.

Critically however, it prevents us from needing a CVE/XSA if ... [bottom
comment]

> +
> +        /* state->rex_prefix carries CR0.PE && !EFLAGS.VM setting */
> +        switch ( bytes )
> +        {
> +        case sizeof(fpstate.env):
> +        case sizeof(fpstate):

These case labels don't match up in any kind of obvious way to the
caller.  I think you need /* 32bit FNSAVE */ and /* 32bit FNSTENV */
here, and

> +            if ( !state->rex_prefix )

if ( !state->rex_prefix ) /* Convert 32bit prot to 32bit real/vm86 format */

here.

> +            {
> +                unsigned int fip = fpstate.env.mode.prot.fip +
> +                                   (fpstate.env.mode.prot.fcs << 4);
> +                unsigned int fdp = fpstate.env.mode.prot.fdp +
> +                                   (fpstate.env.mode.prot.fds << 4);
> +                unsigned int fop = fpstate.env.mode.prot.fop;
> +
> +                memset(&fpstate.env.mode, 0, sizeof(fpstate.env.mode));
> +                fpstate.env.mode.real.fip_lo = fip;
> +                fpstate.env.mode.real.fip_hi = fip >> 16;
> +                fpstate.env.mode.real.fop = fop;
> +                fpstate.env.mode.real.fdp_lo = fdp;
> +                fpstate.env.mode.real.fdp_hi = fdp >> 16;
> +            }
> +            memcpy(ptr, &fpstate.env, sizeof(fpstate.env));
> +            if ( bytes == sizeof(fpstate.env) )
> +                ptr = NULL;
> +            else
> +                ptr += sizeof(fpstate.env);
> +            break;
> +
> +        case sizeof(struct x87_env16):
> +        case sizeof(struct x87_env16) + sizeof(fpstate.freg):

Similarly, this wants /* 16bit FNSAVE */ and /* 16bit FNSTENV */.  I'm
tempted to suggest a literal 80 rather than sizeof(fpstate.freg) to
match the caller.

> +            if ( state->rex_prefix )

/* Convert 32bit prot to 16bit prot format */

> +            {
> +                struct x87_env16 *env = ptr;
> +
> +                env->fcw = fpstate.env.fcw;
> +                env->fsw = fpstate.env.fsw;
> +                env->ftw = fpstate.env.ftw;
> +                env->mode.prot.fip = fpstate.env.mode.prot.fip;
> +                env->mode.prot.fcs = fpstate.env.mode.prot.fcs;
> +                env->mode.prot.fdp = fpstate.env.mode.prot.fdp;
> +                env->mode.prot.fds = fpstate.env.mode.prot.fds;
> +            }
> +            else
> +            {

/* Convert 32bit prot to 16bit real/vm86 format */

> +                unsigned int fip = fpstate.env.mode.prot.fip +
> +                                   (fpstate.env.mode.prot.fcs << 4);
> +                unsigned int fdp = fpstate.env.mode.prot.fdp +
> +                                   (fpstate.env.mode.prot.fds << 4);
> +                struct x87_env16 env = {
> +                    .fcw = fpstate.env.fcw,
> +                    .fsw = fpstate.env.fsw,
> +                    .ftw = fpstate.env.ftw,
> +                    .mode.real.fip_lo = fip,
> +                    .mode.real.fip_hi = fip >> 16,
> +                    .mode.real.fop = fpstate.env.mode.prot.fop,
> +                    .mode.real.fdp_lo = fdp,
> +                    .mode.real.fdp_hi = fdp >> 16
> +                };
> +
> +                memcpy(ptr, &env, sizeof(env));
> +            }
> +            if ( bytes == sizeof(struct x87_env16) )
> +                ptr = NULL;
> +            else
> +                ptr += sizeof(struct x87_env16);
> +            break;
> +
> +        default:
> +            ASSERT_UNREACHABLE();
> +            return X86EMUL_UNHANDLEABLE;
> +        }
> +
> +        if ( ptr )
> +            memcpy(ptr, fpstate.freg, sizeof(fpstate.freg));

... we get here accidentally, and then copy stack rubble into the guest.

~Andrew

> +        break;
> +
> +#endif /* X86EMUL_NO_FPU */
> +
>      case blk_movdir:
>          switch ( bytes )
>          {
>
Jan Beulich May 13, 2020, 12:07 p.m. UTC | #3
On 08.05.2020 19:58, Andrew Cooper wrote:
> On 05/05/2020 09:15, Jan Beulich wrote:
>> To avoid introducing another boolean into emulator state, the
>> rex_prefix field gets (ab)used to convey the real/VM86 vs protected mode
>> info (affecting structure layout, albeit not size) to x86_emul_blk().
>>
>> Signed-off-by: Jan Beulich <jbeulich@suse.com>
>> ---
>> TBD: The full 16-bit padding fields in the 32-bit structures get filled
>>      with all ones by modern CPUs (i.e. other than the comment says for
> 
> You really mean "unlike" here, rather than "other".  They do not have
> the same meaning in this context.
> 
> (I think you're also missing a "what", which I'm guessing is just an
> oversight.)

Well, it's really s/other than/unlike what/ then afaics.

>>      FIP and FDP). We may want to mirror this as well (for the real mode
>>      variant), even if those fields' contents are unspecified.
> 
> This is surprising behaviour, but I expect it dates back to external x87
> processors and default MMIO behaviour.
> 
> If it is entirely consistent, it match be nice to match.  OTOH, the
> manuals are very clear that it is reserved, which I think gives us the
> liberty to use the easier implementation.

I've checked in on an AMD system meanwhile, and it's the same
there. The mentioned comment really refers to observations back
on a 386/387 pair. I think really old CPUs didn't write the
full 16-bit padding fields at all, and the 386/387 then started
writing full 32 bits of FIP and FDP alongside their "high 16
bits" secondary fields. I further assume that this "don't
write parts of the struct at all" behavior was considered
unsafe, or unhelpful when trying to write things out in bigger
chunks (ideally in full cachelines).

I'll leave this as is for now; we can still consider to store
all ones there later on.

>> --- a/xen/arch/x86/x86_emulate/x86_emulate.c
>> +++ b/xen/arch/x86/x86_emulate/x86_emulate.c
>> @@ -897,6 +900,50 @@ struct x86_emulate_state {
>>  #define PTR_POISON NULL /* 32-bit builds are for user-space, so NULL is OK. */
>>  #endif
>>  
>> +#ifndef X86EMUL_NO_FPU
>> +struct x87_env16 {
>> +    uint16_t fcw;
>> +    uint16_t fsw;
>> +    uint16_t ftw;
>> +    union {
>> +        struct {
>> +            uint16_t fip_lo;
>> +            uint16_t fop:11, :1, fip_hi:4;
>> +            uint16_t fdp_lo;
>> +            uint16_t :12, fdp_hi:4;
>> +        } real;
>> +        struct {
>> +            uint16_t fip;
>> +            uint16_t fcs;
>> +            uint16_t fdp;
>> +            uint16_t fds;
>> +        } prot;
>> +    } mode;
>> +};
>> +
>> +struct x87_env32 {
>> +    uint32_t fcw:16, :16;
>> +    uint32_t fsw:16, :16;
>> +    uint32_t ftw:16, :16;
> 
> uint16_t fcw, :16;
> uint16_t fsw, :16;
> uint16_t ftw, :16;
> 
> which reduces the number of 16 bit bitfields.

I'm unconvinced of this being helpful in any way. My goal here
was really to consistently use all uint16_t in the 16-bit
struct, and all uint32_t in the 32-bit one, not the least
after ...

>> +    union {
>> +        struct {
>> +            /* some CPUs/FPUs also store the full FIP here */
>> +            uint32_t fip_lo:16, :16;
>> +            uint32_t fop:11, :1, fip_hi:16, :4;
>> +            /* some CPUs/FPUs also store the full FDP here */
>> +            uint32_t fdp_lo:16, :16;
>> +            uint32_t :12, fdp_hi:16, :4;
> 
> Annoyingly, two of these lines can't use uint16_t.  I'm torn as to
> whether to suggest converting the other two which can.

... observing this. (Really I had it the other way around
initially. I'd be okay to switch back if there was a half
way compelling argument - reducing the number of 16-bit
bitfields doesn't sound like one to me, though, unless
there are implications from this that I don't see.)

>> @@ -11571,6 +11646,93 @@ int x86_emul_blk(
>>              *eflags |= X86_EFLAGS_ZF;
>>          break;
>>  
>> +#ifndef X86EMUL_NO_FPU
>> +
>> +    case blk_fst:
>> +        ASSERT(!data);
>> +
>> +        if ( bytes > sizeof(fpstate.env) )
>> +            asm ( "fnsave %0" : "=m" (fpstate) );
>> +        else
>> +            asm ( "fnstenv %0" : "=m" (fpstate.env) );
> 
> We have 4 possible sizes to deal with here - the 16 and 32bit formats of
> prot vs real/vm86 modes, and it is not clear (code wise) why
> sizeof(fpstate.env) is a suitable boundary.

See the definitons of struct x87_env16 and struct x87_env32:
They're intentionally in part using a union, to make more
obvious that in fact there's just two different sizes to deal
with.

> Given that these are legacy instructions and not a hotpath in the
> slightest, it is possibly faster (by removing the branch) and definitely
> more obvious to use fnsave unconditionally, and derive all of the
> smaller layouts that way.

I can accept the "not a hotpath" argument, but I'm against
using insns other than the intended one for no good reason.

> Critically however, it prevents us from needing a CVE/XSA if ... [bottom
> comment]

This is a legitimate concern, but imo not to be addressed by
using FNSAVE uniformly: There being fields which have
undefined contents even with FNSTENV (and which hence in
principle could not get written at all), I'm instead going
to memset() the entire structure upfront. I'll use 0 for
now, but using ~0 might be an option to fill the padding
fields (see above); the downside then would be that we'd
also fill the less-than-16-bit padding fields this way,
where hardware stores 0 (and where we are at risk of breaking
consumers which don't mask as necessary).

Jan
diff mbox series

Patch

--- a/tools/tests/x86_emulator/x86-emulate.h
+++ b/tools/tests/x86_emulator/x86-emulate.h
@@ -120,6 +120,7 @@  static inline bool xcr0_mask(uint64_t ma
 }
 
 #define cache_line_size() (cp.basic.clflush_size * 8)
+#define cpu_has_fpu        cp.basic.fpu
 #define cpu_has_mmx        cp.basic.mmx
 #define cpu_has_fxsr       cp.basic.fxsr
 #define cpu_has_sse        cp.basic.sse
--- a/tools/tests/x86_emulator/test_x86_emulator.c
+++ b/tools/tests/x86_emulator/test_x86_emulator.c
@@ -748,6 +748,25 @@  static struct x86_emulate_ops emulops =
 
 #define MMAP_ADDR 0x100000
 
+/*
+ * 64-bit OSes may not (be able to) properly restore the two selectors in
+ * the FPU environment. Zap them so that memcmp() on two saved images will
+ * work regardless of whether a context switch occurred in the middle.
+ */
+static void zap_fpsel(unsigned int *env, bool is_32bit)
+{
+    if ( is_32bit )
+    {
+        env[4] &= ~0xffff;
+        env[6] &= ~0xffff;
+    }
+    else
+    {
+        env[2] &= ~0xffff;
+        env[3] &= ~0xffff;
+    }
+}
+
 #ifdef __x86_64__
 # define STKVAL_DISP 64
 static const struct {
@@ -2394,6 +2413,62 @@  int main(int argc, char **argv)
         printf("okay\n");
     }
     else
+        printf("skipped\n");
+
+    printf("%-40s", "Testing fnstenv 4(%ecx)...");
+    if ( stack_exec && cpu_has_fpu )
+    {
+        const uint16_t three = 3;
+
+        asm volatile ( "fninit\n\t"
+                       "fld1\n\t"
+                       "fidivs %1\n\t"
+                       "fstenv %0"
+                       : "=m" (res[9]) : "m" (three) : "memory" );
+        zap_fpsel(&res[9], true);
+        instr[0] = 0xd9; instr[1] = 0x71; instr[2] = 0x04;
+        regs.eip = (unsigned long)&instr[0];
+        regs.ecx = (unsigned long)res;
+        res[8] = 0xaa55aa55;
+        rc = x86_emulate(&ctxt, &emulops);
+        zap_fpsel(&res[1], true);
+        if ( (rc != X86EMUL_OKAY) ||
+             memcmp(res + 1, res + 9, 28) ||
+             res[8] != 0xaa55aa55 ||
+             (regs.eip != (unsigned long)&instr[3]) )
+            goto fail;
+        printf("okay\n");
+    }
+    else
+        printf("skipped\n");
+
+    printf("%-40s", "Testing 16-bit fnsave (%ecx)...");
+    if ( stack_exec && cpu_has_fpu )
+    {
+        const uint16_t five = 5;
+
+        asm volatile ( "fninit\n\t"
+                       "fld1\n\t"
+                       "fidivs %1\n\t"
+                       "fsaves %0"
+                       : "=m" (res[25]) : "m" (five) : "memory" );
+        zap_fpsel(&res[25], false);
+        asm volatile ( "frstors %0" :: "m" (res[25]) : "memory" );
+        instr[0] = 0x66; instr[1] = 0xdd; instr[2] = 0x31;
+        regs.eip = (unsigned long)&instr[0];
+        regs.ecx = (unsigned long)res;
+        res[23] = 0xaa55aa55;
+        res[24] = 0xaa55aa55;
+        rc = x86_emulate(&ctxt, &emulops);
+        if ( (rc != X86EMUL_OKAY) ||
+             memcmp(res, res + 25, 94) ||
+             (res[23] >> 16) != 0xaa55 ||
+             res[24] != 0xaa55aa55 ||
+             (regs.eip != (unsigned long)&instr[3]) )
+            goto fail;
+        printf("okay\n");
+    }
+    else
         printf("skipped\n");
 
     printf("%-40s", "Testing movq %mm3,(%ecx)...");
--- a/xen/arch/x86/x86_emulate/x86_emulate.c
+++ b/xen/arch/x86/x86_emulate/x86_emulate.c
@@ -856,6 +856,9 @@  struct x86_emulate_state {
     enum {
         blk_NONE,
         blk_enqcmd,
+#ifndef X86EMUL_NO_FPU
+        blk_fst, /* FNSTENV, FNSAVE */
+#endif
         blk_movdir,
     } blk;
     uint8_t modrm, modrm_mod, modrm_reg, modrm_rm;
@@ -897,6 +900,50 @@  struct x86_emulate_state {
 #define PTR_POISON NULL /* 32-bit builds are for user-space, so NULL is OK. */
 #endif
 
+#ifndef X86EMUL_NO_FPU
+struct x87_env16 {
+    uint16_t fcw;
+    uint16_t fsw;
+    uint16_t ftw;
+    union {
+        struct {
+            uint16_t fip_lo;
+            uint16_t fop:11, :1, fip_hi:4;
+            uint16_t fdp_lo;
+            uint16_t :12, fdp_hi:4;
+        } real;
+        struct {
+            uint16_t fip;
+            uint16_t fcs;
+            uint16_t fdp;
+            uint16_t fds;
+        } prot;
+    } mode;
+};
+
+struct x87_env32 {
+    uint32_t fcw:16, :16;
+    uint32_t fsw:16, :16;
+    uint32_t ftw:16, :16;
+    union {
+        struct {
+            /* some CPUs/FPUs also store the full FIP here */
+            uint32_t fip_lo:16, :16;
+            uint32_t fop:11, :1, fip_hi:16, :4;
+            /* some CPUs/FPUs also store the full FDP here */
+            uint32_t fdp_lo:16, :16;
+            uint32_t :12, fdp_hi:16, :4;
+        } real;
+        struct {
+            uint32_t fip;
+            uint32_t fcs:16, fop:11, :5;
+            uint32_t fdp;
+            uint32_t fds:16, :16;
+        } prot;
+    } mode;
+};
+#endif
+
 typedef union {
     uint64_t mmx;
     uint64_t __attribute__ ((aligned(16))) xmm[2];
@@ -4912,9 +4959,19 @@  x86_emulate(
                     goto done;
                 emulate_fpu_insn_memsrc(b, modrm_reg & 7, src.val);
                 break;
-            case 6: /* fnstenv - TODO */
+            case 6: /* fnstenv */
+                fail_if(!ops->blk);
+                state->blk = blk_fst;
+                /* REX is meaningless for this insn by this point. */
+                rex_prefix = in_protmode(ctxt, ops);
+                if ( (rc = ops->blk(ea.mem.seg, ea.mem.off, NULL,
+                                    op_bytes > 2 ? sizeof(struct x87_env32)
+                                                 : sizeof(struct x87_env16),
+                                    &_regs.eflags,
+                                    state, ctxt)) != X86EMUL_OKAY )
+                    goto done;
                 state->fpu_ctrl = true;
-                goto unimplemented_insn;
+                break;
             case 7: /* fnstcw m2byte */
                 state->fpu_ctrl = true;
             fpu_memdst16:
@@ -5068,9 +5125,21 @@  x86_emulate(
                 emulate_fpu_insn_memdst(b, modrm_reg & 7, dst.val);
                 break;
             case 4: /* frstor - TODO */
-            case 6: /* fnsave - TODO */
                 state->fpu_ctrl = true;
                 goto unimplemented_insn;
+            case 6: /* fnsave */
+                fail_if(!ops->blk);
+                state->blk = blk_fst;
+                /* REX is meaningless for this insn by this point. */
+                rex_prefix = in_protmode(ctxt, ops);
+                if ( (rc = ops->blk(ea.mem.seg, ea.mem.off, NULL,
+                                    op_bytes > 2 ? sizeof(struct x87_env32) + 80
+                                                 : sizeof(struct x87_env16) + 80,
+                                    &_regs.eflags,
+                                    state, ctxt)) != X86EMUL_OKAY )
+                    goto done;
+                state->fpu_ctrl = true;
+                break;
             case 7: /* fnstsw m2byte */
                 state->fpu_ctrl = true;
                 goto fpu_memdst16;
@@ -11542,6 +11611,12 @@  int x86_emul_blk(
     switch ( state->blk )
     {
         bool zf;
+        struct {
+            struct x87_env32 env;
+            struct {
+               uint8_t bytes[10];
+            } freg[8];
+        } fpstate;
 
         /*
          * Throughout this switch(), memory clobbers are used to compensate
@@ -11571,6 +11646,93 @@  int x86_emul_blk(
             *eflags |= X86_EFLAGS_ZF;
         break;
 
+#ifndef X86EMUL_NO_FPU
+
+    case blk_fst:
+        ASSERT(!data);
+
+        if ( bytes > sizeof(fpstate.env) )
+            asm ( "fnsave %0" : "=m" (fpstate) );
+        else
+            asm ( "fnstenv %0" : "=m" (fpstate.env) );
+
+        /* state->rex_prefix carries CR0.PE && !EFLAGS.VM setting */
+        switch ( bytes )
+        {
+        case sizeof(fpstate.env):
+        case sizeof(fpstate):
+            if ( !state->rex_prefix )
+            {
+                unsigned int fip = fpstate.env.mode.prot.fip +
+                                   (fpstate.env.mode.prot.fcs << 4);
+                unsigned int fdp = fpstate.env.mode.prot.fdp +
+                                   (fpstate.env.mode.prot.fds << 4);
+                unsigned int fop = fpstate.env.mode.prot.fop;
+
+                memset(&fpstate.env.mode, 0, sizeof(fpstate.env.mode));
+                fpstate.env.mode.real.fip_lo = fip;
+                fpstate.env.mode.real.fip_hi = fip >> 16;
+                fpstate.env.mode.real.fop = fop;
+                fpstate.env.mode.real.fdp_lo = fdp;
+                fpstate.env.mode.real.fdp_hi = fdp >> 16;
+            }
+            memcpy(ptr, &fpstate.env, sizeof(fpstate.env));
+            if ( bytes == sizeof(fpstate.env) )
+                ptr = NULL;
+            else
+                ptr += sizeof(fpstate.env);
+            break;
+
+        case sizeof(struct x87_env16):
+        case sizeof(struct x87_env16) + sizeof(fpstate.freg):
+            if ( state->rex_prefix )
+            {
+                struct x87_env16 *env = ptr;
+
+                env->fcw = fpstate.env.fcw;
+                env->fsw = fpstate.env.fsw;
+                env->ftw = fpstate.env.ftw;
+                env->mode.prot.fip = fpstate.env.mode.prot.fip;
+                env->mode.prot.fcs = fpstate.env.mode.prot.fcs;
+                env->mode.prot.fdp = fpstate.env.mode.prot.fdp;
+                env->mode.prot.fds = fpstate.env.mode.prot.fds;
+            }
+            else
+            {
+                unsigned int fip = fpstate.env.mode.prot.fip +
+                                   (fpstate.env.mode.prot.fcs << 4);
+                unsigned int fdp = fpstate.env.mode.prot.fdp +
+                                   (fpstate.env.mode.prot.fds << 4);
+                struct x87_env16 env = {
+                    .fcw = fpstate.env.fcw,
+                    .fsw = fpstate.env.fsw,
+                    .ftw = fpstate.env.ftw,
+                    .mode.real.fip_lo = fip,
+                    .mode.real.fip_hi = fip >> 16,
+                    .mode.real.fop = fpstate.env.mode.prot.fop,
+                    .mode.real.fdp_lo = fdp,
+                    .mode.real.fdp_hi = fdp >> 16
+                };
+
+                memcpy(ptr, &env, sizeof(env));
+            }
+            if ( bytes == sizeof(struct x87_env16) )
+                ptr = NULL;
+            else
+                ptr += sizeof(struct x87_env16);
+            break;
+
+        default:
+            ASSERT_UNREACHABLE();
+            return X86EMUL_UNHANDLEABLE;
+        }
+
+        if ( ptr )
+            memcpy(ptr, fpstate.freg, sizeof(fpstate.freg));
+        break;
+
+#endif /* X86EMUL_NO_FPU */
+
     case blk_movdir:
         switch ( bytes )
         {