diff mbox

[v3,03/10] target/ppc: support for 32-bit carry and overflow

Message ID 1487763883-4877-4-git-send-email-nikunj@linux.vnet.ibm.com (mailing list archive)
State New, archived
Headers show

Commit Message

Nikunj A. Dadhania Feb. 22, 2017, 11:44 a.m. UTC
POWER ISA 3.0 adds CA32 and OV32 status in 64-bit mode. Add the flags
and corresponding defines.

Moreover, CA32 is updated when CA is updated and OV32 is updated when OV
is updated.

Arithmetic instructions:
    * Addition and Subtraction:

        addic, addic., subfic, addc, subfc, adde, subfe, addme, subfme,
        addze, and subfze always update CA and CA32.

        => CA reflects the carry out of bit 0 in 64-bit mode and out of
           bit 32 in 32-bit mode.
        => CA32 reflects the carry out of bit 32 independent of the
           mode.

        => SO and OV reflect overflow of the 64-bit result in 64-bit
           mode and overflow of the low-order 32-bit result in 32-bit
           mode
        => OV32 reflects overflow of the low-order 32-bit result
           independent of the mode

    * Multiply Low and Divide:

        For mulld, divd, divde, divdu and divdeu: SO, OV, and OV32 bits
        reflect overflow of the 64-bit result

        For mullw, divw, divwe, divwu and divweu: SO, OV, and OV32 bits
        reflect overflow of the 32-bit result

     * Negate with OE=1 (nego)

       For 64-bit mode if the register RA contains
       0x8000_0000_0000_0000, OV and OV32 are set to 1.

       For 32-bit mode if the register RA contains 0x8000_0000, OV and
       OV32 are set to 1.

Signed-off-by: Nikunj A Dadhania <nikunj@linux.vnet.ibm.com>
---
 target/ppc/cpu.c            | 19 +++++++++++++++++--
 target/ppc/cpu.h            |  7 +++++++
 target/ppc/translate.c      | 29 ++++++++++++++++++++++++-----
 target/ppc/translate_init.c |  4 ++--
 4 files changed, 50 insertions(+), 9 deletions(-)

Comments

Richard Henderson Feb. 22, 2017, 5:17 p.m. UTC | #1
On 02/22/2017 10:44 PM, Nikunj A Dadhania wrote:
> POWER ISA 3.0 adds CA32 and OV32 status in 64-bit mode. Add the flags
> and corresponding defines.
>
> Moreover, CA32 is updated when CA is updated and OV32 is updated when OV
> is updated.
>
> Arithmetic instructions:
>     * Addition and Substractions:
>
>         addic, addic., subfic, addc, subfc, adde, subfe, addme, subfme,
>         addze, and subfze always updates CA and CA32.
>
>         => CA reflects the carry out of bit 0 in 64-bit mode and out of
>            bit 32 in 32-bit mode.
>         => CA32 reflects the carry out of bit 32 independent of the
>            mode.
>
>         => SO and OV reflects overflow of the 64-bit result in 64-bit
>            mode and overflow of the low-order 32-bit result in 32-bit
>            mode
>         => OV32 reflects overflow of the low-order 32-bit independent of
>            the mode
>
>     * Multiply Low and Divide:
>
>         For mulld, divd, divde, divdu and divdeu: SO, OV, and OV32 bits
>         reflects overflow of the 64-bit result
>
>         For mullw, divw, divwe, divwu and divweu: SO, OV, and OV32 bits
>         reflects overflow of the 32-bit result
>
>      * Negate with OE=1 (nego)
>
>        For 64-bit mode if the register RA contains
>        0x8000_0000_0000_0000, OV and OV32 are set to 1.
>
>        For 32-bit mode if the register RA contains 0x8000_0000, OV and
>        OV32 are set to 1.
>
> Signed-off-by: Nikunj A Dadhania <nikunj@linux.vnet.ibm.com>
> ---
>  target/ppc/cpu.c            | 19 +++++++++++++++++--
>  target/ppc/cpu.h            |  7 +++++++
>  target/ppc/translate.c      | 29 ++++++++++++++++++++++++-----
>  target/ppc/translate_init.c |  4 ++--
>  4 files changed, 50 insertions(+), 9 deletions(-)
>
> diff --git a/target/ppc/cpu.c b/target/ppc/cpu.c
> index de3004b..89c1ccb 100644
> --- a/target/ppc/cpu.c
> +++ b/target/ppc/cpu.c
> @@ -23,8 +23,15 @@
>
>  target_ulong cpu_read_xer(CPUPPCState *env)
>  {
> -    return env->xer | (env->so << XER_SO) | (env->ov << XER_OV) |
> +    target_ulong xer;
> +
> +    xer = env->xer | (env->so << XER_SO) | (env->ov << XER_OV) |
>          (env->ca << XER_CA);
> +
> +    if (is_isa300(env)) {
> +        xer |= (env->ov32 << XER_OV32) | (env->ca32 << XER_CA32);
> +    }
> +    return xer;
>  }
>
>  void cpu_write_xer(CPUPPCState *env, target_ulong xer)
> @@ -32,5 +39,13 @@ void cpu_write_xer(CPUPPCState *env, target_ulong xer)
>      env->so = (xer >> XER_SO) & 1;
>      env->ov = (xer >> XER_OV) & 1;
>      env->ca = (xer >> XER_CA) & 1;
> -    env->xer = xer & ~((1u << XER_SO) | (1u << XER_OV) | (1u << XER_CA));
> +    if (is_isa300(env)) {
> +        env->ov32 = (xer >> XER_OV32) & 1;
> +        env->ca32 = (xer >> XER_CA32) & 1;
> +        env->xer = xer & ~((1ul << XER_SO) |
> +                           (1ul << XER_OV) | (1ul << XER_CA) |
> +                           (1ul << XER_OV32) | (1ul << XER_CA32));
> +    } else {
> +        env->xer = xer & ~((1u << XER_SO) | (1u << XER_OV) | (1u << XER_CA));
> +    }
>  }
> diff --git a/target/ppc/cpu.h b/target/ppc/cpu.h
> index b559b67..ee2eb45 100644
> --- a/target/ppc/cpu.h
> +++ b/target/ppc/cpu.h
> @@ -965,6 +965,8 @@ struct CPUPPCState {
>      target_ulong so;
>      target_ulong ov;
>      target_ulong ca;
> +    target_ulong ov32;
> +    target_ulong ca32;
>      /* Reservation address */
>      target_ulong reserve_addr;
>      /* Reservation value */
> @@ -1372,11 +1374,15 @@ int ppc_compat_max_threads(PowerPCCPU *cpu);
>  #define XER_SO  31
>  #define XER_OV  30
>  #define XER_CA  29
> +#define XER_OV32  19
> +#define XER_CA32  18
>  #define XER_CMP  8
>  #define XER_BC   0
>  #define xer_so  (env->so)
>  #define xer_ov  (env->ov)
>  #define xer_ca  (env->ca)
> +#define xer_ov32  (env->ov)
> +#define xer_ca32  (env->ca)
>  #define xer_cmp ((env->xer >> XER_CMP) & 0xFF)
>  #define xer_bc  ((env->xer >> XER_BC)  & 0x7F)
>
> @@ -2343,6 +2349,7 @@ enum {
>
>  /*****************************************************************************/
>
> +#define is_isa300(ctx) (!!(ctx->insns_flags2 & PPC2_ISA300))
>  target_ulong cpu_read_xer(CPUPPCState *env);
>  void cpu_write_xer(CPUPPCState *env, target_ulong xer);
>
> diff --git a/target/ppc/translate.c b/target/ppc/translate.c
> index b09e16f..c9f6768 100644
> --- a/target/ppc/translate.c
> +++ b/target/ppc/translate.c
> @@ -71,7 +71,7 @@ static TCGv cpu_lr;
>  #if defined(TARGET_PPC64)
>  static TCGv cpu_cfar;
>  #endif
> -static TCGv cpu_xer, cpu_so, cpu_ov, cpu_ca;
> +static TCGv cpu_xer, cpu_so, cpu_ov, cpu_ca, cpu_ov32, cpu_ca32;
>  static TCGv cpu_reserve;
>  static TCGv cpu_fpscr;
>  static TCGv_i32 cpu_access_type;
> @@ -173,6 +173,10 @@ void ppc_translate_init(void)
>                                  offsetof(CPUPPCState, ov), "OV");
>      cpu_ca = tcg_global_mem_new(cpu_env,
>                                  offsetof(CPUPPCState, ca), "CA");
> +    cpu_ov32 = tcg_global_mem_new(cpu_env,
> +                                  offsetof(CPUPPCState, ov32), "OV32");
> +    cpu_ca32 = tcg_global_mem_new(cpu_env,
> +                                  offsetof(CPUPPCState, ca32), "CA32");
>
>      cpu_reserve = tcg_global_mem_new(cpu_env,
>                                       offsetof(CPUPPCState, reserve_addr),
> @@ -3703,7 +3707,7 @@ static void gen_tdi(DisasContext *ctx)
>
>  /***                          Processor control                            ***/
>
> -static void gen_read_xer(TCGv dst)
> +static void gen_read_xer(DisasContext *ctx, TCGv dst)
>  {
>      TCGv t0 = tcg_temp_new();
>      TCGv t1 = tcg_temp_new();
> @@ -3715,15 +3719,30 @@ static void gen_read_xer(TCGv dst)
>      tcg_gen_or_tl(t0, t0, t1);
>      tcg_gen_or_tl(dst, dst, t2);
>      tcg_gen_or_tl(dst, dst, t0);
> +    if (is_isa300(ctx)) {
> +        tcg_gen_shli_tl(t0, cpu_ov32, XER_OV32);
> +        tcg_gen_or_tl(dst, dst, t0);
> +        tcg_gen_shli_tl(t0, cpu_ca32, XER_CA32);
> +        tcg_gen_or_tl(dst, dst, t0);
> +    }
>      tcg_temp_free(t0);
>      tcg_temp_free(t1);
>      tcg_temp_free(t2);
>  }
>
> -static void gen_write_xer(TCGv src)
> +static void gen_write_xer(DisasContext *ctx, TCGv src)
>  {
> -    tcg_gen_andi_tl(cpu_xer, src,
> -                    ~((1u << XER_SO) | (1u << XER_OV) | (1u << XER_CA)));
> +    if (is_isa300(ctx)) {
> +        tcg_gen_andi_tl(cpu_xer, src,
> +                        ~((1u << XER_SO) |
> +                          (1u << XER_OV) | (1u << XER_OV32) |
> +                          (1u << XER_CA) | (1u << XER_CA32)));
> +        tcg_gen_extract_tl(cpu_ov32, src, XER_OV32, 1);
> +        tcg_gen_extract_tl(cpu_ca32, src, XER_CA32, 1);
> +    } else {
> +        tcg_gen_andi_tl(cpu_xer, src,
> +                        ~((1u << XER_SO) | (1u << XER_OV) | (1u << XER_CA)));
> +    }

You just determined that power8 does not store all of the bits that are 
written.  We ought to clear more bits here.  Indeed I suspect that the ANDI 
will be able to be shared between these paths.


r~
Richard Henderson Feb. 22, 2017, 5:20 p.m. UTC | #2
Bah.  Hit return too soon...

On 02/22/2017 10:44 PM, Nikunj A Dadhania wrote:
> -static void gen_read_xer(TCGv dst)
> +static void gen_read_xer(DisasContext *ctx, TCGv dst)
>  {
>      TCGv t0 = tcg_temp_new();
>      TCGv t1 = tcg_temp_new();
> @@ -3715,15 +3719,30 @@ static void gen_read_xer(TCGv dst)
>      tcg_gen_or_tl(t0, t0, t1);
>      tcg_gen_or_tl(dst, dst, t2);
>      tcg_gen_or_tl(dst, dst, t0);
> +    if (is_isa300(ctx)) {
> +        tcg_gen_shli_tl(t0, cpu_ov32, XER_OV32);
> +        tcg_gen_or_tl(dst, dst, t0);
> +        tcg_gen_shli_tl(t0, cpu_ca32, XER_CA32);
> +        tcg_gen_or_tl(dst, dst, t0);
> +    }
>      tcg_temp_free(t0);
>      tcg_temp_free(t1);
>      tcg_temp_free(t2);
>  }
>
> -static void gen_write_xer(TCGv src)
> +static void gen_write_xer(DisasContext *ctx, TCGv src)
>  {
> -    tcg_gen_andi_tl(cpu_xer, src,
> -                    ~((1u << XER_SO) | (1u << XER_OV) | (1u << XER_CA)));
> +    if (is_isa300(ctx)) {
> +        tcg_gen_andi_tl(cpu_xer, src,
> +                        ~((1u << XER_SO) |
> +                          (1u << XER_OV) | (1u << XER_OV32) |
> +                          (1u << XER_CA) | (1u << XER_CA32)));
> +        tcg_gen_extract_tl(cpu_ov32, src, XER_OV32, 1);
> +        tcg_gen_extract_tl(cpu_ca32, src, XER_CA32, 1);
> +    } else {
> +        tcg_gen_andi_tl(cpu_xer, src,
> +                        ~((1u << XER_SO) | (1u << XER_OV) | (1u << XER_CA)));
> +    }
>      tcg_gen_extract_tl(cpu_so, src, XER_SO, 1);
>      tcg_gen_extract_tl(cpu_ov, src, XER_OV, 1);
>      tcg_gen_extract_tl(cpu_ca, src, XER_CA, 1);

These functions are becoming quite large.  Are they performance critical enough 
that they need to stay as inline code, or should they be moved to helpers and 
share code with cpu_read/write_xer?


r~
David Gibson Feb. 23, 2017, 3:21 a.m. UTC | #3
On Wed, Feb 22, 2017 at 05:14:36PM +0530, Nikunj A Dadhania wrote:
> POWER ISA 3.0 adds CA32 and OV32 status in 64-bit mode. Add the flags
> and corresponding defines.
> 
> Moreover, CA32 is updated when CA is updated and OV32 is updated when OV
> is updated.
> 
> Arithmetic instructions:
>     * Addition and Substractions:
> 
>         addic, addic., subfic, addc, subfc, adde, subfe, addme, subfme,
>         addze, and subfze always updates CA and CA32.
> 
>         => CA reflects the carry out of bit 0 in 64-bit mode and out of
>            bit 32 in 32-bit mode.
>         => CA32 reflects the carry out of bit 32 independent of the
>            mode.
> 
>         => SO and OV reflects overflow of the 64-bit result in 64-bit
>            mode and overflow of the low-order 32-bit result in 32-bit
>            mode
>         => OV32 reflects overflow of the low-order 32-bit independent of
>            the mode
> 
>     * Multiply Low and Divide:
> 
>         For mulld, divd, divde, divdu and divdeu: SO, OV, and OV32 bits
>         reflects overflow of the 64-bit result
> 
>         For mullw, divw, divwe, divwu and divweu: SO, OV, and OV32 bits
>         reflects overflow of the 32-bit result
> 
>      * Negate with OE=1 (nego)
> 
>        For 64-bit mode if the register RA contains
>        0x8000_0000_0000_0000, OV and OV32 are set to 1.
> 
>        For 32-bit mode if the register RA contains 0x8000_0000, OV and
>        OV32 are set to 1.
> 
> Signed-off-by: Nikunj A Dadhania <nikunj@linux.vnet.ibm.com>
> ---
>  target/ppc/cpu.c            | 19 +++++++++++++++++--
>  target/ppc/cpu.h            |  7 +++++++
>  target/ppc/translate.c      | 29 ++++++++++++++++++++++++-----
>  target/ppc/translate_init.c |  4 ++--
>  4 files changed, 50 insertions(+), 9 deletions(-)
> 
> diff --git a/target/ppc/cpu.c b/target/ppc/cpu.c
> index de3004b..89c1ccb 100644
> --- a/target/ppc/cpu.c
> +++ b/target/ppc/cpu.c
> @@ -23,8 +23,15 @@
>  
>  target_ulong cpu_read_xer(CPUPPCState *env)
>  {
> -    return env->xer | (env->so << XER_SO) | (env->ov << XER_OV) |
> +    target_ulong xer;
> +
> +    xer = env->xer | (env->so << XER_SO) | (env->ov << XER_OV) |
>          (env->ca << XER_CA);
> +
> +    if (is_isa300(env)) {
> +        xer |= (env->ov32 << XER_OV32) | (env->ca32 << XER_CA32);
> +    }
> +    return xer;
>  }
>  
>  void cpu_write_xer(CPUPPCState *env, target_ulong xer)
> @@ -32,5 +39,13 @@ void cpu_write_xer(CPUPPCState *env, target_ulong xer)
>      env->so = (xer >> XER_SO) & 1;
>      env->ov = (xer >> XER_OV) & 1;
>      env->ca = (xer >> XER_CA) & 1;
> -    env->xer = xer & ~((1u << XER_SO) | (1u << XER_OV) | (1u << XER_CA));
> +    if (is_isa300(env)) {
> +        env->ov32 = (xer >> XER_OV32) & 1;
> +        env->ca32 = (xer >> XER_CA32) & 1;

I think these might as well be unconditional - as long as the read_xer
doesn't read the bits back, the guest won't care that we track them in
internal state.

I'm also wondering if it might be worth adding a xer_mask to the env,
instead of explicitly checking isa300 all over the place.

> +        env->xer = xer & ~((1ul << XER_SO) |
> +                           (1ul << XER_OV) | (1ul << XER_CA) |
> +                           (1ul << XER_OV32) | (1ul << XER_CA32));
> +    } else {
> +        env->xer = xer & ~((1u << XER_SO) | (1u << XER_OV) | (1u << XER_CA));
> +    }

And you can definitely use the stricter mask for both archs.  If it's
ISA300, you've stashed them elsewhere; if it's not, those bits are
invalid anyway.

(Incidentally given the modern balance between the cost of
instructions and cachelines, I wonder if all these split out bits of
the XER are a good idea in any case, but that would be a big change
out  of scope for what you're attempting here)

>  }
> diff --git a/target/ppc/cpu.h b/target/ppc/cpu.h
> index b559b67..ee2eb45 100644
> --- a/target/ppc/cpu.h
> +++ b/target/ppc/cpu.h
> @@ -965,6 +965,8 @@ struct CPUPPCState {
>      target_ulong so;
>      target_ulong ov;
>      target_ulong ca;
> +    target_ulong ov32;
> +    target_ulong ca32;
>      /* Reservation address */
>      target_ulong reserve_addr;
>      /* Reservation value */
> @@ -1372,11 +1374,15 @@ int ppc_compat_max_threads(PowerPCCPU *cpu);
>  #define XER_SO  31
>  #define XER_OV  30
>  #define XER_CA  29
> +#define XER_OV32  19
> +#define XER_CA32  18
>  #define XER_CMP  8
>  #define XER_BC   0
>  #define xer_so  (env->so)
>  #define xer_ov  (env->ov)
>  #define xer_ca  (env->ca)
> +#define xer_ov32  (env->ov)
> +#define xer_ca32  (env->ca)
>  #define xer_cmp ((env->xer >> XER_CMP) & 0xFF)
>  #define xer_bc  ((env->xer >> XER_BC)  & 0x7F)
>  
> @@ -2343,6 +2349,7 @@ enum {
>  
>  /*****************************************************************************/
>  
> +#define is_isa300(ctx) (!!(ctx->insns_flags2 & PPC2_ISA300))
>  target_ulong cpu_read_xer(CPUPPCState *env);
>  void cpu_write_xer(CPUPPCState *env, target_ulong xer);
>  
> diff --git a/target/ppc/translate.c b/target/ppc/translate.c
> index b09e16f..c9f6768 100644
> --- a/target/ppc/translate.c
> +++ b/target/ppc/translate.c
> @@ -71,7 +71,7 @@ static TCGv cpu_lr;
>  #if defined(TARGET_PPC64)
>  static TCGv cpu_cfar;
>  #endif
> -static TCGv cpu_xer, cpu_so, cpu_ov, cpu_ca;
> +static TCGv cpu_xer, cpu_so, cpu_ov, cpu_ca, cpu_ov32, cpu_ca32;
>  static TCGv cpu_reserve;
>  static TCGv cpu_fpscr;
>  static TCGv_i32 cpu_access_type;
> @@ -173,6 +173,10 @@ void ppc_translate_init(void)
>                                  offsetof(CPUPPCState, ov), "OV");
>      cpu_ca = tcg_global_mem_new(cpu_env,
>                                  offsetof(CPUPPCState, ca), "CA");
> +    cpu_ov32 = tcg_global_mem_new(cpu_env,
> +                                  offsetof(CPUPPCState, ov32), "OV32");
> +    cpu_ca32 = tcg_global_mem_new(cpu_env,
> +                                  offsetof(CPUPPCState, ca32), "CA32");
>  
>      cpu_reserve = tcg_global_mem_new(cpu_env,
>                                       offsetof(CPUPPCState, reserve_addr),
> @@ -3703,7 +3707,7 @@ static void gen_tdi(DisasContext *ctx)
>  
>  /***                          Processor control                            ***/
>  
> -static void gen_read_xer(TCGv dst)
> +static void gen_read_xer(DisasContext *ctx, TCGv dst)
>  {
>      TCGv t0 = tcg_temp_new();
>      TCGv t1 = tcg_temp_new();
> @@ -3715,15 +3719,30 @@ static void gen_read_xer(TCGv dst)
>      tcg_gen_or_tl(t0, t0, t1);
>      tcg_gen_or_tl(dst, dst, t2);
>      tcg_gen_or_tl(dst, dst, t0);
> +    if (is_isa300(ctx)) {
> +        tcg_gen_shli_tl(t0, cpu_ov32, XER_OV32);
> +        tcg_gen_or_tl(dst, dst, t0);
> +        tcg_gen_shli_tl(t0, cpu_ca32, XER_CA32);
> +        tcg_gen_or_tl(dst, dst, t0);

Could you use 2 deposits here, instead of 2 shifts and 2 ors?

> +    }
>      tcg_temp_free(t0);
>      tcg_temp_free(t1);
>      tcg_temp_free(t2);
>  }
>  
> -static void gen_write_xer(TCGv src)
> +static void gen_write_xer(DisasContext *ctx, TCGv src)
>  {
> -    tcg_gen_andi_tl(cpu_xer, src,
> -                    ~((1u << XER_SO) | (1u << XER_OV) | (1u << XER_CA)));
> +    if (is_isa300(ctx)) {
> +        tcg_gen_andi_tl(cpu_xer, src,
> +                        ~((1u << XER_SO) |
> +                          (1u << XER_OV) | (1u << XER_OV32) |
> +                          (1u << XER_CA) | (1u << XER_CA32)));
> +        tcg_gen_extract_tl(cpu_ov32, src, XER_OV32, 1);
> +        tcg_gen_extract_tl(cpu_ca32, src, XER_CA32, 1);
> +    } else {
> +        tcg_gen_andi_tl(cpu_xer, src,
> +                        ~((1u << XER_SO) | (1u << XER_OV) | (1u << XER_CA)));
> +    }
>      tcg_gen_extract_tl(cpu_so, src, XER_SO, 1);
>      tcg_gen_extract_tl(cpu_ov, src, XER_OV, 1);
>      tcg_gen_extract_tl(cpu_ca, src, XER_CA, 1);
> diff --git a/target/ppc/translate_init.c b/target/ppc/translate_init.c
> index be35cbd..eb667bb 100644
> --- a/target/ppc/translate_init.c
> +++ b/target/ppc/translate_init.c
> @@ -107,12 +107,12 @@ static void spr_access_nop(DisasContext *ctx, int sprn, int gprn)
>  /* XER */
>  static void spr_read_xer (DisasContext *ctx, int gprn, int sprn)
>  {
> -    gen_read_xer(cpu_gpr[gprn]);
> +    gen_read_xer(ctx, cpu_gpr[gprn]);
>  }
>  
>  static void spr_write_xer (DisasContext *ctx, int sprn, int gprn)
>  {
> -    gen_write_xer(cpu_gpr[gprn]);
> +    gen_write_xer(ctx, cpu_gpr[gprn]);
>  }
>  
>  /* LR */
Nikunj A. Dadhania Feb. 23, 2017, 5:09 a.m. UTC | #4
David Gibson <david@gibson.dropbear.id.au> writes:
>> 
>> diff --git a/target/ppc/cpu.c b/target/ppc/cpu.c
>> index de3004b..89c1ccb 100644
>> --- a/target/ppc/cpu.c
>> +++ b/target/ppc/cpu.c
>> @@ -23,8 +23,15 @@
>>  
>>  target_ulong cpu_read_xer(CPUPPCState *env)
>>  {
>> -    return env->xer | (env->so << XER_SO) | (env->ov << XER_OV) |
>> +    target_ulong xer;
>> +
>> +    xer = env->xer | (env->so << XER_SO) | (env->ov << XER_OV) |
>>          (env->ca << XER_CA);
>> +
>> +    if (is_isa300(env)) {
>> +        xer |= (env->ov32 << XER_OV32) | (env->ca32 << XER_CA32);
>> +    }
>> +    return xer;
>>  }
>>  
>>  void cpu_write_xer(CPUPPCState *env, target_ulong xer)
>> @@ -32,5 +39,13 @@ void cpu_write_xer(CPUPPCState *env, target_ulong xer)
>>      env->so = (xer >> XER_SO) & 1;
>>      env->ov = (xer >> XER_OV) & 1;
>>      env->ca = (xer >> XER_CA) & 1;
>> -    env->xer = xer & ~((1u << XER_SO) | (1u << XER_OV) | (1u << XER_CA));
>> +    if (is_isa300(env)) {
>> +        env->ov32 = (xer >> XER_OV32) & 1;
>> +        env->ca32 = (xer >> XER_CA32) & 1;
>
> I think these might as well be unconditional - as long as the read_xer
> doesn't read the bits back, the guest won't care that we track them in
> internal state.

Sure.


> I'm also wondering if it might be worth adding a xer_mask to the env,
> instead of explicitly checking isa300 all over the place.

Let me try that out.

Can we also update ov32/ca32 in all the arithmetic operations as if they
were supported? And, as you suggested, whenever a read is attempted,
only give the relevant bits back (xer_mask). This would save a lot of
conditions in translation (at the cost of a couple more tcg-ops for non-isa300).

>
>> +        env->xer = xer & ~((1ul << XER_SO) |
>> +                           (1ul << XER_OV) | (1ul << XER_CA) |
>> +                           (1ul << XER_OV32) | (1ul << XER_CA32));
>> +    } else {
>> +        env->xer = xer & ~((1u << XER_SO) | (1u << XER_OV) | (1u << XER_CA));
>> +    }
>
> And you can definitely use the stricer mask for both archs.  If it's
> ISA300, you've stashed them elsewhere, if it's not those bits are
> invalid anyway,
>
> (Incidentally given the modern balance between the cost of
> instructions and cachelines, I wonder if all these split out bits of
> the XER are a good idea in any case, but that would be a big change
> out  of scope for what you're attempting here)

Will have a look at this after finishing isa300. I have faced issues
with RISU wrt having the state stashed in different tcg variables.

Regards
Nikunj
David Gibson Feb. 23, 2017, 5:32 a.m. UTC | #5
On Thu, Feb 23, 2017 at 10:39:47AM +0530, Nikunj A Dadhania wrote:
> David Gibson <david@gibson.dropbear.id.au> writes:
> >> 
> >> diff --git a/target/ppc/cpu.c b/target/ppc/cpu.c
> >> index de3004b..89c1ccb 100644
> >> --- a/target/ppc/cpu.c
> >> +++ b/target/ppc/cpu.c
> >> @@ -23,8 +23,15 @@
> >>  
> >>  target_ulong cpu_read_xer(CPUPPCState *env)
> >>  {
> >> -    return env->xer | (env->so << XER_SO) | (env->ov << XER_OV) |
> >> +    target_ulong xer;
> >> +
> >> +    xer = env->xer | (env->so << XER_SO) | (env->ov << XER_OV) |
> >>          (env->ca << XER_CA);
> >> +
> >> +    if (is_isa300(env)) {
> >> +        xer |= (env->ov32 << XER_OV32) | (env->ca32 << XER_CA32);
> >> +    }
> >> +    return xer;
> >>  }
> >>  
> >>  void cpu_write_xer(CPUPPCState *env, target_ulong xer)
> >> @@ -32,5 +39,13 @@ void cpu_write_xer(CPUPPCState *env, target_ulong xer)
> >>      env->so = (xer >> XER_SO) & 1;
> >>      env->ov = (xer >> XER_OV) & 1;
> >>      env->ca = (xer >> XER_CA) & 1;
> >> -    env->xer = xer & ~((1u << XER_SO) | (1u << XER_OV) | (1u << XER_CA));
> >> +    if (is_isa300(env)) {
> >> +        env->ov32 = (xer >> XER_OV32) & 1;
> >> +        env->ca32 = (xer >> XER_CA32) & 1;
> >
> > I think these might as well be unconditional - as long as the read_xer
> > doesn't read the bits back, the guest won't care that we track them in
> > internal state.
> 
> Sure.
> 
> 
> > I'm also wondering if it might be worth adding a xer_mask to the env,
> > instead of explicitly checking isa300 all over the place.
> 
> Let me try that out.
> 
> Can we also update ov32/ca32 in all the arithmetic operations as if its
> supported. And as you suggested, whenever there is a read attempted,
> only give relevant bits back(xer_mask). This would save lot of
> conditions in translations (couple of more tcg-ops for non-isa300)

So if it was a straight trade-off between conditions and math
operations, I'd pick the extra math every time.  However, in this case
we're trading off math on every execution, versus a condition only on
translation, which should occur less often.  So in this case I suspect
it's worth keeping the conditional.

> >> +        env->xer = xer & ~((1ul << XER_SO) |
> >> +                           (1ul << XER_OV) | (1ul << XER_CA) |
> >> +                           (1ul << XER_OV32) | (1ul << XER_CA32));
> >> +    } else {
> >> +        env->xer = xer & ~((1u << XER_SO) | (1u << XER_OV) | (1u << XER_CA));
> >> +    }
> >
> > And you can definitely use the stricer mask for both archs.  If it's
> > ISA300, you've stashed them elsewhere, if it's not those bits are
> > invalid anyway,
> >
> > (Incidentally given the modern balance between the cost of
> > instructions and cachelines, I wonder if all these split out bits of
> > the XER are a good idea in any case, but that would be a big change
> > out  of scope for what you're attempting here)
> 
> Will have a look at this after finishing isa300. I have faced issues
> with RISU wrt having the state stashed in different tcg variables.

Thanks.
Nikunj A. Dadhania Feb. 23, 2017, 6:40 a.m. UTC | #6
Richard Henderson <rth@twiddle.net> writes:

> Bah.  Hit return too soon...
>
> On 02/22/2017 10:44 PM, Nikunj A Dadhania wrote:
>> -static void gen_read_xer(TCGv dst)
>> +static void gen_read_xer(DisasContext *ctx, TCGv dst)
>>  {
>>      TCGv t0 = tcg_temp_new();
>>      TCGv t1 = tcg_temp_new();
>> @@ -3715,15 +3719,30 @@ static void gen_read_xer(TCGv dst)
>>      tcg_gen_or_tl(t0, t0, t1);
>>      tcg_gen_or_tl(dst, dst, t2);
>>      tcg_gen_or_tl(dst, dst, t0);
>> +    if (is_isa300(ctx)) {
>> +        tcg_gen_shli_tl(t0, cpu_ov32, XER_OV32);
>> +        tcg_gen_or_tl(dst, dst, t0);
>> +        tcg_gen_shli_tl(t0, cpu_ca32, XER_CA32);
>> +        tcg_gen_or_tl(dst, dst, t0);
>> +    }
>>      tcg_temp_free(t0);
>>      tcg_temp_free(t1);
>>      tcg_temp_free(t2);
>>  }
>>
>> -static void gen_write_xer(TCGv src)
>> +static void gen_write_xer(DisasContext *ctx, TCGv src)
>>  {
>> -    tcg_gen_andi_tl(cpu_xer, src,
>> -                    ~((1u << XER_SO) | (1u << XER_OV) | (1u << XER_CA)));
>> +    if (is_isa300(ctx)) {
>> +        tcg_gen_andi_tl(cpu_xer, src,
>> +                        ~((1u << XER_SO) |
>> +                          (1u << XER_OV) | (1u << XER_OV32) |
>> +                          (1u << XER_CA) | (1u << XER_CA32)));
>> +        tcg_gen_extract_tl(cpu_ov32, src, XER_OV32, 1);
>> +        tcg_gen_extract_tl(cpu_ca32, src, XER_CA32, 1);
>> +    } else {
>> +        tcg_gen_andi_tl(cpu_xer, src,
>> +                        ~((1u << XER_SO) | (1u << XER_OV) | (1u << XER_CA)));
>> +    }
>>      tcg_gen_extract_tl(cpu_so, src, XER_SO, 1);
>>      tcg_gen_extract_tl(cpu_ov, src, XER_OV, 1);
>>      tcg_gen_extract_tl(cpu_ca, src, XER_CA, 1);
>
> These functions are becoming quite large.  Are they performance critical enough 
> that they need to stay as inline code, or should they be moved to helpers and 
> share code with cpu_read/write_xer?

Just to boot to login prompt, these are the numbers for gen_read/write_xer:

helper_myprint - rd_count 231103, wr_count 68897

And it keeps on incrementing; maybe there is scope for optimization here.

Regards
Nikunj
Nikunj A. Dadhania Feb. 23, 2017, 7:02 a.m. UTC | #7
David Gibson <david@gibson.dropbear.id.au> writes:

> -static void gen_read_xer(TCGv dst)
>> +static void gen_read_xer(DisasContext *ctx, TCGv dst)
>>  {
>>      TCGv t0 = tcg_temp_new();
>>      TCGv t1 = tcg_temp_new();
>> @@ -3715,15 +3719,30 @@ static void gen_read_xer(TCGv dst)
>>      tcg_gen_or_tl(t0, t0, t1);
>>      tcg_gen_or_tl(dst, dst, t2);
>>      tcg_gen_or_tl(dst, dst, t0);
>> +    if (is_isa300(ctx)) {
>> +        tcg_gen_shli_tl(t0, cpu_ov32, XER_OV32);
>> +        tcg_gen_or_tl(dst, dst, t0);
>> +        tcg_gen_shli_tl(t0, cpu_ca32, XER_CA32);
>> +        tcg_gen_or_tl(dst, dst, t0);
>
> Could you use 2 deposits here, instead of 2 shifts and 2 ors?

I checked the implementation of tcg_gen_deposit_i64; the result will have many
more ops than 2 shifts + 2 ors.

Regards,
Nikunj
David Gibson Feb. 23, 2017, 9:29 a.m. UTC | #8
On Thu, Feb 23, 2017 at 12:32:44PM +0530, Nikunj A Dadhania wrote:
> David Gibson <david@gibson.dropbear.id.au> writes:
> 
> > -static void gen_read_xer(TCGv dst)
> >> +static void gen_read_xer(DisasContext *ctx, TCGv dst)
> >>  {
> >>      TCGv t0 = tcg_temp_new();
> >>      TCGv t1 = tcg_temp_new();
> >> @@ -3715,15 +3719,30 @@ static void gen_read_xer(TCGv dst)
> >>      tcg_gen_or_tl(t0, t0, t1);
> >>      tcg_gen_or_tl(dst, dst, t2);
> >>      tcg_gen_or_tl(dst, dst, t0);
> >> +    if (is_isa300(ctx)) {
> >> +        tcg_gen_shli_tl(t0, cpu_ov32, XER_OV32);
> >> +        tcg_gen_or_tl(dst, dst, t0);
> >> +        tcg_gen_shli_tl(t0, cpu_ca32, XER_CA32);
> >> +        tcg_gen_or_tl(dst, dst, t0);
> >
> > Could you use 2 deposits here, instead of 2 shifts and 2 ors?
> 
> I checked the implementation of tcg_gen_deposit_i64, resultant will have much
> more than 2 shifts + 2 ors.

Ok, fair enough.
Richard Henderson Feb. 23, 2017, 10:34 p.m. UTC | #9
On 02/23/2017 05:40 PM, Nikunj A Dadhania wrote:
> Richard Henderson <rth@twiddle.net> writes:
>> These functions are becoming quite large.  Are they performance critical enough
>> that they need to stay as inline code, or should they be moved to helpers and
>> share code with cpu_read/write_xer?
>
> Just to boot to login prompt, these are the numbers for gen_read/write_xer:
>
> helper_myprint - rd_count 231103, wr_count 68897
>
> And it keeps on incrementing, maybe scope of optimization here.

That's not very large considering the total number of instructions executed 
during a boot to prompt.

Thoughts, David?


r~
Richard Henderson Feb. 23, 2017, 10:36 p.m. UTC | #10
On 02/23/2017 06:02 PM, Nikunj A Dadhania wrote:
> David Gibson <david@gibson.dropbear.id.au> writes:
>
>> -static void gen_read_xer(TCGv dst)
>>> +static void gen_read_xer(DisasContext *ctx, TCGv dst)
>>>  {
>>>      TCGv t0 = tcg_temp_new();
>>>      TCGv t1 = tcg_temp_new();
>>> @@ -3715,15 +3719,30 @@ static void gen_read_xer(TCGv dst)
>>>      tcg_gen_or_tl(t0, t0, t1);
>>>      tcg_gen_or_tl(dst, dst, t2);
>>>      tcg_gen_or_tl(dst, dst, t0);
>>> +    if (is_isa300(ctx)) {
>>> +        tcg_gen_shli_tl(t0, cpu_ov32, XER_OV32);
>>> +        tcg_gen_or_tl(dst, dst, t0);
>>> +        tcg_gen_shli_tl(t0, cpu_ca32, XER_CA32);
>>> +        tcg_gen_or_tl(dst, dst, t0);
>>
>> Could you use 2 deposits here, instead of 2 shifts and 2 ors?
>
> I checked the implementation of tcg_gen_deposit_i64, resultant will have much
> more than 2 shifts + 2 ors.

Well, that depends on the host.  For a host that implements deposit, like 
aarch64 or ppc64, it will be one instruction.


r~
David Gibson Feb. 23, 2017, 10:53 p.m. UTC | #11
On Fri, Feb 24, 2017 at 09:34:32AM +1100, Richard Henderson wrote:
> On 02/23/2017 05:40 PM, Nikunj A Dadhania wrote:
> > Richard Henderson <rth@twiddle.net> writes:
> > > These functions are becoming quite large.  Are they performance critical enough
> > > that they need to stay as inline code, or should they be moved to helpers and
> > > share code with cpu_read/write_xer?
> > 
> > Just to boot to login prompt, these are the numbers for gen_read/write_xer:
> > 
> > helper_myprint - rd_count 231103, wr_count 68897
> > 
> > And it keeps on incrementing, maybe scope of optimization here.
> 
> That's not very large considering the total number of instructions executed
> during a boot to prompt.
> 
> Thoughts, David?

Hm, I'm not clear if that's the number of executions, or the number of
translations.
Nikunj Dadhania Feb. 24, 2017, 12:41 a.m. UTC | #12
On 24 February 2017 at 04:23, David Gibson <david@gibson.dropbear.id.au> wrote:
> On Fri, Feb 24, 2017 at 09:34:32AM +1100, Richard Henderson wrote:
>> On 02/23/2017 05:40 PM, Nikunj A Dadhania wrote:
>> > Richard Henderson <rth@twiddle.net> writes:
>> > > These functions are becoming quite large.  Are they performance critical enough
>> > > that they need to stay as inline code, or should they be moved to helpers and
>> > > share code with cpu_read/write_xer?
>> >
>> > Just to boot to login prompt, these are the numbers for gen_read/write_xer:
>> >
>> > helper_myprint - rd_count 231103, wr_count 68897
>> >
>> > And it keeps on incrementing, maybe scope of optimization here.
>>
>> That's not very large considering the total number of instructions executed
>> during a boot to prompt.
>>
>> Thoughts, David?
>
> Hm, I'm not clear if that's the number of executions, or the number of
> translations.

That is the number of executions.

Regards
Nikunj
David Gibson Feb. 24, 2017, 4:50 a.m. UTC | #13
On Fri, Feb 24, 2017 at 06:11:30AM +0530, Nikunj Dadhania wrote:
> On 24 February 2017 at 04:23, David Gibson <david@gibson.dropbear.id.au> wrote:
> > On Fri, Feb 24, 2017 at 09:34:32AM +1100, Richard Henderson wrote:
> >> On 02/23/2017 05:40 PM, Nikunj A Dadhania wrote:
> >> > Richard Henderson <rth@twiddle.net> writes:
> >> > > These functions are becoming quite large.  Are they performance critical enough
> >> > > that they need to stay as inline code, or should they be moved to helpers and
> >> > > share code with cpu_read/write_xer?
> >> >
> >> > Just to boot to login prompt, these are the numbers for gen_read/write_xer:
> >> >
> >> > helper_myprint - rd_count 231103, wr_count 68897
> >> >
> >> > And it keeps on incrementing, maybe scope of optimization here.
> >>
> >> That's not very large considering the total number of instructions executed
> >> during a boot to prompt.
> >>
> >> Thoughts, David?
> >
> > Hm, I'm not clear if that's the number of executions, or the number of
> > translations.
> 
> That is number of executions.

Ok, I guess that's not that big, then.  I guess moving them into
helpers would make sense.

Although I guess they'd shrink right down again if we put an
env->xer_mask in.  Thoughts on that option Richard?
Richard Henderson Feb. 24, 2017, 6:30 a.m. UTC | #14
On 02/24/2017 03:50 PM, David Gibson wrote:
> Although I guess they'd shrink right down again if we put an
> env->xer_mask in.  Thoughts on that option Richard?

Why would xer_mask shrink the code?  I can't see that we'd be able to eliminate 
any code using the mask.


r~
David Gibson Feb. 27, 2017, 1:39 a.m. UTC | #15
On Fri, Feb 24, 2017 at 05:30:23PM +1100, Richard Henderson wrote:
> On 02/24/2017 03:50 PM, David Gibson wrote:
> > Although I guess they'd shrink right down again if we put an
> > env->xer_mask in.  Thoughts on that option Richard?
> 
> Why would xer_mask shrink the code?  I can't see that we'd be able to
> eliminate any code using the mask.

Uh.. I think I was thinking about the qemu code, not the generated
code.  It means we could unconditionally AND with the xer_mask in some
places, rather than having fiddly conditionals.
diff mbox

Patch

diff --git a/target/ppc/cpu.c b/target/ppc/cpu.c
index de3004b..89c1ccb 100644
--- a/target/ppc/cpu.c
+++ b/target/ppc/cpu.c
@@ -23,8 +23,15 @@ 
 
 target_ulong cpu_read_xer(CPUPPCState *env)
 {
-    return env->xer | (env->so << XER_SO) | (env->ov << XER_OV) |
+    target_ulong xer;
+
+    xer = env->xer | (env->so << XER_SO) | (env->ov << XER_OV) |
         (env->ca << XER_CA);
+
+    if (is_isa300(env)) {
+        xer |= (env->ov32 << XER_OV32) | (env->ca32 << XER_CA32);
+    }
+    return xer;
 }
 
 void cpu_write_xer(CPUPPCState *env, target_ulong xer)
@@ -32,5 +39,13 @@  void cpu_write_xer(CPUPPCState *env, target_ulong xer)
     env->so = (xer >> XER_SO) & 1;
     env->ov = (xer >> XER_OV) & 1;
     env->ca = (xer >> XER_CA) & 1;
-    env->xer = xer & ~((1u << XER_SO) | (1u << XER_OV) | (1u << XER_CA));
+    if (is_isa300(env)) {
+        env->ov32 = (xer >> XER_OV32) & 1;
+        env->ca32 = (xer >> XER_CA32) & 1;
+        env->xer = xer & ~((1ul << XER_SO) |
+                           (1ul << XER_OV) | (1ul << XER_CA) |
+                           (1ul << XER_OV32) | (1ul << XER_CA32));
+    } else {
+        env->xer = xer & ~((1u << XER_SO) | (1u << XER_OV) | (1u << XER_CA));
+    }
 }
diff --git a/target/ppc/cpu.h b/target/ppc/cpu.h
index b559b67..ee2eb45 100644
--- a/target/ppc/cpu.h
+++ b/target/ppc/cpu.h
@@ -965,6 +965,8 @@  struct CPUPPCState {
     target_ulong so;
     target_ulong ov;
     target_ulong ca;
+    target_ulong ov32;
+    target_ulong ca32;
     /* Reservation address */
     target_ulong reserve_addr;
     /* Reservation value */
@@ -1372,11 +1374,15 @@  int ppc_compat_max_threads(PowerPCCPU *cpu);
 #define XER_SO  31
 #define XER_OV  30
 #define XER_CA  29
+#define XER_OV32  19
+#define XER_CA32  18
 #define XER_CMP  8
 #define XER_BC   0
 #define xer_so  (env->so)
 #define xer_ov  (env->ov)
 #define xer_ca  (env->ca)
+#define xer_ov32  (env->ov)
+#define xer_ca32  (env->ca)
 #define xer_cmp ((env->xer >> XER_CMP) & 0xFF)
 #define xer_bc  ((env->xer >> XER_BC)  & 0x7F)
 
@@ -2343,6 +2349,7 @@  enum {
 
 /*****************************************************************************/
 
+#define is_isa300(ctx) (!!(ctx->insns_flags2 & PPC2_ISA300))
 target_ulong cpu_read_xer(CPUPPCState *env);
 void cpu_write_xer(CPUPPCState *env, target_ulong xer);
 
diff --git a/target/ppc/translate.c b/target/ppc/translate.c
index b09e16f..c9f6768 100644
--- a/target/ppc/translate.c
+++ b/target/ppc/translate.c
@@ -71,7 +71,7 @@  static TCGv cpu_lr;
 #if defined(TARGET_PPC64)
 static TCGv cpu_cfar;
 #endif
-static TCGv cpu_xer, cpu_so, cpu_ov, cpu_ca;
+static TCGv cpu_xer, cpu_so, cpu_ov, cpu_ca, cpu_ov32, cpu_ca32;
 static TCGv cpu_reserve;
 static TCGv cpu_fpscr;
 static TCGv_i32 cpu_access_type;
@@ -173,6 +173,10 @@  void ppc_translate_init(void)
                                 offsetof(CPUPPCState, ov), "OV");
     cpu_ca = tcg_global_mem_new(cpu_env,
                                 offsetof(CPUPPCState, ca), "CA");
+    cpu_ov32 = tcg_global_mem_new(cpu_env,
+                                  offsetof(CPUPPCState, ov32), "OV32");
+    cpu_ca32 = tcg_global_mem_new(cpu_env,
+                                  offsetof(CPUPPCState, ca32), "CA32");
 
     cpu_reserve = tcg_global_mem_new(cpu_env,
                                      offsetof(CPUPPCState, reserve_addr),
@@ -3703,7 +3707,7 @@  static void gen_tdi(DisasContext *ctx)
 
 /***                          Processor control                            ***/
 
-static void gen_read_xer(TCGv dst)
+static void gen_read_xer(DisasContext *ctx, TCGv dst)
 {
     TCGv t0 = tcg_temp_new();
     TCGv t1 = tcg_temp_new();
@@ -3715,15 +3719,30 @@  static void gen_read_xer(TCGv dst)
     tcg_gen_or_tl(t0, t0, t1);
     tcg_gen_or_tl(dst, dst, t2);
     tcg_gen_or_tl(dst, dst, t0);
+    if (is_isa300(ctx)) {
+        tcg_gen_shli_tl(t0, cpu_ov32, XER_OV32);
+        tcg_gen_or_tl(dst, dst, t0);
+        tcg_gen_shli_tl(t0, cpu_ca32, XER_CA32);
+        tcg_gen_or_tl(dst, dst, t0);
+    }
     tcg_temp_free(t0);
     tcg_temp_free(t1);
     tcg_temp_free(t2);
 }
 
-static void gen_write_xer(TCGv src)
+static void gen_write_xer(DisasContext *ctx, TCGv src)
 {
-    tcg_gen_andi_tl(cpu_xer, src,
-                    ~((1u << XER_SO) | (1u << XER_OV) | (1u << XER_CA)));
+    if (is_isa300(ctx)) {
+        tcg_gen_andi_tl(cpu_xer, src,
+                        ~((1u << XER_SO) |
+                          (1u << XER_OV) | (1u << XER_OV32) |
+                          (1u << XER_CA) | (1u << XER_CA32)));
+        tcg_gen_extract_tl(cpu_ov32, src, XER_OV32, 1);
+        tcg_gen_extract_tl(cpu_ca32, src, XER_CA32, 1);
+    } else {
+        tcg_gen_andi_tl(cpu_xer, src,
+                        ~((1u << XER_SO) | (1u << XER_OV) | (1u << XER_CA)));
+    }
     tcg_gen_extract_tl(cpu_so, src, XER_SO, 1);
     tcg_gen_extract_tl(cpu_ov, src, XER_OV, 1);
     tcg_gen_extract_tl(cpu_ca, src, XER_CA, 1);
diff --git a/target/ppc/translate_init.c b/target/ppc/translate_init.c
index be35cbd..eb667bb 100644
--- a/target/ppc/translate_init.c
+++ b/target/ppc/translate_init.c
@@ -107,12 +107,12 @@  static void spr_access_nop(DisasContext *ctx, int sprn, int gprn)
 /* XER */
 static void spr_read_xer (DisasContext *ctx, int gprn, int sprn)
 {
-    gen_read_xer(cpu_gpr[gprn]);
+    gen_read_xer(ctx, cpu_gpr[gprn]);
 }
 
 static void spr_write_xer (DisasContext *ctx, int sprn, int gprn)
 {
-    gen_write_xer(cpu_gpr[gprn]);
+    gen_write_xer(ctx, cpu_gpr[gprn]);
 }
 
 /* LR */