Message ID | 20240215192823.729209-3-max.chou@sifive.com (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
Series | Improve the performance of RISC-V vector unit-stride ld/st instructions | expand |
On 2/15/24 09:28, Max Chou wrote: > If there are not any QEMU plugin memory callback functions, checking > before calling the qemu_plugin_vcpu_mem_cb function can reduce the > function call overhead. > > Signed-off-by: Max Chou <max.chou@sifive.com> > --- > accel/tcg/ldst_common.c.inc | 40 +++++++++++++++++++++++++++---------- > 1 file changed, 30 insertions(+), 10 deletions(-) > > diff --git a/accel/tcg/ldst_common.c.inc b/accel/tcg/ldst_common.c.inc > index c82048e377e..bf24986c562 100644 > --- a/accel/tcg/ldst_common.c.inc > +++ b/accel/tcg/ldst_common.c.inc > @@ -134,7 +134,9 @@ uint8_t cpu_ldb_mmu(CPUArchState *env, abi_ptr addr, MemOpIdx oi, uintptr_t ra) > > tcg_debug_assert((get_memop(oi) & MO_SIZE) == MO_UB); > ret = do_ld1_mmu(env_cpu(env), addr, oi, ra, MMU_DATA_LOAD); > - plugin_load_cb(env, addr, oi); > + if (cpu_plugin_mem_cbs_enabled(env_cpu(env))) { > + plugin_load_cb(env, addr, oi); > + } > return ret; > } Rather than repeating N times, modify plugin_load_cb and plugin_store_cb just above to avoid the call to cpu_plugin_mem_cbs_enabled(). I expect the compiler is inlining those functions already. r~
On 2/15/24 16:28, Max Chou wrote: > If there are not any QEMU plugin memory callback functions, checking > before calling the qemu_plugin_vcpu_mem_cb function can reduce the > function call overhead. > > Signed-off-by: Max Chou <max.chou@sifive.com> > --- This was in my TODO list for some time. Thanks for looking it up. Can't we avoid all callbacks, not just qemu_plugin_vcpu_mem_cb, if there's no plugin loaded? The performance increase when building with --disable-plugins shouldn't be a thing - if the user isn't using plug-ins it should have a penalty to it. Thanks, Daniel > accel/tcg/ldst_common.c.inc | 40 +++++++++++++++++++++++++++---------- > 1 file changed, 30 insertions(+), 10 deletions(-) > > diff --git a/accel/tcg/ldst_common.c.inc b/accel/tcg/ldst_common.c.inc > index c82048e377e..bf24986c562 100644 > --- a/accel/tcg/ldst_common.c.inc > +++ b/accel/tcg/ldst_common.c.inc > @@ -134,7 +134,9 @@ uint8_t cpu_ldb_mmu(CPUArchState *env, abi_ptr addr, MemOpIdx oi, uintptr_t ra) > > tcg_debug_assert((get_memop(oi) & MO_SIZE) == MO_UB); > ret = do_ld1_mmu(env_cpu(env), addr, oi, ra, MMU_DATA_LOAD); > - plugin_load_cb(env, addr, oi); > + if (cpu_plugin_mem_cbs_enabled(env_cpu(env))) { > + plugin_load_cb(env, addr, oi); > + } > return ret; > } > > @@ -145,7 +147,9 @@ uint16_t cpu_ldw_mmu(CPUArchState *env, abi_ptr addr, > > tcg_debug_assert((get_memop(oi) & MO_SIZE) == MO_16); > ret = do_ld2_mmu(env_cpu(env), addr, oi, ra, MMU_DATA_LOAD); > - plugin_load_cb(env, addr, oi); > + if (cpu_plugin_mem_cbs_enabled(env_cpu(env))) { > + plugin_load_cb(env, addr, oi); > + } > return ret; > } > > @@ -156,7 +160,9 @@ uint32_t cpu_ldl_mmu(CPUArchState *env, abi_ptr addr, > > tcg_debug_assert((get_memop(oi) & MO_SIZE) == MO_32); > ret = do_ld4_mmu(env_cpu(env), addr, oi, ra, MMU_DATA_LOAD); > - plugin_load_cb(env, addr, oi); > + if (cpu_plugin_mem_cbs_enabled(env_cpu(env))) { > + plugin_load_cb(env, addr, oi); > + } > return ret; > } > > @@ -167,7 +173,9 @@ uint64_t cpu_ldq_mmu(CPUArchState *env, abi_ptr addr, > > tcg_debug_assert((get_memop(oi) & MO_SIZE) == MO_64); > ret = do_ld8_mmu(env_cpu(env), addr, oi, ra, MMU_DATA_LOAD); > - plugin_load_cb(env, addr, oi); > + if (cpu_plugin_mem_cbs_enabled(env_cpu(env))) { > + plugin_load_cb(env, addr, oi); > + } > return ret; > } > > @@ -178,7 +186,9 @@ Int128 cpu_ld16_mmu(CPUArchState *env, abi_ptr addr, > > tcg_debug_assert((get_memop(oi) & MO_SIZE) == MO_128); > ret = do_ld16_mmu(env_cpu(env), addr, oi, ra); > - plugin_load_cb(env, addr, oi); > + if (cpu_plugin_mem_cbs_enabled(env_cpu(env))) { > + plugin_load_cb(env, addr, oi); > + } > return ret; > } > > @@ -195,7 +205,9 @@ void cpu_stb_mmu(CPUArchState *env, abi_ptr addr, uint8_t val, > MemOpIdx oi, uintptr_t retaddr) > { > helper_stb_mmu(env, addr, val, oi, retaddr); > - plugin_store_cb(env, addr, oi); > + if (cpu_plugin_mem_cbs_enabled(env_cpu(env))) { > + plugin_store_cb(env, addr, oi); > + } > } > > void cpu_stw_mmu(CPUArchState *env, abi_ptr addr, uint16_t val, > @@ -203,7 +215,9 @@ void cpu_stw_mmu(CPUArchState *env, abi_ptr addr, uint16_t val, > { > tcg_debug_assert((get_memop(oi) & MO_SIZE) == MO_16); > do_st2_mmu(env_cpu(env), addr, val, oi, retaddr); > - plugin_store_cb(env, addr, oi); > + if (cpu_plugin_mem_cbs_enabled(env_cpu(env))) { > + plugin_store_cb(env, addr, oi); > + } > } > > void cpu_stl_mmu(CPUArchState *env, abi_ptr addr, uint32_t val, > @@ -211,7 +225,9 @@ void cpu_stl_mmu(CPUArchState *env, abi_ptr addr, uint32_t val, > { > tcg_debug_assert((get_memop(oi) & MO_SIZE) == MO_32); > do_st4_mmu(env_cpu(env), addr, val, oi, retaddr); > - plugin_store_cb(env, addr, oi); > + if (cpu_plugin_mem_cbs_enabled(env_cpu(env))) { > + plugin_store_cb(env, addr, oi); > + } > } > > void cpu_stq_mmu(CPUArchState *env, abi_ptr addr, uint64_t val, > @@ -219,7 +235,9 @@ void cpu_stq_mmu(CPUArchState *env, abi_ptr addr, uint64_t val, > { > tcg_debug_assert((get_memop(oi) & MO_SIZE) == MO_64); > do_st8_mmu(env_cpu(env), addr, val, oi, retaddr); > - plugin_store_cb(env, addr, oi); > + if (cpu_plugin_mem_cbs_enabled(env_cpu(env))) { > + plugin_store_cb(env, addr, oi); > + } > } > > void cpu_st16_mmu(CPUArchState *env, abi_ptr addr, Int128 val, > @@ -227,7 +245,9 @@ void cpu_st16_mmu(CPUArchState *env, abi_ptr addr, Int128 val, > { > tcg_debug_assert((get_memop(oi) & MO_SIZE) == MO_128); > do_st16_mmu(env_cpu(env), addr, val, oi, retaddr); > - plugin_store_cb(env, addr, oi); > + if (cpu_plugin_mem_cbs_enabled(env_cpu(env))) { > + plugin_store_cb(env, addr, oi); > + } > } > > /*
Hi Richard, Thank you for the suggestion. I'll do a v2 with this. Thanks, Max On 2024/2/16 4:03 AM, Richard Henderson wrote: > On 2/15/24 09:28, Max Chou wrote: >> If there are not any QEMU plugin memory callback functions, checking >> before calling the qemu_plugin_vcpu_mem_cb function can reduce the >> function call overhead. >> >> Signed-off-by: Max Chou <max.chou@sifive.com> >> --- >> accel/tcg/ldst_common.c.inc | 40 +++++++++++++++++++++++++++---------- >> 1 file changed, 30 insertions(+), 10 deletions(-) >> >> diff --git a/accel/tcg/ldst_common.c.inc b/accel/tcg/ldst_common.c.inc >> index c82048e377e..bf24986c562 100644 >> --- a/accel/tcg/ldst_common.c.inc >> +++ b/accel/tcg/ldst_common.c.inc >> @@ -134,7 +134,9 @@ uint8_t cpu_ldb_mmu(CPUArchState *env, abi_ptr >> addr, MemOpIdx oi, uintptr_t ra) >> tcg_debug_assert((get_memop(oi) & MO_SIZE) == MO_UB); >> ret = do_ld1_mmu(env_cpu(env), addr, oi, ra, MMU_DATA_LOAD); >> - plugin_load_cb(env, addr, oi); >> + if (cpu_plugin_mem_cbs_enabled(env_cpu(env))) { >> + plugin_load_cb(env, addr, oi); >> + } >> return ret; >> } > > Rather than repeating N times, modify plugin_load_cb and > plugin_store_cb just above to avoid the call to > cpu_plugin_mem_cbs_enabled(). I expect the compiler is inlining those > functions already. > > > r~
Hi Daniel, I think the the overhead from the plugin callbacks depending on the type of plugin callbacks. Because there is only qemu_plugin_vcpu_mem_cb in the perf report of the glibc memcpy benchtest, so I just looked it up for the the memory callbacks in this RFC. I'll try to get more experiment results to check the status of other plugin callbacks. Thanks, Max On 2024/2/16 4:21 AM, Daniel Henrique Barboza wrote: > > > On 2/15/24 16:28, Max Chou wrote: >> If there are not any QEMU plugin memory callback functions, checking >> before calling the qemu_plugin_vcpu_mem_cb function can reduce the >> function call overhead. >> >> Signed-off-by: Max Chou <max.chou@sifive.com> >> --- > > This was in my TODO list for some time. Thanks for looking it up. > > Can't we avoid all callbacks, not just qemu_plugin_vcpu_mem_cb, if > there's no > plugin loaded? The performance increase when building with > --disable-plugins > shouldn't be a thing - if the user isn't using plug-ins it should have a > penalty to it. > > Thanks, > > Daniel > >> accel/tcg/ldst_common.c.inc | 40 +++++++++++++++++++++++++++---------- >> 1 file changed, 30 insertions(+), 10 deletions(-) >> >> diff --git a/accel/tcg/ldst_common.c.inc b/accel/tcg/ldst_common.c.inc >> index c82048e377e..bf24986c562 100644 >> --- a/accel/tcg/ldst_common.c.inc >> +++ b/accel/tcg/ldst_common.c.inc >> @@ -134,7 +134,9 @@ uint8_t cpu_ldb_mmu(CPUArchState *env, abi_ptr >> addr, MemOpIdx oi, uintptr_t ra) >> tcg_debug_assert((get_memop(oi) & MO_SIZE) == MO_UB); >> ret = do_ld1_mmu(env_cpu(env), addr, oi, ra, MMU_DATA_LOAD); >> - plugin_load_cb(env, addr, oi); >> + if (cpu_plugin_mem_cbs_enabled(env_cpu(env))) { >> + plugin_load_cb(env, addr, oi); >> + } >> return ret; >> } >> @@ -145,7 +147,9 @@ uint16_t cpu_ldw_mmu(CPUArchState *env, abi_ptr >> addr, >> tcg_debug_assert((get_memop(oi) & MO_SIZE) == MO_16); >> ret = do_ld2_mmu(env_cpu(env), addr, oi, ra, MMU_DATA_LOAD); >> - plugin_load_cb(env, addr, oi); >> + if (cpu_plugin_mem_cbs_enabled(env_cpu(env))) { >> + plugin_load_cb(env, addr, oi); >> + } >> return ret; >> } >> @@ -156,7 +160,9 @@ uint32_t cpu_ldl_mmu(CPUArchState *env, abi_ptr >> addr, >> tcg_debug_assert((get_memop(oi) & MO_SIZE) == MO_32); >> ret = do_ld4_mmu(env_cpu(env), addr, oi, ra, MMU_DATA_LOAD); >> - plugin_load_cb(env, addr, oi); >> + if (cpu_plugin_mem_cbs_enabled(env_cpu(env))) { >> + plugin_load_cb(env, addr, oi); >> + } >> return ret; >> } >> @@ -167,7 +173,9 @@ uint64_t cpu_ldq_mmu(CPUArchState *env, abi_ptr >> addr, >> tcg_debug_assert((get_memop(oi) & MO_SIZE) == MO_64); >> ret = do_ld8_mmu(env_cpu(env), addr, oi, ra, MMU_DATA_LOAD); >> - plugin_load_cb(env, addr, oi); >> + if (cpu_plugin_mem_cbs_enabled(env_cpu(env))) { >> + plugin_load_cb(env, addr, oi); >> + } >> return ret; >> } >> @@ -178,7 +186,9 @@ Int128 cpu_ld16_mmu(CPUArchState *env, abi_ptr >> addr, >> tcg_debug_assert((get_memop(oi) & MO_SIZE) == MO_128); >> ret = do_ld16_mmu(env_cpu(env), addr, oi, ra); >> - plugin_load_cb(env, addr, oi); >> + if (cpu_plugin_mem_cbs_enabled(env_cpu(env))) { >> + plugin_load_cb(env, addr, oi); >> + } >> return ret; >> } >> @@ -195,7 +205,9 @@ void cpu_stb_mmu(CPUArchState *env, abi_ptr >> addr, uint8_t val, >> MemOpIdx oi, uintptr_t retaddr) >> { >> helper_stb_mmu(env, addr, val, oi, retaddr); >> - plugin_store_cb(env, addr, oi); >> + if (cpu_plugin_mem_cbs_enabled(env_cpu(env))) { >> + plugin_store_cb(env, addr, oi); >> + } >> } >> void cpu_stw_mmu(CPUArchState *env, abi_ptr addr, uint16_t val, >> @@ -203,7 +215,9 @@ void cpu_stw_mmu(CPUArchState *env, abi_ptr addr, >> uint16_t val, >> { >> tcg_debug_assert((get_memop(oi) & MO_SIZE) == MO_16); >> do_st2_mmu(env_cpu(env), addr, val, oi, retaddr); >> - plugin_store_cb(env, addr, oi); >> + if (cpu_plugin_mem_cbs_enabled(env_cpu(env))) { >> + plugin_store_cb(env, addr, oi); >> + } >> } >> void cpu_stl_mmu(CPUArchState *env, abi_ptr addr, uint32_t val, >> @@ -211,7 +225,9 @@ void cpu_stl_mmu(CPUArchState *env, abi_ptr addr, >> uint32_t val, >> { >> tcg_debug_assert((get_memop(oi) & MO_SIZE) == MO_32); >> do_st4_mmu(env_cpu(env), addr, val, oi, retaddr); >> - plugin_store_cb(env, addr, oi); >> + if (cpu_plugin_mem_cbs_enabled(env_cpu(env))) { >> + plugin_store_cb(env, addr, oi); >> + } >> } >> void cpu_stq_mmu(CPUArchState *env, abi_ptr addr, uint64_t val, >> @@ -219,7 +235,9 @@ void cpu_stq_mmu(CPUArchState *env, abi_ptr addr, >> uint64_t val, >> { >> tcg_debug_assert((get_memop(oi) & MO_SIZE) == MO_64); >> do_st8_mmu(env_cpu(env), addr, val, oi, retaddr); >> - plugin_store_cb(env, addr, oi); >> + if (cpu_plugin_mem_cbs_enabled(env_cpu(env))) { >> + plugin_store_cb(env, addr, oi); >> + } >> } >> void cpu_st16_mmu(CPUArchState *env, abi_ptr addr, Int128 val, >> @@ -227,7 +245,9 @@ void cpu_st16_mmu(CPUArchState *env, abi_ptr >> addr, Int128 val, >> { >> tcg_debug_assert((get_memop(oi) & MO_SIZE) == MO_128); >> do_st16_mmu(env_cpu(env), addr, val, oi, retaddr); >> - plugin_store_cb(env, addr, oi); >> + if (cpu_plugin_mem_cbs_enabled(env_cpu(env))) { >> + plugin_store_cb(env, addr, oi); >> + } >> } >> /*
diff --git a/accel/tcg/ldst_common.c.inc b/accel/tcg/ldst_common.c.inc index c82048e377e..bf24986c562 100644 --- a/accel/tcg/ldst_common.c.inc +++ b/accel/tcg/ldst_common.c.inc @@ -134,7 +134,9 @@ uint8_t cpu_ldb_mmu(CPUArchState *env, abi_ptr addr, MemOpIdx oi, uintptr_t ra) tcg_debug_assert((get_memop(oi) & MO_SIZE) == MO_UB); ret = do_ld1_mmu(env_cpu(env), addr, oi, ra, MMU_DATA_LOAD); - plugin_load_cb(env, addr, oi); + if (cpu_plugin_mem_cbs_enabled(env_cpu(env))) { + plugin_load_cb(env, addr, oi); + } return ret; } @@ -145,7 +147,9 @@ uint16_t cpu_ldw_mmu(CPUArchState *env, abi_ptr addr, tcg_debug_assert((get_memop(oi) & MO_SIZE) == MO_16); ret = do_ld2_mmu(env_cpu(env), addr, oi, ra, MMU_DATA_LOAD); - plugin_load_cb(env, addr, oi); + if (cpu_plugin_mem_cbs_enabled(env_cpu(env))) { + plugin_load_cb(env, addr, oi); + } return ret; } @@ -156,7 +160,9 @@ uint32_t cpu_ldl_mmu(CPUArchState *env, abi_ptr addr, tcg_debug_assert((get_memop(oi) & MO_SIZE) == MO_32); ret = do_ld4_mmu(env_cpu(env), addr, oi, ra, MMU_DATA_LOAD); - plugin_load_cb(env, addr, oi); + if (cpu_plugin_mem_cbs_enabled(env_cpu(env))) { + plugin_load_cb(env, addr, oi); + } return ret; } @@ -167,7 +173,9 @@ uint64_t cpu_ldq_mmu(CPUArchState *env, abi_ptr addr, tcg_debug_assert((get_memop(oi) & MO_SIZE) == MO_64); ret = do_ld8_mmu(env_cpu(env), addr, oi, ra, MMU_DATA_LOAD); - plugin_load_cb(env, addr, oi); + if (cpu_plugin_mem_cbs_enabled(env_cpu(env))) { + plugin_load_cb(env, addr, oi); + } return ret; } @@ -178,7 +186,9 @@ Int128 cpu_ld16_mmu(CPUArchState *env, abi_ptr addr, tcg_debug_assert((get_memop(oi) & MO_SIZE) == MO_128); ret = do_ld16_mmu(env_cpu(env), addr, oi, ra); - plugin_load_cb(env, addr, oi); + if (cpu_plugin_mem_cbs_enabled(env_cpu(env))) { + plugin_load_cb(env, addr, oi); + } return ret; } @@ -195,7 +205,9 @@ void cpu_stb_mmu(CPUArchState *env, abi_ptr addr, uint8_t val, MemOpIdx oi, uintptr_t retaddr) { helper_stb_mmu(env, addr, val, oi, retaddr); - plugin_store_cb(env, addr, oi); + if (cpu_plugin_mem_cbs_enabled(env_cpu(env))) { + plugin_store_cb(env, addr, oi); + } } void cpu_stw_mmu(CPUArchState *env, abi_ptr addr, uint16_t val, @@ -203,7 +215,9 @@ void cpu_stw_mmu(CPUArchState *env, abi_ptr addr, uint16_t val, { tcg_debug_assert((get_memop(oi) & MO_SIZE) == MO_16); do_st2_mmu(env_cpu(env), addr, val, oi, retaddr); - plugin_store_cb(env, addr, oi); + if (cpu_plugin_mem_cbs_enabled(env_cpu(env))) { + plugin_store_cb(env, addr, oi); + } } void cpu_stl_mmu(CPUArchState *env, abi_ptr addr, uint32_t val, @@ -211,7 +225,9 @@ void cpu_stl_mmu(CPUArchState *env, abi_ptr addr, uint32_t val, { tcg_debug_assert((get_memop(oi) & MO_SIZE) == MO_32); do_st4_mmu(env_cpu(env), addr, val, oi, retaddr); - plugin_store_cb(env, addr, oi); + if (cpu_plugin_mem_cbs_enabled(env_cpu(env))) { + plugin_store_cb(env, addr, oi); + } } void cpu_stq_mmu(CPUArchState *env, abi_ptr addr, uint64_t val, @@ -219,7 +235,9 @@ void cpu_stq_mmu(CPUArchState *env, abi_ptr addr, uint64_t val, { tcg_debug_assert((get_memop(oi) & MO_SIZE) == MO_64); do_st8_mmu(env_cpu(env), addr, val, oi, retaddr); - plugin_store_cb(env, addr, oi); + if (cpu_plugin_mem_cbs_enabled(env_cpu(env))) { + plugin_store_cb(env, addr, oi); + } } void cpu_st16_mmu(CPUArchState *env, abi_ptr addr, Int128 val, @@ -227,7 +245,9 @@ void cpu_st16_mmu(CPUArchState *env, abi_ptr addr, Int128 val, { tcg_debug_assert((get_memop(oi) & MO_SIZE) == MO_128); do_st16_mmu(env_cpu(env), addr, val, oi, retaddr); - plugin_store_cb(env, addr, oi); + if (cpu_plugin_mem_cbs_enabled(env_cpu(env))) { + plugin_store_cb(env, addr, oi); + } } /*
If there are not any QEMU plugin memory callback functions, checking before calling the qemu_plugin_vcpu_mem_cb function can reduce the function call overhead. Signed-off-by: Max Chou <max.chou@sifive.com> --- accel/tcg/ldst_common.c.inc | 40 +++++++++++++++++++++++++++---------- 1 file changed, 30 insertions(+), 10 deletions(-)