diff mbox series

[RFC,v5,2/5] target/riscv: rvv: Provide a fast path using direct access to host ram for unmasked unit-stride load/store

Message ID 20240717133936.713642-3-max.chou@sifive.com (mailing list archive)
State New, archived
Headers show
Series Improve the performance of RISC-V vector unit-stride/whole register ld/st instructions | expand

Commit Message

Max Chou July 17, 2024, 1:39 p.m. UTC
This commit references the sve_ldN_r/sve_stN_r helper functions in ARM
target to optimize the vector unmasked unit-stride load/store
implementation with following optimizations:

* Get the page boundary
* Probing pages/resolving host memory address at the beginning if
  possible
* Provide new interface to direct access host memory
* Switch to the original slow TLB access when cross page element/violate
  page permission/violate pmp/watchpoints in page

The original element load/store interface is replaced by the new element
load/store functions with _tlb & _host postfix that means doing the
element load/store through the original softmmu flow and the direct
access host memory flow.

Signed-off-by: Max Chou <max.chou@sifive.com>
---
 target/riscv/vector_helper.c | 361 +++++++++++++++++++++--------------
 1 file changed, 220 insertions(+), 141 deletions(-)

Comments

Richard Henderson July 25, 2024, 5:51 a.m. UTC | #1
On 7/17/24 23:39, Max Chou wrote:
> @@ -199,7 +212,7 @@ static void
>   vext_ldst_stride(void *vd, void *v0, target_ulong base,
>                    target_ulong stride, CPURISCVState *env,
>                    uint32_t desc, uint32_t vm,
> -                 vext_ldst_elem_fn *ldst_elem,
> +                 vext_ldst_elem_fn_tlb * ldst_elem,

Extra space: "type *var"

>                    uint32_t log2_esz, uintptr_t ra)
>   {
>       uint32_t i, k;
> @@ -221,7 +234,8 @@ vext_ldst_stride(void *vd, void *v0, target_ulong base,
>                   continue;
>               }
>               target_ulong addr = base + stride * i + (k << log2_esz);
> -            ldst_elem(env, adjust_addr(env, addr), i + k * max_elems, vd, ra);
> +            ldst_elem(env, adjust_addr(env, addr),
> +                      (i + k * max_elems) << log2_esz, vd, ra);

Is this some sort of bug fix?  It doesn't seem related...
If it is a bug fix, it should be a separate patch.

>   /*
>    * unit-stride: access elements stored contiguously in memory
>    */
>   
>   /* unmasked unit-stride load and store operation */
> +static void
> +vext_page_ldst_us(CPURISCVState *env, void *vd, target_ulong addr,
> +                  uint32_t elems, uint32_t nf, uint32_t max_elems,
> +                  uint32_t log2_esz, bool is_load,
> +                  vext_ldst_elem_fn_tlb *ldst_tlb,
> +                  vext_ldst_elem_fn_host *ldst_host, uintptr_t ra)
> +{
> +    void *host;
> +    int i, k, flags;
> +    uint32_t esz = 1 << log2_esz;
> +    uint32_t size = (elems * nf) << log2_esz;
> +    uint32_t evl = env->vstart + elems;
> +    int mmu_index = riscv_env_mmu_index(env, false);
> +    MMUAccessType access_type = is_load ? MMU_DATA_LOAD : MMU_DATA_STORE;

You may want to pass in mmu_index, so that it is computed once in the caller.

> +
> +    /* Check page permission/pmp/watchpoint/etc. */
> +    flags = probe_access_flags(env, adjust_addr(env, addr), size, access_type,
> +                               mmu_index, true, &host, ra);
> +
> +    if (host && flags == 0) {

If flags == 0, host will always be non-null.
You only need flags == 0.

>   static void
>   vext_ldst_us(void *vd, target_ulong base, CPURISCVState *env, uint32_t desc,
> -             vext_ldst_elem_fn *ldst_elem, uint32_t log2_esz, uint32_t evl,
> -             uintptr_t ra)
> +             vext_ldst_elem_fn_tlb *ldst_tlb,
> +             vext_ldst_elem_fn_host *ldst_host, uint32_t log2_esz,
> +             uint32_t evl, uintptr_t ra, bool is_load)
>   {
> -    uint32_t i, k;
> +    uint32_t k;
> +    target_ulong page_split, elems, addr;
>       uint32_t nf = vext_nf(desc);
>       uint32_t max_elems = vext_max_elems(desc, log2_esz);
>       uint32_t esz = 1 << log2_esz;
> +    uint32_t msize = nf * esz;
>   
>       VSTART_CHECK_EARLY_EXIT(env);
>   
> -    /* load bytes from guest memory */
> -    for (i = env->vstart; i < evl; env->vstart = ++i) {
> -        k = 0;
> -        while (k < nf) {
> -            target_ulong addr = base + ((i * nf + k) << log2_esz);
> -            ldst_elem(env, adjust_addr(env, addr), i + k * max_elems, vd, ra);
> -            k++;
> +    while (env->vstart < evl) {

VSTART_CHECK_EARLY_EXIT has taken care of this condition for the first page.
We know that one contiguous operation can only consume 1024 bytes, so cannot cross two 
pages.  Therefore this loop executes exactly once or twice.

I think it would be better to unroll this by hand:

     calc page range
     if (likely(elems)) {
         vext_page_ldst_us(... elems ...);
     }
     if (unlikely(env->vstart < evl)) {
         if (unlikely(page_split % msize)) {
            ...
         }
         vext_page_ldst_us(... evl - vstart ...);
     }

> +        /* Calculate page range */
> +        addr = base + ((env->vstart * nf) << log2_esz);
> +        page_split = -(addr | TARGET_PAGE_MASK);
> +        /* Get number of elements */
> +        elems = page_split / msize;
> +        if (unlikely(env->vstart + elems >= evl)) {
> +            elems = evl - env->vstart;
> +        }
> +
> +        /* Load/store elements in page */
> +        vext_page_ldst_us(env, vd, addr, elems, nf, max_elems, log2_esz,
> +                          is_load, ldst_tlb, ldst_host, ra);
> +
> +        /* Cross page element */
> +        if (unlikely((page_split % msize) != 0 && (env->vstart + 1) < evl)) {
> +            for (k = 0; k < nf; k++) {
> +                addr = base + ((env->vstart * nf + k) << log2_esz);
> +                ldst_tlb(env, adjust_addr(env, addr),
> +                         (k * max_elems + env->vstart) << log2_esz, vd, ra);
> +            }
> +            env->vstart++;
>           }
>       }
> -    env->vstart = 0;
>   
> +    env->vstart = 0;
>       vext_set_tail_elems_1s(evl, vd, desc, nf, esz, max_elems);
>   }


r~
Max Chou July 30, 2024, 2:24 p.m. UTC | #2
On 2024/7/25 1:51 PM, Richard Henderson wrote:
> On 7/17/24 23:39, Max Chou wrote:
>> @@ -199,7 +212,7 @@ static void
>>   vext_ldst_stride(void *vd, void *v0, target_ulong base,
>>                    target_ulong stride, CPURISCVState *env,
>>                    uint32_t desc, uint32_t vm,
>> -                 vext_ldst_elem_fn *ldst_elem,
>> +                 vext_ldst_elem_fn_tlb * ldst_elem,
>
> Extra space: "type *var"
I will fix this part at v6.
Thanks.

>
>>                    uint32_t log2_esz, uintptr_t ra)
>>   {
>>       uint32_t i, k;
>> @@ -221,7 +234,8 @@ vext_ldst_stride(void *vd, void *v0, target_ulong 
>> base,
>>                   continue;
>>               }
>>               target_ulong addr = base + stride * i + (k << log2_esz);
>> -            ldst_elem(env, adjust_addr(env, addr), i + k * 
>> max_elems, vd, ra);
>> +            ldst_elem(env, adjust_addr(env, addr),
>> +                      (i + k * max_elems) << log2_esz, vd, ra);
>
> Is this some sort of bug fix?  It doesn't seem related...
> If it is a bug fix, it should be a separate patch.
This modification is caused by replacing the idx by byte offset for the 
vector register accessing flow in the basic GEN_VEXT_LD/ST_ELEMENT macros.
But I think that it's not a good idea now, it will get unexpected result 
when the endian between host and guest are different.

I'll fix this part at v6.
>
>>   /*
>>    * unit-stride: access elements stored contiguously in memory
>>    */
>>     /* unmasked unit-stride load and store operation */
>> +static void
>> +vext_page_ldst_us(CPURISCVState *env, void *vd, target_ulong addr,
>> +                  uint32_t elems, uint32_t nf, uint32_t max_elems,
>> +                  uint32_t log2_esz, bool is_load,
>> +                  vext_ldst_elem_fn_tlb *ldst_tlb,
>> +                  vext_ldst_elem_fn_host *ldst_host, uintptr_t ra)
>> +{
>> +    void *host;
>> +    int i, k, flags;
>> +    uint32_t esz = 1 << log2_esz;
>> +    uint32_t size = (elems * nf) << log2_esz;
>> +    uint32_t evl = env->vstart + elems;
>> +    int mmu_index = riscv_env_mmu_index(env, false);
>> +    MMUAccessType access_type = is_load ? MMU_DATA_LOAD : 
>> MMU_DATA_STORE;
>
> You may want to pass in mmu_index, so that it is computed once in the 
> caller.
Thanks for the suggestion.
I'll try to pass in mmu_index at v6.

>
>> +
>> +    /* Check page permission/pmp/watchpoint/etc. */
>> +    flags = probe_access_flags(env, adjust_addr(env, addr), size, 
>> access_type,
>> +                               mmu_index, true, &host, ra);
>> +
>> +    if (host && flags == 0) {
>
> If flags == 0, host will always be non-null.
> You only need flags == 0.
Thanks for the suggestion.
I'll update this part at v6.

>
>>   static void
>>   vext_ldst_us(void *vd, target_ulong base, CPURISCVState *env, 
>> uint32_t desc,
>> -             vext_ldst_elem_fn *ldst_elem, uint32_t log2_esz, 
>> uint32_t evl,
>> -             uintptr_t ra)
>> +             vext_ldst_elem_fn_tlb *ldst_tlb,
>> +             vext_ldst_elem_fn_host *ldst_host, uint32_t log2_esz,
>> +             uint32_t evl, uintptr_t ra, bool is_load)
>>   {
>> -    uint32_t i, k;
>> +    uint32_t k;
>> +    target_ulong page_split, elems, addr;
>>       uint32_t nf = vext_nf(desc);
>>       uint32_t max_elems = vext_max_elems(desc, log2_esz);
>>       uint32_t esz = 1 << log2_esz;
>> +    uint32_t msize = nf * esz;
>>         VSTART_CHECK_EARLY_EXIT(env);
>>   -    /* load bytes from guest memory */
>> -    for (i = env->vstart; i < evl; env->vstart = ++i) {
>> -        k = 0;
>> -        while (k < nf) {
>> -            target_ulong addr = base + ((i * nf + k) << log2_esz);
>> -            ldst_elem(env, adjust_addr(env, addr), i + k * 
>> max_elems, vd, ra);
>> -            k++;
>> +    while (env->vstart < evl) {
>
> VSTART_CHECK_EARLY_EXIT has taken care of this condition for the first 
> page.
> We know that one contiguous operation can only consume 1024 bytes, so 
> cannot cross two pages.  Therefore this loop executes exactly once or 
> twice.
>
> I think it would be better to unroll this by hand:
>
>     calc page range
>     if (likely(elems)) {
>         vext_page_ldst_us(... elems ...);
>     }
>     if (unlikely(env->vstart < evl)) {
>         if (unlikely(page_split % msize)) {
>            ...
>         }
>         vext_page_ldst_us(... evl - vstart ...);
>     }
Yes, one contiguous operation can only consume 1024 bytes in vector 
unit-stride ld/st. It's a good idea to unroll it here.
I'll unroll this part at v6.
Thanks for the suggestion.

I'll try to extend the original loop implementation to other vector 
ld/st instructions that may cross multiple pages in the future.
>
>> +        /* Calculate page range */
>> +        addr = base + ((env->vstart * nf) << log2_esz);
>> +        page_split = -(addr | TARGET_PAGE_MASK);
>> +        /* Get number of elements */
>> +        elems = page_split / msize;
>> +        if (unlikely(env->vstart + elems >= evl)) {
>> +            elems = evl - env->vstart;
>> +        }
>> +
>> +        /* Load/store elements in page */
>> +        vext_page_ldst_us(env, vd, addr, elems, nf, max_elems, 
>> log2_esz,
>> +                          is_load, ldst_tlb, ldst_host, ra);
>> +
>> +        /* Cross page element */
>> +        if (unlikely((page_split % msize) != 0 && (env->vstart + 1) 
>> < evl)) {
>> +            for (k = 0; k < nf; k++) {
>> +                addr = base + ((env->vstart * nf + k) << log2_esz);
>> +                ldst_tlb(env, adjust_addr(env, addr),
>> +                         (k * max_elems + env->vstart) << log2_esz, 
>> vd, ra);
>> +            }
>> +            env->vstart++;
>>           }
>>       }
>> -    env->vstart = 0;
>>   +    env->vstart = 0;
>>       vext_set_tail_elems_1s(evl, vd, desc, nf, esz, max_elems);
>>   }
>
>
> r~
diff mbox series

Patch

diff --git a/target/riscv/vector_helper.c b/target/riscv/vector_helper.c
index 4d72eb74d3a..23396a1b750 100644
--- a/target/riscv/vector_helper.c
+++ b/target/riscv/vector_helper.c
@@ -146,34 +146,47 @@  static inline void vext_set_elem_mask(void *v0, int index,
 }
 
 /* elements operations for load and store */
-typedef void vext_ldst_elem_fn(CPURISCVState *env, abi_ptr addr,
-                               uint32_t idx, void *vd, uintptr_t retaddr);
+typedef void vext_ldst_elem_fn_tlb(CPURISCVState *env, abi_ptr addr,
+                                   uint32_t idx, void *vd, uintptr_t retaddr);
+typedef void vext_ldst_elem_fn_host(void *vd, uint32_t idx, void *host);
 
-#define GEN_VEXT_LD_ELEM(NAME, ETYPE, H, LDSUF)            \
-static void NAME(CPURISCVState *env, abi_ptr addr,         \
-                 uint32_t idx, void *vd, uintptr_t retaddr)\
-{                                                          \
-    ETYPE *cur = ((ETYPE *)vd + H(idx));                   \
-    *cur = cpu_##LDSUF##_data_ra(env, addr, retaddr);      \
-}                                                          \
-
-GEN_VEXT_LD_ELEM(lde_b, int8_t,  H1, ldsb)
-GEN_VEXT_LD_ELEM(lde_h, int16_t, H2, ldsw)
-GEN_VEXT_LD_ELEM(lde_w, int32_t, H4, ldl)
-GEN_VEXT_LD_ELEM(lde_d, int64_t, H8, ldq)
-
-#define GEN_VEXT_ST_ELEM(NAME, ETYPE, H, STSUF)            \
-static void NAME(CPURISCVState *env, abi_ptr addr,         \
-                 uint32_t idx, void *vd, uintptr_t retaddr)\
-{                                                          \
-    ETYPE data = *((ETYPE *)vd + H(idx));                  \
-    cpu_##STSUF##_data_ra(env, addr, data, retaddr);       \
+#define GEN_VEXT_LD_ELEM(NAME, ETYPE, H, LDSUF)                         \
+static void NAME##_tlb(CPURISCVState *env, abi_ptr addr,                \
+                       uint32_t byte_off, void *vd, uintptr_t retaddr)  \
+{                                                                       \
+    ETYPE *cur = vd + byte_off;                                         \
+    *cur = cpu_##LDSUF##_data_ra(env, addr, retaddr);                   \
+}                                                                       \
+                                                                        \
+static void NAME##_host(void *vd, uint32_t byte_off, void *host)        \
+{                                                                       \
+    ETYPE val = LDSUF##_p(host);                                        \
+    *(ETYPE *)(vd + byte_off) = val;                                    \
+}
+
+GEN_VEXT_LD_ELEM(lde_b, uint8_t,  H1, ldub)
+GEN_VEXT_LD_ELEM(lde_h, uint16_t, H2, lduw)
+GEN_VEXT_LD_ELEM(lde_w, uint32_t, H4, ldl)
+GEN_VEXT_LD_ELEM(lde_d, uint64_t, H8, ldq)
+
+#define GEN_VEXT_ST_ELEM(NAME, ETYPE, H, STSUF)                         \
+static void NAME##_tlb(CPURISCVState *env, abi_ptr addr,                \
+                       uint32_t byte_off, void *vd, uintptr_t retaddr)  \
+{                                                                       \
+    ETYPE data = *(ETYPE *)(vd + byte_off);                             \
+    cpu_##STSUF##_data_ra(env, addr, data, retaddr);                    \
+}                                                                       \
+                                                                        \
+static void NAME##_host(void *vd, uint32_t byte_off, void *host)        \
+{                                                                       \
+    ETYPE val = *(ETYPE *)(vd + byte_off);                              \
+    STSUF##_p(host, val);                                               \
 }
 
-GEN_VEXT_ST_ELEM(ste_b, int8_t,  H1, stb)
-GEN_VEXT_ST_ELEM(ste_h, int16_t, H2, stw)
-GEN_VEXT_ST_ELEM(ste_w, int32_t, H4, stl)
-GEN_VEXT_ST_ELEM(ste_d, int64_t, H8, stq)
+GEN_VEXT_ST_ELEM(ste_b, uint8_t,  H1, stb)
+GEN_VEXT_ST_ELEM(ste_h, uint16_t, H2, stw)
+GEN_VEXT_ST_ELEM(ste_w, uint32_t, H4, stl)
+GEN_VEXT_ST_ELEM(ste_d, uint64_t, H8, stq)
 
 static void vext_set_tail_elems_1s(target_ulong vl, void *vd,
                                    uint32_t desc, uint32_t nf,
@@ -199,7 +212,7 @@  static void
 vext_ldst_stride(void *vd, void *v0, target_ulong base,
                  target_ulong stride, CPURISCVState *env,
                  uint32_t desc, uint32_t vm,
-                 vext_ldst_elem_fn *ldst_elem,
+                 vext_ldst_elem_fn_tlb * ldst_elem,
                  uint32_t log2_esz, uintptr_t ra)
 {
     uint32_t i, k;
@@ -221,7 +234,8 @@  vext_ldst_stride(void *vd, void *v0, target_ulong base,
                 continue;
             }
             target_ulong addr = base + stride * i + (k << log2_esz);
-            ldst_elem(env, adjust_addr(env, addr), i + k * max_elems, vd, ra);
+            ldst_elem(env, adjust_addr(env, addr),
+                      (i + k * max_elems) << log2_esz, vd, ra);
             k++;
         }
     }
@@ -240,10 +254,10 @@  void HELPER(NAME)(void *vd, void * v0, target_ulong base,               \
                      ctzl(sizeof(ETYPE)), GETPC());                     \
 }
 
-GEN_VEXT_LD_STRIDE(vlse8_v,  int8_t,  lde_b)
-GEN_VEXT_LD_STRIDE(vlse16_v, int16_t, lde_h)
-GEN_VEXT_LD_STRIDE(vlse32_v, int32_t, lde_w)
-GEN_VEXT_LD_STRIDE(vlse64_v, int64_t, lde_d)
+GEN_VEXT_LD_STRIDE(vlse8_v,  int8_t,  lde_b_tlb)
+GEN_VEXT_LD_STRIDE(vlse16_v, int16_t, lde_h_tlb)
+GEN_VEXT_LD_STRIDE(vlse32_v, int32_t, lde_w_tlb)
+GEN_VEXT_LD_STRIDE(vlse64_v, int64_t, lde_d_tlb)
 
 #define GEN_VEXT_ST_STRIDE(NAME, ETYPE, STORE_FN)                       \
 void HELPER(NAME)(void *vd, void *v0, target_ulong base,                \
@@ -255,39 +269,100 @@  void HELPER(NAME)(void *vd, void *v0, target_ulong base,                \
                      ctzl(sizeof(ETYPE)), GETPC());                     \
 }
 
-GEN_VEXT_ST_STRIDE(vsse8_v,  int8_t,  ste_b)
-GEN_VEXT_ST_STRIDE(vsse16_v, int16_t, ste_h)
-GEN_VEXT_ST_STRIDE(vsse32_v, int32_t, ste_w)
-GEN_VEXT_ST_STRIDE(vsse64_v, int64_t, ste_d)
+GEN_VEXT_ST_STRIDE(vsse8_v,  int8_t,  ste_b_tlb)
+GEN_VEXT_ST_STRIDE(vsse16_v, int16_t, ste_h_tlb)
+GEN_VEXT_ST_STRIDE(vsse32_v, int32_t, ste_w_tlb)
+GEN_VEXT_ST_STRIDE(vsse64_v, int64_t, ste_d_tlb)
 
 /*
  * unit-stride: access elements stored contiguously in memory
  */
 
 /* unmasked unit-stride load and store operation */
+static void
+vext_page_ldst_us(CPURISCVState *env, void *vd, target_ulong addr,
+                  uint32_t elems, uint32_t nf, uint32_t max_elems,
+                  uint32_t log2_esz, bool is_load,
+                  vext_ldst_elem_fn_tlb *ldst_tlb,
+                  vext_ldst_elem_fn_host *ldst_host, uintptr_t ra)
+{
+    void *host;
+    int i, k, flags;
+    uint32_t esz = 1 << log2_esz;
+    uint32_t size = (elems * nf) << log2_esz;
+    uint32_t evl = env->vstart + elems;
+    int mmu_index = riscv_env_mmu_index(env, false);
+    MMUAccessType access_type = is_load ? MMU_DATA_LOAD : MMU_DATA_STORE;
+
+    /* Check page permission/pmp/watchpoint/etc. */
+    flags = probe_access_flags(env, adjust_addr(env, addr), size, access_type,
+                               mmu_index, true, &host, ra);
+
+    if (host && flags == 0) {
+        for (i = env->vstart; i < evl; ++i) {
+            k = 0;
+            while (k < nf) {
+                ldst_host(vd, (i + k * max_elems) << log2_esz, host);
+                host += esz;
+                k++;
+            }
+        }
+        env->vstart += elems;
+    } else {
+        /* load bytes from guest memory */
+        for (i = env->vstart; i < evl; env->vstart = ++i) {
+            k = 0;
+            while (k < nf) {
+                ldst_tlb(env, adjust_addr(env, addr),
+                         (i + k * max_elems) << log2_esz, vd, ra);
+                addr += esz;
+                k++;
+            }
+        }
+    }
+}
+
 static void
 vext_ldst_us(void *vd, target_ulong base, CPURISCVState *env, uint32_t desc,
-             vext_ldst_elem_fn *ldst_elem, uint32_t log2_esz, uint32_t evl,
-             uintptr_t ra)
+             vext_ldst_elem_fn_tlb *ldst_tlb,
+             vext_ldst_elem_fn_host *ldst_host, uint32_t log2_esz,
+             uint32_t evl, uintptr_t ra, bool is_load)
 {
-    uint32_t i, k;
+    uint32_t k;
+    target_ulong page_split, elems, addr;
     uint32_t nf = vext_nf(desc);
     uint32_t max_elems = vext_max_elems(desc, log2_esz);
     uint32_t esz = 1 << log2_esz;
+    uint32_t msize = nf * esz;
 
     VSTART_CHECK_EARLY_EXIT(env);
 
-    /* load bytes from guest memory */
-    for (i = env->vstart; i < evl; env->vstart = ++i) {
-        k = 0;
-        while (k < nf) {
-            target_ulong addr = base + ((i * nf + k) << log2_esz);
-            ldst_elem(env, adjust_addr(env, addr), i + k * max_elems, vd, ra);
-            k++;
+    while (env->vstart < evl) {
+        /* Calculate page range */
+        addr = base + ((env->vstart * nf) << log2_esz);
+        page_split = -(addr | TARGET_PAGE_MASK);
+        /* Get number of elements */
+        elems = page_split / msize;
+        if (unlikely(env->vstart + elems >= evl)) {
+            elems = evl - env->vstart;
+        }
+
+        /* Load/store elements in page */
+        vext_page_ldst_us(env, vd, addr, elems, nf, max_elems, log2_esz,
+                          is_load, ldst_tlb, ldst_host, ra);
+
+        /* Cross page element */
+        if (unlikely((page_split % msize) != 0 && (env->vstart + 1) < evl)) {
+            for (k = 0; k < nf; k++) {
+                addr = base + ((env->vstart * nf + k) << log2_esz);
+                ldst_tlb(env, adjust_addr(env, addr),
+                         (k * max_elems + env->vstart) << log2_esz, vd, ra);
+            }
+            env->vstart++;
         }
     }
-    env->vstart = 0;
 
+    env->vstart = 0;
     vext_set_tail_elems_1s(evl, vd, desc, nf, esz, max_elems);
 }
 
@@ -296,47 +371,47 @@  vext_ldst_us(void *vd, target_ulong base, CPURISCVState *env, uint32_t desc,
  * stride, stride = NF * sizeof (ETYPE)
  */
 
-#define GEN_VEXT_LD_US(NAME, ETYPE, LOAD_FN)                            \
-void HELPER(NAME##_mask)(void *vd, void *v0, target_ulong base,         \
-                         CPURISCVState *env, uint32_t desc)             \
-{                                                                       \
-    uint32_t stride = vext_nf(desc) << ctzl(sizeof(ETYPE));             \
-    vext_ldst_stride(vd, v0, base, stride, env, desc, false, LOAD_FN,   \
-                     ctzl(sizeof(ETYPE)), GETPC());                     \
-}                                                                       \
-                                                                        \
-void HELPER(NAME)(void *vd, void *v0, target_ulong base,                \
-                  CPURISCVState *env, uint32_t desc)                    \
-{                                                                       \
-    vext_ldst_us(vd, base, env, desc, LOAD_FN,                          \
-                 ctzl(sizeof(ETYPE)), env->vl, GETPC());                \
+#define GEN_VEXT_LD_US(NAME, ETYPE, LOAD_FN_TLB, LOAD_FN_HOST)      \
+void HELPER(NAME##_mask)(void *vd, void *v0, target_ulong base,     \
+                         CPURISCVState *env, uint32_t desc)         \
+{                                                                   \
+    uint32_t stride = vext_nf(desc) << ctzl(sizeof(ETYPE));         \
+    vext_ldst_stride(vd, v0, base, stride, env, desc, false,        \
+                     LOAD_FN_TLB, ctzl(sizeof(ETYPE)), GETPC());    \
+}                                                                   \
+                                                                    \
+void HELPER(NAME)(void *vd, void *v0, target_ulong base,            \
+                  CPURISCVState *env, uint32_t desc)                \
+{                                                                   \
+    vext_ldst_us(vd, base, env, desc, LOAD_FN_TLB, LOAD_FN_HOST,    \
+                 ctzl(sizeof(ETYPE)), env->vl, GETPC(), true);      \
 }
 
-GEN_VEXT_LD_US(vle8_v,  int8_t,  lde_b)
-GEN_VEXT_LD_US(vle16_v, int16_t, lde_h)
-GEN_VEXT_LD_US(vle32_v, int32_t, lde_w)
-GEN_VEXT_LD_US(vle64_v, int64_t, lde_d)
+GEN_VEXT_LD_US(vle8_v,  int8_t,  lde_b_tlb, lde_b_host)
+GEN_VEXT_LD_US(vle16_v, int16_t, lde_h_tlb, lde_h_host)
+GEN_VEXT_LD_US(vle32_v, int32_t, lde_w_tlb, lde_w_host)
+GEN_VEXT_LD_US(vle64_v, int64_t, lde_d_tlb, lde_d_host)
 
-#define GEN_VEXT_ST_US(NAME, ETYPE, STORE_FN)                            \
+#define GEN_VEXT_ST_US(NAME, ETYPE, STORE_FN_TLB, STORE_FN_HOST)         \
 void HELPER(NAME##_mask)(void *vd, void *v0, target_ulong base,          \
                          CPURISCVState *env, uint32_t desc)              \
 {                                                                        \
     uint32_t stride = vext_nf(desc) << ctzl(sizeof(ETYPE));              \
-    vext_ldst_stride(vd, v0, base, stride, env, desc, false, STORE_FN,   \
-                     ctzl(sizeof(ETYPE)), GETPC());                      \
+    vext_ldst_stride(vd, v0, base, stride, env, desc, false,             \
+                     STORE_FN_TLB, ctzl(sizeof(ETYPE)), GETPC());        \
 }                                                                        \
                                                                          \
 void HELPER(NAME)(void *vd, void *v0, target_ulong base,                 \
                   CPURISCVState *env, uint32_t desc)                     \
 {                                                                        \
-    vext_ldst_us(vd, base, env, desc, STORE_FN,                          \
-                 ctzl(sizeof(ETYPE)), env->vl, GETPC());                 \
+    vext_ldst_us(vd, base, env, desc, STORE_FN_TLB, STORE_FN_HOST,       \
+                 ctzl(sizeof(ETYPE)), env->vl, GETPC(), false);          \
 }
 
-GEN_VEXT_ST_US(vse8_v,  int8_t,  ste_b)
-GEN_VEXT_ST_US(vse16_v, int16_t, ste_h)
-GEN_VEXT_ST_US(vse32_v, int32_t, ste_w)
-GEN_VEXT_ST_US(vse64_v, int64_t, ste_d)
+GEN_VEXT_ST_US(vse8_v,  int8_t,  ste_b_tlb, ste_b_host)
+GEN_VEXT_ST_US(vse16_v, int16_t, ste_h_tlb, ste_h_host)
+GEN_VEXT_ST_US(vse32_v, int32_t, ste_w_tlb, ste_w_host)
+GEN_VEXT_ST_US(vse64_v, int64_t, ste_d_tlb, ste_d_host)
 
 /*
  * unit stride mask load and store, EEW = 1
@@ -346,8 +421,8 @@  void HELPER(vlm_v)(void *vd, void *v0, target_ulong base,
 {
     /* evl = ceil(vl/8) */
     uint8_t evl = (env->vl + 7) >> 3;
-    vext_ldst_us(vd, base, env, desc, lde_b,
-                 0, evl, GETPC());
+    vext_ldst_us(vd, base, env, desc, lde_b_tlb, lde_b_host,
+                 0, evl, GETPC(), true);
 }
 
 void HELPER(vsm_v)(void *vd, void *v0, target_ulong base,
@@ -355,8 +430,8 @@  void HELPER(vsm_v)(void *vd, void *v0, target_ulong base,
 {
     /* evl = ceil(vl/8) */
     uint8_t evl = (env->vl + 7) >> 3;
-    vext_ldst_us(vd, base, env, desc, ste_b,
-                 0, evl, GETPC());
+    vext_ldst_us(vd, base, env, desc, ste_b_tlb, ste_b_host,
+                 0, evl, GETPC(), false);
 }
 
 /*
@@ -381,7 +456,7 @@  static inline void
 vext_ldst_index(void *vd, void *v0, target_ulong base,
                 void *vs2, CPURISCVState *env, uint32_t desc,
                 vext_get_index_addr get_index_addr,
-                vext_ldst_elem_fn *ldst_elem,
+                vext_ldst_elem_fn_tlb *ldst_elem,
                 uint32_t log2_esz, uintptr_t ra)
 {
     uint32_t i, k;
@@ -405,7 +480,8 @@  vext_ldst_index(void *vd, void *v0, target_ulong base,
                 continue;
             }
             abi_ptr addr = get_index_addr(base, i, vs2) + (k << log2_esz);
-            ldst_elem(env, adjust_addr(env, addr), i + k * max_elems, vd, ra);
+            ldst_elem(env, adjust_addr(env, addr),
+                      (i + k * max_elems) << log2_esz, vd, ra);
             k++;
         }
     }
@@ -422,22 +498,22 @@  void HELPER(NAME)(void *vd, void *v0, target_ulong base,                   \
                     LOAD_FN, ctzl(sizeof(ETYPE)), GETPC());                \
 }
 
-GEN_VEXT_LD_INDEX(vlxei8_8_v,   int8_t,  idx_b, lde_b)
-GEN_VEXT_LD_INDEX(vlxei8_16_v,  int16_t, idx_b, lde_h)
-GEN_VEXT_LD_INDEX(vlxei8_32_v,  int32_t, idx_b, lde_w)
-GEN_VEXT_LD_INDEX(vlxei8_64_v,  int64_t, idx_b, lde_d)
-GEN_VEXT_LD_INDEX(vlxei16_8_v,  int8_t,  idx_h, lde_b)
-GEN_VEXT_LD_INDEX(vlxei16_16_v, int16_t, idx_h, lde_h)
-GEN_VEXT_LD_INDEX(vlxei16_32_v, int32_t, idx_h, lde_w)
-GEN_VEXT_LD_INDEX(vlxei16_64_v, int64_t, idx_h, lde_d)
-GEN_VEXT_LD_INDEX(vlxei32_8_v,  int8_t,  idx_w, lde_b)
-GEN_VEXT_LD_INDEX(vlxei32_16_v, int16_t, idx_w, lde_h)
-GEN_VEXT_LD_INDEX(vlxei32_32_v, int32_t, idx_w, lde_w)
-GEN_VEXT_LD_INDEX(vlxei32_64_v, int64_t, idx_w, lde_d)
-GEN_VEXT_LD_INDEX(vlxei64_8_v,  int8_t,  idx_d, lde_b)
-GEN_VEXT_LD_INDEX(vlxei64_16_v, int16_t, idx_d, lde_h)
-GEN_VEXT_LD_INDEX(vlxei64_32_v, int32_t, idx_d, lde_w)
-GEN_VEXT_LD_INDEX(vlxei64_64_v, int64_t, idx_d, lde_d)
+GEN_VEXT_LD_INDEX(vlxei8_8_v,   int8_t,  idx_b, lde_b_tlb)
+GEN_VEXT_LD_INDEX(vlxei8_16_v,  int16_t, idx_b, lde_h_tlb)
+GEN_VEXT_LD_INDEX(vlxei8_32_v,  int32_t, idx_b, lde_w_tlb)
+GEN_VEXT_LD_INDEX(vlxei8_64_v,  int64_t, idx_b, lde_d_tlb)
+GEN_VEXT_LD_INDEX(vlxei16_8_v,  int8_t,  idx_h, lde_b_tlb)
+GEN_VEXT_LD_INDEX(vlxei16_16_v, int16_t, idx_h, lde_h_tlb)
+GEN_VEXT_LD_INDEX(vlxei16_32_v, int32_t, idx_h, lde_w_tlb)
+GEN_VEXT_LD_INDEX(vlxei16_64_v, int64_t, idx_h, lde_d_tlb)
+GEN_VEXT_LD_INDEX(vlxei32_8_v,  int8_t,  idx_w, lde_b_tlb)
+GEN_VEXT_LD_INDEX(vlxei32_16_v, int16_t, idx_w, lde_h_tlb)
+GEN_VEXT_LD_INDEX(vlxei32_32_v, int32_t, idx_w, lde_w_tlb)
+GEN_VEXT_LD_INDEX(vlxei32_64_v, int64_t, idx_w, lde_d_tlb)
+GEN_VEXT_LD_INDEX(vlxei64_8_v,  int8_t,  idx_d, lde_b_tlb)
+GEN_VEXT_LD_INDEX(vlxei64_16_v, int16_t, idx_d, lde_h_tlb)
+GEN_VEXT_LD_INDEX(vlxei64_32_v, int32_t, idx_d, lde_w_tlb)
+GEN_VEXT_LD_INDEX(vlxei64_64_v, int64_t, idx_d, lde_d_tlb)
 
 #define GEN_VEXT_ST_INDEX(NAME, ETYPE, INDEX_FN, STORE_FN)       \
 void HELPER(NAME)(void *vd, void *v0, target_ulong base,         \
@@ -448,22 +524,22 @@  void HELPER(NAME)(void *vd, void *v0, target_ulong base,         \
                     GETPC());                                    \
 }
 
-GEN_VEXT_ST_INDEX(vsxei8_8_v,   int8_t,  idx_b, ste_b)
-GEN_VEXT_ST_INDEX(vsxei8_16_v,  int16_t, idx_b, ste_h)
-GEN_VEXT_ST_INDEX(vsxei8_32_v,  int32_t, idx_b, ste_w)
-GEN_VEXT_ST_INDEX(vsxei8_64_v,  int64_t, idx_b, ste_d)
-GEN_VEXT_ST_INDEX(vsxei16_8_v,  int8_t,  idx_h, ste_b)
-GEN_VEXT_ST_INDEX(vsxei16_16_v, int16_t, idx_h, ste_h)
-GEN_VEXT_ST_INDEX(vsxei16_32_v, int32_t, idx_h, ste_w)
-GEN_VEXT_ST_INDEX(vsxei16_64_v, int64_t, idx_h, ste_d)
-GEN_VEXT_ST_INDEX(vsxei32_8_v,  int8_t,  idx_w, ste_b)
-GEN_VEXT_ST_INDEX(vsxei32_16_v, int16_t, idx_w, ste_h)
-GEN_VEXT_ST_INDEX(vsxei32_32_v, int32_t, idx_w, ste_w)
-GEN_VEXT_ST_INDEX(vsxei32_64_v, int64_t, idx_w, ste_d)
-GEN_VEXT_ST_INDEX(vsxei64_8_v,  int8_t,  idx_d, ste_b)
-GEN_VEXT_ST_INDEX(vsxei64_16_v, int16_t, idx_d, ste_h)
-GEN_VEXT_ST_INDEX(vsxei64_32_v, int32_t, idx_d, ste_w)
-GEN_VEXT_ST_INDEX(vsxei64_64_v, int64_t, idx_d, ste_d)
+GEN_VEXT_ST_INDEX(vsxei8_8_v,   int8_t,  idx_b, ste_b_tlb)
+GEN_VEXT_ST_INDEX(vsxei8_16_v,  int16_t, idx_b, ste_h_tlb)
+GEN_VEXT_ST_INDEX(vsxei8_32_v,  int32_t, idx_b, ste_w_tlb)
+GEN_VEXT_ST_INDEX(vsxei8_64_v,  int64_t, idx_b, ste_d_tlb)
+GEN_VEXT_ST_INDEX(vsxei16_8_v,  int8_t,  idx_h, ste_b_tlb)
+GEN_VEXT_ST_INDEX(vsxei16_16_v, int16_t, idx_h, ste_h_tlb)
+GEN_VEXT_ST_INDEX(vsxei16_32_v, int32_t, idx_h, ste_w_tlb)
+GEN_VEXT_ST_INDEX(vsxei16_64_v, int64_t, idx_h, ste_d_tlb)
+GEN_VEXT_ST_INDEX(vsxei32_8_v,  int8_t,  idx_w, ste_b_tlb)
+GEN_VEXT_ST_INDEX(vsxei32_16_v, int16_t, idx_w, ste_h_tlb)
+GEN_VEXT_ST_INDEX(vsxei32_32_v, int32_t, idx_w, ste_w_tlb)
+GEN_VEXT_ST_INDEX(vsxei32_64_v, int64_t, idx_w, ste_d_tlb)
+GEN_VEXT_ST_INDEX(vsxei64_8_v,  int8_t,  idx_d, ste_b_tlb)
+GEN_VEXT_ST_INDEX(vsxei64_16_v, int16_t, idx_d, ste_h_tlb)
+GEN_VEXT_ST_INDEX(vsxei64_32_v, int32_t, idx_d, ste_w_tlb)
+GEN_VEXT_ST_INDEX(vsxei64_64_v, int64_t, idx_d, ste_d_tlb)
 
 /*
  * unit-stride fault-only-fisrt load instructions
@@ -471,7 +547,7 @@  GEN_VEXT_ST_INDEX(vsxei64_64_v, int64_t, idx_d, ste_d)
 static inline void
 vext_ldff(void *vd, void *v0, target_ulong base,
           CPURISCVState *env, uint32_t desc,
-          vext_ldst_elem_fn *ldst_elem,
+          vext_ldst_elem_fn_tlb *ldst_elem,
           uint32_t log2_esz, uintptr_t ra)
 {
     uint32_t i, k, vl = 0;
@@ -539,7 +615,8 @@  vext_ldff(void *vd, void *v0, target_ulong base,
                 continue;
             }
             addr = base + ((i * nf + k) << log2_esz);
-            ldst_elem(env, adjust_addr(env, addr), i + k * max_elems, vd, ra);
+            ldst_elem(env, adjust_addr(env, addr),
+                      (i + k * max_elems) << log2_esz, vd, ra);
             k++;
         }
     }
@@ -556,10 +633,10 @@  void HELPER(NAME)(void *vd, void *v0, target_ulong base,  \
               ctzl(sizeof(ETYPE)), GETPC());              \
 }
 
-GEN_VEXT_LDFF(vle8ff_v,  int8_t,  lde_b)
-GEN_VEXT_LDFF(vle16ff_v, int16_t, lde_h)
-GEN_VEXT_LDFF(vle32ff_v, int32_t, lde_w)
-GEN_VEXT_LDFF(vle64ff_v, int64_t, lde_d)
+GEN_VEXT_LDFF(vle8ff_v,  int8_t,  lde_b_tlb)
+GEN_VEXT_LDFF(vle16ff_v, int16_t, lde_h_tlb)
+GEN_VEXT_LDFF(vle32ff_v, int32_t, lde_w_tlb)
+GEN_VEXT_LDFF(vle64ff_v, int64_t, lde_d_tlb)
 
 #define DO_SWAP(N, M) (M)
 #define DO_AND(N, M)  (N & M)
@@ -576,7 +653,8 @@  GEN_VEXT_LDFF(vle64ff_v, int64_t, lde_d)
  */
 static void
 vext_ldst_whole(void *vd, target_ulong base, CPURISCVState *env, uint32_t desc,
-                vext_ldst_elem_fn *ldst_elem, uint32_t log2_esz, uintptr_t ra)
+                vext_ldst_elem_fn_tlb *ldst_elem, uint32_t log2_esz,
+                uintptr_t ra)
 {
     uint32_t i, k, off, pos;
     uint32_t nf = vext_nf(desc);
@@ -595,8 +673,8 @@  vext_ldst_whole(void *vd, target_ulong base, CPURISCVState *env, uint32_t desc,
         /* load/store rest of elements of current segment pointed by vstart */
         for (pos = off; pos < max_elems; pos++, env->vstart++) {
             target_ulong addr = base + ((pos + k * max_elems) << log2_esz);
-            ldst_elem(env, adjust_addr(env, addr), pos + k * max_elems, vd,
-                      ra);
+            ldst_elem(env, adjust_addr(env, addr),
+                      (pos + k * max_elems) << log2_esz, vd, ra);
         }
         k++;
     }
@@ -605,7 +683,8 @@  vext_ldst_whole(void *vd, target_ulong base, CPURISCVState *env, uint32_t desc,
     for (; k < nf; k++) {
         for (i = 0; i < max_elems; i++, env->vstart++) {
             target_ulong addr = base + ((i + k * max_elems) << log2_esz);
-            ldst_elem(env, adjust_addr(env, addr), i + k * max_elems, vd, ra);
+            ldst_elem(env, adjust_addr(env, addr),
+                      (i + k * max_elems) << log2_esz, vd, ra);
         }
     }
 
@@ -620,22 +699,22 @@  void HELPER(NAME)(void *vd, target_ulong base,       \
                     ctzl(sizeof(ETYPE)), GETPC());   \
 }
 
-GEN_VEXT_LD_WHOLE(vl1re8_v,  int8_t,  lde_b)
-GEN_VEXT_LD_WHOLE(vl1re16_v, int16_t, lde_h)
-GEN_VEXT_LD_WHOLE(vl1re32_v, int32_t, lde_w)
-GEN_VEXT_LD_WHOLE(vl1re64_v, int64_t, lde_d)
-GEN_VEXT_LD_WHOLE(vl2re8_v,  int8_t,  lde_b)
-GEN_VEXT_LD_WHOLE(vl2re16_v, int16_t, lde_h)
-GEN_VEXT_LD_WHOLE(vl2re32_v, int32_t, lde_w)
-GEN_VEXT_LD_WHOLE(vl2re64_v, int64_t, lde_d)
-GEN_VEXT_LD_WHOLE(vl4re8_v,  int8_t,  lde_b)
-GEN_VEXT_LD_WHOLE(vl4re16_v, int16_t, lde_h)
-GEN_VEXT_LD_WHOLE(vl4re32_v, int32_t, lde_w)
-GEN_VEXT_LD_WHOLE(vl4re64_v, int64_t, lde_d)
-GEN_VEXT_LD_WHOLE(vl8re8_v,  int8_t,  lde_b)
-GEN_VEXT_LD_WHOLE(vl8re16_v, int16_t, lde_h)
-GEN_VEXT_LD_WHOLE(vl8re32_v, int32_t, lde_w)
-GEN_VEXT_LD_WHOLE(vl8re64_v, int64_t, lde_d)
+GEN_VEXT_LD_WHOLE(vl1re8_v,  int8_t,  lde_b_tlb)
+GEN_VEXT_LD_WHOLE(vl1re16_v, int16_t, lde_h_tlb)
+GEN_VEXT_LD_WHOLE(vl1re32_v, int32_t, lde_w_tlb)
+GEN_VEXT_LD_WHOLE(vl1re64_v, int64_t, lde_d_tlb)
+GEN_VEXT_LD_WHOLE(vl2re8_v,  int8_t,  lde_b_tlb)
+GEN_VEXT_LD_WHOLE(vl2re16_v, int16_t, lde_h_tlb)
+GEN_VEXT_LD_WHOLE(vl2re32_v, int32_t, lde_w_tlb)
+GEN_VEXT_LD_WHOLE(vl2re64_v, int64_t, lde_d_tlb)
+GEN_VEXT_LD_WHOLE(vl4re8_v,  int8_t,  lde_b_tlb)
+GEN_VEXT_LD_WHOLE(vl4re16_v, int16_t, lde_h_tlb)
+GEN_VEXT_LD_WHOLE(vl4re32_v, int32_t, lde_w_tlb)
+GEN_VEXT_LD_WHOLE(vl4re64_v, int64_t, lde_d_tlb)
+GEN_VEXT_LD_WHOLE(vl8re8_v,  int8_t,  lde_b_tlb)
+GEN_VEXT_LD_WHOLE(vl8re16_v, int16_t, lde_h_tlb)
+GEN_VEXT_LD_WHOLE(vl8re32_v, int32_t, lde_w_tlb)
+GEN_VEXT_LD_WHOLE(vl8re64_v, int64_t, lde_d_tlb)
 
 #define GEN_VEXT_ST_WHOLE(NAME, ETYPE, STORE_FN)     \
 void HELPER(NAME)(void *vd, target_ulong base,       \
@@ -645,10 +724,10 @@  void HELPER(NAME)(void *vd, target_ulong base,       \
                     ctzl(sizeof(ETYPE)), GETPC());   \
 }
 
-GEN_VEXT_ST_WHOLE(vs1r_v, int8_t, ste_b)
-GEN_VEXT_ST_WHOLE(vs2r_v, int8_t, ste_b)
-GEN_VEXT_ST_WHOLE(vs4r_v, int8_t, ste_b)
-GEN_VEXT_ST_WHOLE(vs8r_v, int8_t, ste_b)
+GEN_VEXT_ST_WHOLE(vs1r_v, int8_t, ste_b_tlb)
+GEN_VEXT_ST_WHOLE(vs2r_v, int8_t, ste_b_tlb)
+GEN_VEXT_ST_WHOLE(vs4r_v, int8_t, ste_b_tlb)
+GEN_VEXT_ST_WHOLE(vs8r_v, int8_t, ste_b_tlb)
 
 /*
  * Vector Integer Arithmetic Instructions