@@ -132,14 +132,16 @@ D_h .req x14
* Less than 128 bytes to copy, so handle 64 here and then jump
* to the tail.
*/
- ldp1 A_l, A_h, src, #16
- stp1 A_l, A_h, dst, #16
- ldp1 B_l, B_h, src, #16
- ldp1 C_l, C_h, src, #16
- stp1 B_l, B_h, dst, #16
- stp1 C_l, C_h, dst, #16
- ldp1 D_l, D_h, src, #16
- stp1 D_l, D_h, dst, #16
+ ldp2 A_l, A_h, src, #0, #8
+ stp2 A_l, A_h, dst, #0, #8
+ ldp2 B_l, B_h, src, #16, #24
+ ldp2 C_l, C_h, src, #32, #40
+ stp2 B_l, B_h, dst, #16, #24
+ stp2 C_l, C_h, dst, #32, #40
+ ldp2 D_l, D_h, src, #48, #56
+ stp2 D_l, D_h, dst, #48, #56
+ add src, src, #64
+ add dst, dst, #64
tst count, #0x3f
b.ne .Ltail63
When copying at least 64 but fewer than 128 bytes, advance src/dst only once, after the 64 bytes have been loaded and stored, to improve performance. Cost of copying 127 bytes on Kunpeng920 (ms): Without this patch: memcpy: 14.62 copy_from_user: 14.23 copy_to_user: 14.42 With this patch: memcpy: 13.85 copy_from_user: 13.26 copy_to_user: 13.84 That is about a 5.27% improvement in memcpy(). Signed-off-by: Yang Yingliang <yangyingliang@huawei.com> --- arch/arm64/lib/copy_template.S | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-)