arm64: lib: memory utilities optimization
Optimize memcpy and memmove, to prefetch several cache lines. We can achieve 15% memcpy speed improvement with the preload method. Change-Id: I2259b98a33eba0b7466920b3f270f953e609cf13 Signed-off-by: Hong-Mei Li <a21834@motorola.com> Reviewed-on: http://gerrit.mot.com/740766 SLTApproved: Slta Waiver <sltawvr@motorola.com> SME-Granted: SME Approvals Granted Tested-by: Jira Key <jirakey@motorola.com> Reviewed-by: Zhi-Ming Yuan <a14194@motorola.com> Submit-Approved: Jira Key <jirakey@motorola.com> Signed-off-by: Adam W. Willis <return.of.octobot@gmail.com> Signed-off-by: dreamisbaka <jolinux.g@gmail.com>
This commit is contained in:
parent
07665b1117
commit
364f97ef25
2 changed files with 4 additions and 0 deletions
|
@@ -51,6 +51,7 @@ C_h	.req	x12
 D_l	.req	x13
 D_h	.req	x14
 
+	prfm	pldl1strm, [src, #(1*L1_CACHE_BYTES)]
 	mov	dst, dstin
 	cmp	count, #16
 	/*When memory length is less than 16, the accessed are not aligned.*/
@@ -181,6 +182,7 @@ D_h	.req	x14
 	ldp1	C_l, C_h, src, #16
 	stp1	D_l, D_h, dst, #16
 	ldp1	D_l, D_h, src, #16
+	prfm	pldl1strm, [src, #(4*L1_CACHE_BYTES)]
 	subs	count, count, #64
 	b.ge	1b
 	stp1	A_l, A_h, dst, #16
@@ -60,6 +60,7 @@ D_h	.req	x14
 .weak memmove
 ENTRY(__memmove)
 ENTRY(memmove)
+	prfm	pldl1strm, [src, #L1_CACHE_BYTES]
 	cmp	dstin, src
 	b.lo	__memcpy
 	add	tmp1, src, count
@@ -186,6 +187,7 @@ ENTRY(memmove)
 	ldp	C_l, C_h, [src, #-48]
 	stp	D_l, D_h, [dst, #-64]!
 	ldp	D_l, D_h, [src, #-64]!
+	prfm	pldl1strm, [src, #(4*L1_CACHE_BYTES)]
 	subs	count, count, #64
 	b.ge	1b
 	stp	A_l, A_h, [dst, #-16]
Loading…
Reference in a new issue