arm64: lib: memory utilities optimization

Optimize memcpy and memmove to prefetch several cache lines ahead of the copy loops. We can achieve a 15% memcpy speed improvement with this preload method.
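
As a minimal sketch of the pattern (a hypothetical, self-contained copy64, not the kernel routine; it assumes a 64-byte L1 line and a count that is a nonzero multiple of 64):

	.text
	.global	copy64
copy64:				/* AAPCS64: x0 = dst, x1 = src, x2 = count */
	prfm	pldl1strm, [x1, #64]	/* prime the second line before the loop */
1:	ldp	x3, x4, [x1]
	ldp	x5, x6, [x1, #16]
	ldp	x7, x8, [x1, #32]
	ldp	x9, x10, [x1, #48]
	prfm	pldl1strm, [x1, #256]	/* stay ~4 cache lines ahead of the loads */
	stp	x3, x4, [x0]
	stp	x5, x6, [x0, #16]
	stp	x7, x8, [x0, #32]
	stp	x9, x10, [x0, #48]
	add	x1, x1, #64
	add	x0, x0, #64
	subs	x2, x2, #64
	b.gt	1b
	ret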

Change-Id: I2259b98a33eba0b7466920b3f270f953e609cf13
Signed-off-by: Hong-Mei Li <a21834@motorola.com>
Reviewed-on: http://gerrit.mot.com/740766
SLTApproved: Slta Waiver <sltawvr@motorola.com>
SME-Granted: SME Approvals Granted
Tested-by: Jira Key <jirakey@motorola.com>
Reviewed-by: Zhi-Ming Yuan <a14194@motorola.com>
Submit-Approved: Jira Key <jirakey@motorola.com>
Signed-off-by: Adam W. Willis <return.of.octobot@gmail.com>
Signed-off-by: dreamisbaka <jolinux.g@gmail.com>
commit 364f97ef25 (parent 07665b1117)
Author: Hong-Mei Li <a21834@motorola.com>
Date:   2015-04-21 14:53:35 -07:00
Committer: Gagan Malvi
2 changed files with 4 additions and 0 deletions

diff --git a/arch/arm64/lib/memcpy.S b/arch/arm64/lib/memcpy.S
--- a/arch/arm64/lib/memcpy.S
+++ b/arch/arm64/lib/memcpy.S
@@ -51,6 +51,7 @@ C_h .req x12
 D_l	.req	x13
 D_h	.req	x14
 
+	prfm	pldl1strm, [src, #(1*L1_CACHE_BYTES)]
 	mov	dst, dstin
 	cmp	count, #16
 	/* When memory length is less than 16, the accesses are not aligned. */
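
With src still at its entry value, this prefetch warms the line at src + L1_CACHE_BYTES (the buffer's second cache line on a typical 64-byte-line arm64 part) while the first line is demand-fetched by the initial loads.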
@@ -181,6 +182,7 @@ D_h .req x14
 	ldp1	C_l, C_h, src, #16
 	stp1	D_l, D_h, dst, #16
 	ldp1	D_l, D_h, src, #16
+	prfm	pldl1strm, [src, #(4*L1_CACHE_BYTES)]
 	subs	count, count, #64
 	b.ge	1b
 	stp1	A_l, A_h, dst, #16
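
In this bulk loop each iteration consumes 64 bytes, so prefetching at src + 4*L1_CACHE_BYTES (256 bytes with 64-byte lines) keeps the stream roughly four iterations ahead of the loads; the pldl1strm hint also marks the data as streaming, i.e. unlikely to be worth retaining in L1.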

diff --git a/arch/arm64/lib/memmove.S b/arch/arm64/lib/memmove.S
--- a/arch/arm64/lib/memmove.S
+++ b/arch/arm64/lib/memmove.S
@@ -60,6 +60,7 @@ D_h .req x14
 .weak memmove
 ENTRY(__memmove)
 ENTRY(memmove)
+	prfm	pldl1strm, [src, #L1_CACHE_BYTES]
 	cmp	dstin, src
 	b.lo	__memcpy
 	add	tmp1, src, count
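
The prefetch is issued before the overlap check, so the first source line is primed whether control falls through to the backward-copy path below or branches to __memcpy.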
@@ -186,6 +187,7 @@ ENTRY(memmove)
 	ldp	C_l, C_h, [src, #-48]
 	stp	D_l, D_h, [dst, #-64]!
 	ldp	D_l, D_h, [src, #-64]!
+	prfm	pldl1strm, [src, #(4*L1_CACHE_BYTES)]
 	subs	count, count, #64
 	b.ge	1b
 	stp	A_l, A_h, [dst, #-16]
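
For reference, the copy64 sketch above follows the standard AAPCS64 argument registers and can be assembled with GNU as and linked against a C caller. Its prefetch distance (#256 there, #(4*L1_CACHE_BYTES) in this patch) is the main tuning knob: a larger distance gives lines more time to arrive, but touches memory the loop may never reach and puts more pressure on L1.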