83a7a2ad2a
We already have cpufeature indices above 255, so use a 16-bit number for the alternatives index. This consumes a padding field and so doesn't add any size, but it means that abusing the padding field to create assembly errors on overflow no longer works. We can retain the test simply by redirecting it to the .discard section, however.

[ v3: updated to include open-coded locations ]

Signed-off-by: H. Peter Anvin <hpa@linux.intel.com>
LKML-Reference: <tip-f88731e3068f9d1392ba71cc9f50f035d26a0d4f@git.kernel.org>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
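For reference, each record in the .altinstructions section at the bottom of the file below is laid out to match the kernel's struct alt_instr, and the change described above widens its feature field from 8 to 16 bits. The following is a hedged sketch of the x86-64 layout from this era (field names follow <asm/alternative.h>, using the kernel's u8/u16 typedefs; treat the trailing padding as approximate rather than verbatim):

	struct alt_instr {
		u8 *instr;		/* original instruction(s)            -> .quad copy_page */
		u8 *replacement;	/* replacement bytes                  -> .quad 1b */
		u16 cpuid;		/* feature bit, now 16 bits wide      -> .word X86_FEATURE_REP_GOOD */
		u8  instrlen;		/* length of the original             -> .byte .Lcopy_page_end - copy_page */
		u8  replacementlen;	/* replacement length, <= instrlen    -> .byte 2b - 1b */
		/* 64-bit builds carry trailing padding to keep entries aligned. */
	};

The arrows show how the hand-written (open-coded) entry at the end of this file fills in each field.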
/* Written 2003 by Andi Kleen, based on a kernel by Evandro Menezes */

#include <linux/linkage.h>
#include <asm/dwarf2.h>

	ALIGN
	/* Simple variant: copy one page (4096 bytes = 512 quadwords) with rep movsq. */
copy_page_c:
	CFI_STARTPROC
	movl	$4096/8,%ecx
	rep	movsq
	ret
	CFI_ENDPROC
ENDPROC(copy_page_c)

/* Don't use streaming stores; plain stores are better when the target
   ends up in cache. */

/* Could vary the prefetch distance based on SMP/UP. */

ENTRY(copy_page)
	CFI_STARTPROC
	/* Save the callee-saved registers used below. */
	subq	$3*8,%rsp
	CFI_ADJUST_CFA_OFFSET 3*8
	movq	%rbx,(%rsp)
	CFI_REL_OFFSET rbx, 0
	movq	%r12,1*8(%rsp)
	CFI_REL_OFFSET r12, 1*8
	movq	%r13,2*8(%rsp)
	CFI_REL_OFFSET r13, 2*8

	/* Main loop: copy 64 bytes per iteration, prefetching five cache
	   lines ahead.  The last five lines are handled by .Loop2 below
	   so the prefetch stays within the source page. */
	movl	$(4096/64)-5,%ecx
	.p2align 4
.Loop64:
	dec	%rcx

	movq	   (%rsi), %rax
	movq	 8 (%rsi), %rbx
	movq	16 (%rsi), %rdx
	movq	24 (%rsi), %r8
	movq	32 (%rsi), %r9
	movq	40 (%rsi), %r10
	movq	48 (%rsi), %r11
	movq	56 (%rsi), %r12

	prefetcht0 5*64(%rsi)

	movq	%rax,    (%rdi)
	movq	%rbx,  8 (%rdi)
	movq	%rdx, 16 (%rdi)
	movq	%r8,  24 (%rdi)
	movq	%r9,  32 (%rdi)
	movq	%r10, 40 (%rdi)
	movq	%r11, 48 (%rdi)
	movq	%r12, 56 (%rdi)

	leaq	64 (%rsi), %rsi
	leaq	64 (%rdi), %rdi

	jnz	.Loop64

	/* Tail loop: the remaining five cache lines, without prefetch. */
	movl	$5,%ecx
	.p2align 4
.Loop2:
	decl	%ecx

	movq	   (%rsi), %rax
	movq	 8 (%rsi), %rbx
	movq	16 (%rsi), %rdx
	movq	24 (%rsi), %r8
	movq	32 (%rsi), %r9
	movq	40 (%rsi), %r10
	movq	48 (%rsi), %r11
	movq	56 (%rsi), %r12

	movq	%rax,    (%rdi)
	movq	%rbx,  8 (%rdi)
	movq	%rdx, 16 (%rdi)
	movq	%r8,  24 (%rdi)
	movq	%r9,  32 (%rdi)
	movq	%r10, 40 (%rdi)
	movq	%r11, 48 (%rdi)
	movq	%r12, 56 (%rdi)

	leaq	64(%rdi),%rdi
	leaq	64(%rsi),%rsi

	jnz	.Loop2

	/* Restore callee-saved registers and return. */
	movq	(%rsp),%rbx
	CFI_RESTORE rbx
	movq	1*8(%rsp),%r12
	CFI_RESTORE r12
	movq	2*8(%rsp),%r13
	CFI_RESTORE r13
	addq	$3*8,%rsp
	CFI_ADJUST_CFA_OFFSET -3*8
	ret
.Lcopy_page_end:
	CFI_ENDPROC
ENDPROC(copy_page)

	/* Some CPUs run faster using the string copy instructions.
	   It is also a lot simpler.  Use this when possible. */

#include <asm/cpufeature.h>

	.section .altinstr_replacement,"ax"
1:	.byte 0xeb					/* jmp <disp8> */
	.byte (copy_page_c - copy_page) - (2f - 1b)	/* offset */
2:
	.previous
	.section .altinstructions,"a"
	.align 8
	/* One alt_instr record: replace the start of copy_page with a short
	   jump to copy_page_c when the CPU has X86_FEATURE_REP_GOOD.  Note
	   the 16-bit .word for the feature index. */
	.quad copy_page
	.quad 1b
	.word X86_FEATURE_REP_GOOD
	.byte .Lcopy_page_end - copy_page
	.byte 2b - 1b
	.previous
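At boot, the alternatives code walks these records and, when the named feature bit is set, writes the replacement bytes (here, the 2-byte jmp to copy_page_c) over the start of copy_page and pads the rest of the slot with NOPs. Below is a minimal, hedged sketch of that loop, modelled on apply_alternatives() and written as if it lived inside arch/x86/kernel/alternative.c, where helpers such as add_nops(), text_poke_early() and MAX_PATCH_LEN are defined; the real code also adjusts relative jumps and does more sanity checking, which is omitted here.

	/* Hedged sketch of boot-time alternatives patching; not the kernel's code. */
	static void __init patch_alternatives_sketch(struct alt_instr *start,
						     struct alt_instr *end)
	{
		struct alt_instr *a;
		u8 insnbuf[MAX_PATCH_LEN];

		for (a = start; a < end; a++) {
			if (!boot_cpu_has(a->cpuid))	/* e.g. X86_FEATURE_REP_GOOD */
				continue;

			/* Copy the replacement bytes into a scratch buffer ... */
			memcpy(insnbuf, a->replacement, a->replacementlen);
			/* ... pad the remainder of the original slot with NOPs ... */
			add_nops(insnbuf + a->replacementlen,
				 a->instrlen - a->replacementlen);
			/* ... and write the result over the original site. */
			text_poke_early(a->instr, insnbuf, a->instrlen);
		}
	}

For this file, the effect is that CPUs advertising REP_GOOD fall through to the rep movsq variant, while everything else keeps the unrolled, prefetching copy loop.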