[ARM] 2947/1: copy template with new memcpy/memmove

Patch from Nicolas Pitre. This patch provides a new implementation for optimized memory copy functions on ARM. It is made of two levels: a template that consists of the core copy code and separate files that define macros to be used with the core code depending on the type of copy needed. This allows for best performance while sharing the same core for implementing memcpy(), copy_from_user() and copy_to_user() for instance. Two reasons for this work: 1) the current copy_to_user/copy_from_user implementation assumes no task switch will ever occur in the middle of each copied page making it completely unsafe with CONFIG_PREEMPT=y. 2) current copy implementations are measurably suboptimal and optimizing different implementations separately is a pain and more opportunities for bugs. The reason for (1) is the fact that copies inside user pages are performed with the ldm instruction which has no means of testing user protections and could possibly race with process preemption bypassing the COW mechanism for example. This is a longstanding issue that we said ought to be fixed for about two years now. The solution is to substitute those ldm insns with a series of ldrt or strt insns to enforce user memory protection. At least on StrongARM and XScale cores the ldm is not faster than the equivalent ldr/str insns with a warm i-cache so there is no measurable performance degradation with that change. The fact that the copy code is a template makes it pretty easy to reuse the same core code as for memcpy and benefit from the same performance optimizations. Now (2) is best demonstrated with actual throughput measurements. 
First, here is a summary of memcopy tests performed on a StrongARM core:

PTR alignment   buffer size   kernel version   this version
------------------------------------------------------------
aligned                  32            59.73         107.43
unaligned                32            61.31          74.72
aligned                 100           132.47         136.15
unaligned               100           103.84         123.76
aligned                4096           130.67         130.80
unaligned              4096           130.68         130.64
aligned             1048576            68.03          68.18
unaligned           1048576            68.03          68.18

The buffer size is in bytes and the measured speed in MB/s. The copy was performed repeatedly with given buffer and throughput averaged over 3 seconds. Here we can see that the current kernel version has a higher entry cost that shows up with small buffers. As buffer size grows both implementations converge to the same throughput. Now here's the exact same test performed on an XScale core (PXA255):

PTR alignment   buffer size   kernel version   this version
------------------------------------------------------------
aligned                  32            46.99          77.58
unaligned                32            53.61          59.59
aligned                 100           107.19         136.59
unaligned               100            83.61          97.58
aligned                4096           129.13         129.98
unaligned              4096           128.36         128.53
aligned             1048576            53.76          59.41
unaligned           1048576            33.67          56.96

Again we can see the entry setup cost being higher for the current kernel before getting to the main copy loop. Then throughput results converge as long as the buffer remains in the cache. Then the 1MB case shows more differences probably due to better pld placement and/or fewer instruction interlocks in this proposed implementation. Disclaimer: The PXA system was running with slower clocks than the StrongARM system so trying to infer any conclusion by comparing those separate sets of results side by side would be completely inappropriate. So... What this patch does is to replace both memcpy and memmove with an implementation based on the provided copy code template. 
The memmove code is kept separate since it is used only if the memory areas involved do overlap in which case the code is a transposition of the template but with the copy occurring in the opposite direction (trying to fit that mode into the template turned it into a mess not worth it for memmove alone). And obviously both memcpy and memmove were tested with all kinds of pointer alignments and buffer sizes to exercise all code paths for correctness. The next patch will provide the now trivial replacement implementation copy_to_user and copy_from_user. Signed-off-by: Nicolas Pitre <nico@cam.org> Signed-off-by: Russell King <rmk+kernel@arm.linux.org.uk>
2005-11-01 19:52:23 +00:00 · 2005-11-01 19:52:23 +00:00 · 7549423000
commit 7549423000
parent a0c6fdb987
4 changed files with 508 additions and 380 deletions
--- a/arch/arm/lib/Makefile
+++ b/arch/arm/lib/Makefile
@ -7,8 +7,9 @@
 lib-y		:= backtrace.o changebit.o csumipv6.o csumpartial.o   \
 		   csumpartialcopy.o csumpartialcopyuser.o clearbit.o \
 		   copy_page.o delay.o findbit.o memchr.o memcpy.o    \
-		   memset.o memzero.o setbit.o strncpy_from_user.o    \
+		   memmove.o memset.o memzero.o setbit.o              \
-		   strnlen_user.o strchr.o strrchr.o testchangebit.o  \
+		   strncpy_from_user.o strnlen_user.o                 \
 		   strchr.o strrchr.o testchangebit.o                 \
 		   testclearbit.o testsetbit.o uaccess.o              \
 		   getuser.o putuser.o clear_user.o                   \
 		   ashldi3.o ashrdi3.o lshrdi3.o muldi3.o             \
--- a/arch/arm/lib/copy_template.S
+++ b/arch/arm/lib/copy_template.S
@ -0,0 +1,255 @@
 /*
 *  linux/arch/arm/lib/copy_template.S
 *
 *  Code template for optimized memory copy functions
 *
 *  Author:	Nicolas Pitre
 *  Created:	Sep 28, 2005
 *  Copyright:	MontaVista Software, Inc.
 *
 *  This program is free software; you can redistribute it and/or modify
 *  it under the terms of the GNU General Public License version 2 as
 *  published by the Free Software Foundation.
 */
 /*
 * This can be used to enable code to cacheline align the source pointer.
 * Experiments on tested architectures (StrongARM and XScale) didn't show
 * this a worthwhile thing to do.  That might be different in the future.
 */
 //#define CALGN(code...)	code
 #define CALGN(code...)
 /*
 * Theory of operation
 * -------------------
 *
 * This file provides the core code for a forward memory copy used in
 * the implementation of memcpy(), copy_to_user() and copy_from_user().
 *
 * The including file must define the following accessor macros
 * according to the need of the given function:
 *
 * ldr1w ptr reg abort
 *
 *	This loads one word from 'ptr', stores it in 'reg' and increments
 *	'ptr' to the next word. The 'abort' argument is used for fixup tables.
 *
 * ldr4w ptr reg1 reg2 reg3 reg4 abort
 * ldr8w ptr reg1 reg2 reg3 reg4 reg5 reg6 reg7 reg8 abort
 *
 *	This loads four or eight words starting from 'ptr', stores them
 *	in provided registers and increments 'ptr' past those words.
 *	The 'abort' argument is used for fixup tables.
 *
 * ldr1b ptr reg cond abort
 *
 *	Similar to ldr1w, but it loads a byte and increments 'ptr' one byte.
 *	It also must apply the condition code if provided, otherwise the
 *	"al" condition is assumed by default.
 *
 * str1w ptr reg abort
 * str8w ptr reg1 reg2 reg3 reg4 reg5 reg6 reg7 reg8 abort
 * str1b ptr reg cond abort
 *
 *	Same as their ldr* counterparts, but data is stored to 'ptr' location
 *	rather than being loaded.
 *
 * enter reg1 reg2
 *
 *	Preserve the provided registers on the stack plus any additional
 *	data as needed by the implementation including this code. Called
 *	upon code entry.
 *
 * exit reg1 reg2
 *
 *	Restore registers with the values previously saved with the
 *	'enter' macro. Called upon code termination.
 */
 /*
  * Forward copy core.  Register usage as read by the code below:
  *   r0 = destination pointer (only ever passed to the str* macros)
  *   r1 = source pointer      (only ever passed to the ldr* macros)
  *   r2 = byte count, kept biased by a negative constant (see each stage)
  *   r3, r4, ip, lr = scratch; r5 - r9 are pushed around the word loops.
  * Numeric labels 19, 20 and 21 named in abort= arguments are not defined
  * in this block.
  */
 		enter	r4, lr
 		/* r2 = count - 4.  Fewer than 4 bytes: only the byte tail at 8 runs. */
 		subs	r2, r2, #4
 		blt	8f
 		/* Destination not word aligned: copy leading bytes at 9 first. */
 		ands	ip, r0, #3
 	PLD(	pld	[r1, #0]		)
 		bne	9f
 		/* Destination aligned but source is not: shifting copy at 10. */
 		ands	ip, r1, #3
 		bne	10f
 		/*
 		 * Both pointers word aligned.  r2 = count - 32 from here; a
 		 * negative result means fewer than 32 bytes, handled at 5.
 		 */
 1:		subs	r2, r2, #(28)
 		stmfd	sp!, {r5 - r8}
 		blt	5f
 		/*
 		 * CALGN lines are compiled out unless CALGN is redefined.  When
 		 * enabled they enter the word table at 6 to copy the 32 - (r1 & 31)
 		 * bytes that bring r1 to a 32-byte boundary, unless that would
 		 * consume everything that is left, then resume at 2.
 		 */
 	CALGN(	ands	ip, r1, #31		)
 	CALGN(	rsb	r3, ip, #32		)
 	CALGN(	sbcnes	r4, r3, r2		)  @ C is always set here
 	CALGN(	bcs	2f			)
 	CALGN(	adr	r4, 6f			)
 	CALGN(	subs	r2, r2, r3		)  @ C gets set
 	CALGN(	add	pc, r4, ip		)
 	PLD(	pld	[r1, #0]		)
 		/*
 		 * Main loop: 32 bytes per pass through 4.  With PLD enabled r2 is
 		 * biased by a further 96; passes entered at 3 issue a pld, the
 		 * final passes re-entered at 4 (cmn/bge below) do not.
 		 */
 2:	PLD(	subs	r2, r2, #96		)
 	PLD(	pld	[r1, #28]		)
 	PLD(	blt	4f			)
 	PLD(	pld	[r1, #60]		)
 	PLD(	pld	[r1, #92]		)
 3:	PLD(	pld	[r1, #124]		)
 4:		ldr8w	r1, r3, r4, r5, r6, r7, r8, ip, lr, abort=20f
 		subs	r2, r2, #32
 		str8w	r0, r3, r4, r5, r6, r7, r8, ip, lr, abort=20f
 		bge	3b
 	PLD(	cmn	r2, #96			)
 	PLD(	bge	4b			)
 		/*
 		 * r2 & 28 is the number of bytes (0 - 28) still to move as whole
 		 * words.  ip = 32 - that amount is added to pc to skip the loads
 		 * that are not needed, and added to pc again further down to skip
 		 * the same number of stores.  In ARM state pc reads as the current
 		 * instruction + 8, which is label 6 for the addne below.  The
 		 * arithmetic relies on every ldr1w/str1w occupying exactly one
 		 * 4-byte instruction slot.  Zero words left: Z is set, the addne
 		 * is skipped and control goes to 7.
 		 */
 5:		ands	ip, r2, #28
 		rsb	ip, ip, #32
 		addne	pc, pc, ip		@ C is always clear here
 		b	7f
 6:		nop
 		ldr1w	r1, r3, abort=20f
 		ldr1w	r1, r4, abort=20f
 		ldr1w	r1, r5, abort=20f
 		ldr1w	r1, r6, abort=20f
 		ldr1w	r1, r7, abort=20f
 		ldr1w	r1, r8, abort=20f
 		ldr1w	r1, lr, abort=20f
 		/* Second computed jump: the two nops pad for the pc + 8 read. */
 		add	pc, pc, ip
 		nop
 		nop
 		str1w	r0, r3, abort=20f
 		str1w	r0, r4, abort=20f
 		str1w	r0, r5, abort=20f
 		str1w	r0, r6, abort=20f
 		str1w	r0, r7, abort=20f
 		str1w	r0, r8, abort=20f
 		str1w	r0, lr, abort=20f
 	CALGN(	bcs	2b			)
 7:		ldmfd	sp!, {r5 - r8}
 		/*
 		 * Byte tail (0 - 3 bytes).  Only multiples of 4 were subtracted
 		 * from r2 on the paths leading here, so its low two bits still
 		 * match the original count.  lsl #31 leaves bit 0 as the result
 		 * (ne: copy one byte) and shifts bit 1 out into C (cs: copy two
 		 * more bytes).
 		 */
 8:		movs	r2, r2, lsl #31
 		ldr1b	r1, r3, ne, abort=21f
 		ldr1b	r1, r4, cs, abort=21f
 		ldr1b	r1, ip, cs, abort=21f
 		str1b	r0, r3, ne, abort=21f
 		str1b	r0, r4, cs, abort=21f
 		str1b	r0, ip, cs, abort=21f
 		exit	r4, pc
 		/*
 		 * Destination alignment.  ip = 4 - (r0 & 3) is the number of bytes
 		 * (1 - 3) needed to reach a word boundary: the gt copy runs when
 		 * ip == 3, the ge copy when ip >= 2, the unconditional one always.
 		 * r2 >= 0 here, i.e. at least 4 bytes remain, so these cannot
 		 * overrun.  Afterwards fewer than 4 bytes left goes to the tail at
 		 * 8, an aligned source goes back to 1, otherwise fall into 10.
 		 */
 9:		rsb	ip, ip, #4
 		cmp	ip, #2
 		ldr1b	r1, r3, gt, abort=21f
 		ldr1b	r1, r4, ge, abort=21f
 		ldr1b	r1, lr, abort=21f
 		str1b	r0, r3, gt, abort=21f
 		str1b	r0, r4, ge, abort=21f
 		subs	r2, r2, ip
 		str1b	r0, lr, abort=21f
 		blt	8b
 		ands	ip, r1, #3
 		beq	1b
 		/*
 		 * Source misaligned relative to the (now aligned) destination;
 		 * ip = r1 & 3.  r1 is rounded down to a word boundary and the
 		 * first word is preloaded into lr.  ip == 1 falls through into the
 		 * first macro expansion, ip == 2 goes to 17, ip == 3 goes to 18.
 		 */
 10:		bic	r1, r1, #3
 		cmp	ip, #2
 		ldr1w	r1, lr, abort=21f
 		beq	17f
 		bgt	18f
 		/*
 		 * forward_copy_shift pull push
 		 *
 		 * Each destination word is built from two adjacent source words,
 		 * combined with the 'pull' and 'push' shift operators using the
 		 * given bit counts (pull + push = 32 in all three expansions).
 		 * Those operators are not defined in this file -- presumably
 		 * endian-dependent shift aliases from <asm/assembler.h>; confirm
 		 * there.  lr always holds the most recently loaded source word.
 		 * r2 = count - 32 inside the macro, as in the aligned path.
 		 */
 		.macro	forward_copy_shift pull push
 		subs	r2, r2, #28
 		blt	14f
 	CALGN(	ands	ip, r1, #31		)
 	CALGN(	rsb	ip, ip, #32		)
 	CALGN(	sbcnes	r4, ip, r2		)  @ C is always set here
 	CALGN(	subcc	r2, r2, ip		)
 	CALGN(	bcc	15f			)
 11:		stmfd	sp!, {r5 - r9}
 	PLD(	pld	[r1, #0]		)
 	PLD(	subs	r2, r2, #96		)
 	PLD(	pld	[r1, #28]		)
 	PLD(	blt	13f			)
 	PLD(	pld	[r1, #60]		)
 	PLD(	pld	[r1, #92]		)
 12:	PLD(	pld	[r1, #124]		)
 		/* 32 bytes per pass: load 8 words, merge with the previous one, store 8. */
 13:		ldr4w	r1, r4, r5, r6, r7, abort=19f
 		mov	r3, lr, pull #\pull
 		subs	r2, r2, #32
 		ldr4w	r1, r8, r9, ip, lr, abort=19f
 		orr	r3, r3, r4, push #\push
 		mov	r4, r4, pull #\pull
 		orr	r4, r4, r5, push #\push
 		mov	r5, r5, pull #\pull
 		orr	r5, r5, r6, push #\push
 		mov	r6, r6, pull #\pull
 		orr	r6, r6, r7, push #\push
 		mov	r7, r7, pull #\pull
 		orr	r7, r7, r8, push #\push
 		mov	r8, r8, pull #\pull
 		orr	r8, r8, r9, push #\push
 		mov	r9, r9, pull #\pull
 		orr	r9, r9, ip, push #\push
 		mov	ip, ip, pull #\pull
 		orr	ip, ip, lr, push #\push
 		str8w	r0, r3, r4, r5, r6, r7, r8, r9, ip, , abort=19f
 		bge	12b
 	PLD(	cmn	r2, #96			)
 	PLD(	bge	13b			)
 		ldmfd	sp!, {r5 - r9}
 		/* Remaining whole words (r2 & 28 bytes), one word per pass of 15. */
 14:		ands	ip, r2, #28
 		beq	16f
 15:		mov	r3, lr, pull #\pull
 		ldr1w	r1, lr, abort=21f
 		subs	ip, ip, #4
 		orr	r3, r3, lr, push #\push
 		str1w	r0, r3, abort=21f
 		bgt	15b
 	CALGN(	cmp	r2, #0			)
 	CALGN(	bge	11b			)
 		/*
 		 * lr still holds push/8 source bytes that were loaded but not
 		 * stored.  Rewind r1 by that many so the byte tail at 8 reads them
 		 * again from memory.
 		 */
 16:		sub	r1, r1, #(\push / 8)
 		b	8b
 		.endm
 		/* One expansion per source offset: ip == 1, ip == 2 (17), ip == 3 (18). */
 		forward_copy_shift	pull=8	push=24
 17:		forward_copy_shift	pull=16	push=16
 18:		forward_copy_shift	pull=24	push=8
 /*
 * Abort preamble and completion macros.
 * If a fixup handler is required then those macros must surround it.
 * It is assumed that the fixup code will handle the private part of
 * the exit macro.
 */
 	/*
 	 * copy_abort_preamble defines the labels named by the abort= arguments
 	 * in the template and unwinds the extra registers pushed at that point:
 	 *   19: used inside the shifting loop, where r5 - r9 are on the stack
 	 *   20: used inside the aligned word loop, where r5 - r8 are on the stack
 	 *   21: used where nothing beyond the 'enter' frame has been pushed
 	 * All three paths end at 21, where the fixup code is expected to follow.
 	 */
 	.macro	copy_abort_preamble
 19:	ldmfd	sp!, {r5 - r9}
 	b	21f
 20:	ldmfd	sp!, {r5 - r8}
 21:
 	.endm
 	/*
 	 * copy_abort_end pops r4 and loads the saved return address into pc,
 	 * matching the r4, lr pair passed to 'enter' by the template.  Anything
 	 * else pushed by the including file's 'enter' macro must already have
 	 * been removed by the fixup code.
 	 */
 	.macro	copy_abort_end
 	ldmfd	sp!, {r4, pc}
 	.endm
--- a/arch/arm/lib/memcpy.S
+++ b/arch/arm/lib/memcpy.S
@ -1,393 +1,59 @@
 /*
 *  linux/arch/arm/lib/memcpy.S
 *
- *  Copyright (C) 1995-1999 Russell King
+ *  Author:	Nicolas Pitre
 *  Created:	Sep 28, 2005
 *  Copyright:	MontaVista Software, Inc.
 *
 *  This program is free software; you can redistribute it and/or modify
 *  it under the terms of the GNU General Public License version 2 as
 *  published by the Free Software Foundation.
 *
 *  ASM optimised string functions
 */
 #include <linux/linkage.h>
 #include <asm/assembler.h>
 	/*
 	 * Accessor macros required by copy_template.S, written here as plain
 	 * post-indexed loads and stores.  None of them references its 'abort'
 	 * argument, so no fixup information is generated.  The single-word and
 	 * single-byte forms each expand to exactly one instruction.
 	 */

 	/* Load one word from [ptr] into reg, then ptr += 4. */
 	.macro ldr1w ptr reg abort
 	ldr \reg, [\ptr], #4
 	.endm
 	/* Load four consecutive words from [ptr], then ptr += 16. */
 	.macro ldr4w ptr reg1 reg2 reg3 reg4 abort
 	ldmia \ptr!, {\reg1, \reg2, \reg3, \reg4}
 	.endm
 	/* Load eight consecutive words from [ptr], then ptr += 32. */
 	.macro ldr8w ptr reg1 reg2 reg3 reg4 reg5 reg6 reg7 reg8 abort
 	ldmia \ptr!, {\reg1, \reg2, \reg3, \reg4, \reg5, \reg6, \reg7, \reg8}
 	.endm
 	/* Load one byte under condition 'cond' (default al), then ptr += 1. */
 	.macro ldr1b ptr reg cond=al abort
 	ldr\cond\()b \reg, [\ptr], #1
 	.endm
 	/* Store reg to [ptr], then ptr += 4. */
 	.macro str1w ptr reg abort
 	str \reg, [\ptr], #4
 	.endm
 	/* Store eight registers to consecutive words at [ptr], then ptr += 32. */
 	.macro str8w ptr reg1 reg2 reg3 reg4 reg5 reg6 reg7 reg8 abort
 	stmia \ptr!, {\reg1, \reg2, \reg3, \reg4, \reg5, \reg6, \reg7, \reg8}
 	.endm
 	/* Store one byte under condition 'cond' (default al), then ptr += 1. */
 	.macro str1b ptr reg cond=al abort
 	str\cond\()b \reg, [\ptr], #1
 	.endm
 	/*
 	 * Push r0 together with the two registers requested by the template.
 	 * r0 is saved so that 'exit' can restore its entry value.
 	 */
 	.macro enter reg1 reg2
 	stmdb sp!, {r0, \reg1, \reg2}
 	.endm
 	/*
 	 * Pop what 'enter' pushed.  r0 gets back the value it had on entry;
 	 * the template passes pc as reg2, so this also returns to the caller.
 	 */
 	.macro exit reg1 reg2
 	ldmfd sp!, {r0, \reg1, \reg2}
 	.endm
 	.text
-#define ENTER	\
+/* Prototype: void *memcpy(void *dest, const void *src, size_t n); */
 		mov	ip,sp	;\
 		stmfd	sp!,{r0,r4-r9,fp,ip,lr,pc}	;\
 		sub	fp,ip,#4
 #define EXIT	\
 		LOADREGS(ea, fp, {r0, r4 - r9, fp, sp, pc})
 #define EXITEQ	\
 		LOADREGS(eqea, fp, {r0, r4 - r9, fp, sp, pc})
 /*
 * Prototype: void memcpy(void *to,const void *from,unsigned long n);
 */
 ENTRY(memcpy)
 ENTRY(memmove)
 		ENTER
 		cmp	r1, r0
 		bcc	23f
 		subs	r2, r2, #4
 		blt	6f
 	PLD(	pld	[r1, #0]		)
 		ands	ip, r0, #3
 		bne	7f
 		ands	ip, r1, #3
 		bne	8f
-1:		subs	r2, r2, #8
+#include "copy_template.S"
 		blt	5f
 		subs	r2, r2, #20
 		blt	4f
 	PLD(	pld	[r1, #28]		)
 	PLD(	subs	r2, r2, #64		)
 	PLD(	blt	3f			)
 2:	PLD(	pld	[r1, #60]		)
 	PLD(	pld	[r1, #92]		)
 		ldmia	r1!, {r3 - r9, ip}
 		subs	r2, r2, #32
 		stmgeia	r0!, {r3 - r9, ip}
 		ldmgeia	r1!, {r3 - r9, ip}
 		subges	r2, r2, #32
 		stmia	r0!, {r3 - r9, ip}
 		bge	2b
 3:	PLD(	ldmia	r1!, {r3 - r9, ip}	)
 	PLD(	adds	r2, r2, #32		)
 	PLD(	stmgeia	r0!, {r3 - r9, ip}	)
 	PLD(	ldmgeia	r1!, {r3 - r9, ip}	)
 	PLD(	subges	r2, r2, #32		)
 	PLD(	stmia	r0!, {r3 - r9, ip}	)
 4:		cmn	r2, #16
 		ldmgeia	r1!, {r3 - r6}
 		subge	r2, r2, #16
 		stmgeia	r0!, {r3 - r6}
 		adds	r2, r2, #20
 		ldmgeia	r1!, {r3 - r5}
 		subge	r2, r2, #12
 		stmgeia	r0!, {r3 - r5}
 5:		adds	r2, r2, #8
 		blt	6f
 		subs	r2, r2, #4
 		ldrlt	r3, [r1], #4
 		ldmgeia	r1!, {r4, r5}
 		subge	r2, r2, #4
 		strlt	r3, [r0], #4
 		stmgeia	r0!, {r4, r5}
 6:		adds	r2, r2, #4
 		EXITEQ
 		cmp	r2, #2
 		ldrb	r3, [r1], #1
 		ldrgeb	r4, [r1], #1
 		ldrgtb	r5, [r1], #1
 		strb	r3, [r0], #1
 		strgeb	r4, [r0], #1
 		strgtb	r5, [r0], #1
 		EXIT
 7:		rsb	ip, ip, #4
 		cmp	ip, #2
 		ldrb	r3, [r1], #1
 		ldrgeb	r4, [r1], #1
 		ldrgtb	r5, [r1], #1
 		strb	r3, [r0], #1
 		strgeb	r4, [r0], #1
 		strgtb	r5, [r0], #1
 		subs	r2, r2, ip
 		blt	6b
 		ands	ip, r1, #3
 		beq	1b
 8:		bic	r1, r1, #3
 		ldr	r7, [r1], #4
 		cmp	ip, #2
 		bgt	18f
 		beq	13f
 		cmp	r2, #12
 		blt	11f
 	PLD(	pld	[r1, #12]		)
 		sub	r2, r2, #12
 	PLD(	subs	r2, r2, #32		)
 	PLD(	blt	10f			)
 	PLD(	pld	[r1, #28]		)
 9:	PLD(	pld	[r1, #44]		)
 10:		mov	r3, r7, pull #8
 		ldmia	r1!, {r4 - r7}
 		subs	r2, r2, #16
 		orr	r3, r3, r4, push #24
 		mov	r4, r4, pull #8
 		orr	r4, r4, r5, push #24
 		mov	r5, r5, pull #8
 		orr	r5, r5, r6, push #24
 		mov	r6, r6, pull #8
 		orr	r6, r6, r7, push #24
 		stmia	r0!, {r3 - r6}
 		bge	9b
 	PLD(	cmn	r2, #32			)
 	PLD(	bge	10b			)
 	PLD(	add	r2, r2, #32		)
 		adds	r2, r2, #12
 		blt	12f
 11:		mov	r3, r7, pull #8
 		ldr	r7, [r1], #4
 		subs	r2, r2, #4
 		orr	r3, r3, r7, push #24
 		str	r3, [r0], #4
 		bge	11b
 12:		sub	r1, r1, #3
 		b	6b
 13:		cmp	r2, #12
 		blt	16f
 	PLD(	pld	[r1, #12]		)
 		sub	r2, r2, #12
 	PLD(	subs	r2, r2, #32		)
 	PLD(	blt	15f			)
 	PLD(	pld	[r1, #28]		)
 14:	PLD(	pld	[r1, #44]		)
 15:		mov	r3, r7, pull #16
 		ldmia	r1!, {r4 - r7}
 		subs	r2, r2, #16
 		orr	r3, r3, r4, push #16
 		mov	r4, r4, pull #16
 		orr	r4, r4, r5, push #16
 		mov	r5, r5, pull #16
 		orr	r5, r5, r6, push #16
 		mov	r6, r6, pull #16
 		orr	r6, r6, r7, push #16
 		stmia	r0!, {r3 - r6}
 		bge	14b
 	PLD(	cmn	r2, #32			)
 	PLD(	bge	15b			)
 	PLD(	add	r2, r2, #32		)
 		adds	r2, r2, #12
 		blt	17f
 16:		mov	r3, r7, pull #16
 		ldr	r7, [r1], #4
 		subs	r2, r2, #4
 		orr	r3, r3, r7, push #16
 		str	r3, [r0], #4
 		bge	16b
 17:		sub	r1, r1, #2
 		b	6b
 18:		cmp	r2, #12
 		blt	21f
 	PLD(	pld	[r1, #12]		)
 		sub	r2, r2, #12
 	PLD(	subs	r2, r2, #32		)
 	PLD(	blt	20f			)
 	PLD(	pld	[r1, #28]		)
 19:	PLD(	pld	[r1, #44]		)
 20:		mov	r3, r7, pull #24
 		ldmia	r1!, {r4 - r7}
 		subs	r2, r2, #16
 		orr	r3, r3, r4, push #8
 		mov	r4, r4, pull #24
 		orr	r4, r4, r5, push #8
 		mov	r5, r5, pull #24
 		orr	r5, r5, r6, push #8
 		mov	r6, r6, pull #24
 		orr	r6, r6, r7, push #8
 		stmia	r0!, {r3 - r6}
 		bge	19b
 	PLD(	cmn	r2, #32			)
 	PLD(	bge	20b			)
 	PLD(	add	r2, r2, #32		)
 		adds	r2, r2, #12
 		blt	22f
 21:		mov	r3, r7, pull #24
 		ldr	r7, [r1], #4
 		subs	r2, r2, #4
 		orr	r3, r3, r7, push #8
 		str	r3, [r0], #4
 		bge	21b
 22:		sub	r1, r1, #1
 		b	6b
 23:		add	r1, r1, r2
 		add	r0, r0, r2
 		subs	r2, r2, #4
 		blt	29f
 	PLD(	pld	[r1, #-4]		)
 		ands	ip, r0, #3
 		bne	30f
 		ands	ip, r1, #3
 		bne	31f
 24:		subs	r2, r2, #8
 		blt	28f
 		subs	r2, r2, #20
 		blt	27f
 	PLD(	pld	[r1, #-32]		)
 	PLD(	subs	r2, r2, #64		)
 	PLD(	blt	26f			)
 25:	PLD(	pld	[r1, #-64]		)
 	PLD(	pld	[r1, #-96]		)
 		ldmdb	r1!, {r3 - r9, ip}
 		subs	r2, r2, #32
 		stmgedb	r0!, {r3 - r9, ip}
 		ldmgedb	r1!, {r3 - r9, ip}
 		subges	r2, r2, #32
 		stmdb	r0!, {r3 - r9, ip}
 		bge	25b
 26:	PLD(	ldmdb	r1!, {r3 - r9, ip}	)
 	PLD(	adds	r2, r2, #32		)
 	PLD(	stmgedb	r0!, {r3 - r9, ip}	)
 	PLD(	ldmgedb	r1!, {r3 - r9, ip}	)
 	PLD(	subges	r2, r2, #32		)
 	PLD(	stmdb	r0!, {r3 - r9, ip}	)
 27:		cmn	r2, #16
 		ldmgedb	r1!, {r3 - r6}
 		subge	r2, r2, #16
 		stmgedb	r0!, {r3 - r6}
 		adds	r2, r2, #20
 		ldmgedb	r1!, {r3 - r5}
 		subge	r2, r2, #12
 		stmgedb	r0!, {r3 - r5}
 28:		adds	r2, r2, #8
 		blt	29f
 		subs	r2, r2, #4
 		ldrlt	r3, [r1, #-4]!
 		ldmgedb	r1!, {r4, r5}
 		subge	r2, r2, #4
 		strlt	r3, [r0, #-4]!
 		stmgedb	r0!, {r4, r5}
 29:		adds	r2, r2, #4
 		EXITEQ
 		cmp	r2, #2
 		ldrb	r3, [r1, #-1]!
 		ldrgeb	r4, [r1, #-1]!
 		ldrgtb	r5, [r1, #-1]!
 		strb	r3, [r0, #-1]!
 		strgeb	r4, [r0, #-1]!
 		strgtb	r5, [r0, #-1]!
 		EXIT
 30:		cmp	ip, #2
 		ldrb	r3, [r1, #-1]!
 		ldrgeb	r4, [r1, #-1]!
 		ldrgtb	r5, [r1, #-1]!
 		strb	r3, [r0, #-1]!
 		strgeb	r4, [r0, #-1]!
 		strgtb	r5, [r0, #-1]!
 		subs	r2, r2, ip
 		blt	29b
 		ands	ip, r1, #3
 		beq	24b
 31:		bic	r1, r1, #3
 		ldr	r3, [r1], #0
 		cmp	ip, #2
 		blt	41f
 		beq	36f
 		cmp	r2, #12
 		blt	34f
 	PLD(	pld	[r1, #-16]		)
 		sub	r2, r2, #12
 	PLD(	subs	r2, r2, #32		)
 	PLD(	blt	33f			)
 	PLD(	pld	[r1, #-32]		)
 32:	PLD(	pld	[r1, #-48]		)
 33:		mov	r7, r3, push #8
 		ldmdb	r1!, {r3, r4, r5, r6}
 		subs	r2, r2, #16
 		orr	r7, r7, r6, pull #24
 		mov	r6, r6, push #8
 		orr	r6, r6, r5, pull #24
 		mov	r5, r5, push #8
 		orr	r5, r5, r4, pull #24
 		mov	r4, r4, push #8
 		orr	r4, r4, r3, pull #24
 		stmdb	r0!, {r4, r5, r6, r7}
 		bge	32b
 	PLD(	cmn	r2, #32			)
 	PLD(	bge	33b			)
 	PLD(	add	r2, r2, #32		)
 		adds	r2, r2, #12
 		blt	35f
 34:		mov	ip, r3, push #8
 		ldr	r3, [r1, #-4]!
 		subs	r2, r2, #4
 		orr	ip, ip, r3, pull #24
 		str	ip, [r0, #-4]!
 		bge	34b
 35:		add	r1, r1, #3
 		b	29b
 36:		cmp	r2, #12
 		blt	39f
 	PLD(	pld	[r1, #-16]		)
 		sub	r2, r2, #12
 	PLD(	subs	r2, r2, #32		)
 	PLD(	blt	38f			)
 	PLD(	pld	[r1, #-32]		)
 37:	PLD(	pld	[r1, #-48]		)
 38:		mov	r7, r3, push #16
 		ldmdb	r1!, {r3, r4, r5, r6}
 		subs	r2, r2, #16
 		orr	r7, r7, r6, pull #16
 		mov	r6, r6, push #16
 		orr	r6, r6, r5, pull #16
 		mov	r5, r5, push #16
 		orr	r5, r5, r4, pull #16
 		mov	r4, r4, push #16
 		orr	r4, r4, r3, pull #16
 		stmdb	r0!, {r4, r5, r6, r7}
 		bge	37b
 	PLD(	cmn	r2, #32			)
 	PLD(	bge	38b			)
 	PLD(	add	r2, r2, #32		)
 		adds	r2, r2, #12
 		blt	40f
 39:		mov	ip, r3, push #16
 		ldr	r3, [r1, #-4]!
 		subs	r2, r2, #4
 		orr	ip, ip, r3, pull #16
 		str	ip, [r0, #-4]!
 		bge	39b
 40:		add	r1, r1, #2
 		b	29b
 41:		cmp	r2, #12
 		blt	44f
 	PLD(	pld	[r1, #-16]		)
 		sub	r2, r2, #12
 	PLD(	subs	r2, r2, #32		)
 	PLD(	blt	43f			)
 	PLD(	pld	[r1, #-32]		)
 42:	PLD(	pld	[r1, #-48]		)
 43:		mov	r7, r3, push #24
 		ldmdb	r1!, {r3, r4, r5, r6}
 		subs	r2, r2, #16
 		orr	r7, r7, r6, pull #8
 		mov	r6, r6, push #24
 		orr	r6, r6, r5, pull #8
 		mov	r5, r5, push #24
 		orr	r5, r5, r4, pull #8
 		mov	r4, r4, push #24
 		orr	r4, r4, r3, pull #8
 		stmdb	r0!, {r4, r5, r6, r7}
 		bge	42b
 	PLD(	cmn	r2, #32			)
 	PLD(	bge	43b			)
 	PLD(	add	r2, r2, #32		)
 		adds	r2, r2, #12
 		blt	45f
 44:		mov	ip, r3, push #24
 		ldr	r3, [r1, #-4]!
 		subs	r2, r2, #4
 		orr	ip, ip, r3, pull #8
 		str	ip, [r0, #-4]!
 		bge	44b
 45:		add	r1, r1, #1
 		b	29b
--- a/arch/arm/lib/memmove.S
+++ b/arch/arm/lib/memmove.S
@ -0,0 +1,206 @@
 /*
 *  linux/arch/arm/lib/memmove.S
 *
 *  Author:	Nicolas Pitre
 *  Created:	Sep 28, 2005
 *  Copyright:	(C) MontaVista Software Inc.
 *
 *  This program is free software; you can redistribute it and/or modify
 *  it under the terms of the GNU General Public License version 2 as
 *  published by the Free Software Foundation.
 */
 #include <linux/linkage.h>
 #include <asm/assembler.h>
 /*
 * This can be used to enable code to cacheline align the source pointer.
 * Experiments on tested architectures (StrongARM and XScale) didn't show
 * this a worthwhile thing to do.  That might be different in the future.
 */
 //#define CALGN(code...)        code
 #define CALGN(code...)
 		.text
 /*
  * Prototype: void *memmove(void *dest, const void *src, size_t n);
  *
  * Note:
  *
  * If dest is at or below src, or if the regions do not overlap at all
  * (n <= dest - src), a forward copy is safe and we simply branch to memcpy
  * which is normally a bit faster. Otherwise the copy is done going
  * downwards.  This is a transposition of the code from copy_template.S
  * but with the copy occurring in the opposite direction.
  *
  * Register usage: r0 = dest, r1 = src, r2 = n on entry.  On the downward
  * path both pointers are first advanced to one past the end of their
  * buffers and r2 is kept biased by a negative constant as noted below.
  * The entry value of r0 is saved on the stack and restored on return.
  */
 ENTRY(memmove)
 		/* ip = dest - src; ls after the pair below means forward copy is safe. */
 		subs	ip, r0, r1
 		cmphi	r2, ip
 		bls	memcpy
 		stmfd	sp!, {r0, r4, lr}
 		/* Point both registers one past the last byte and copy downwards. */
 		add	r1, r1, r2
 		add	r0, r0, r2
 		/* r2 = n - 4.  Fewer than 4 bytes: only the byte tail at 8 runs. */
 		subs	r2, r2, #4
 		blt	8f
 		ands	ip, r0, #3
 	PLD(	pld	[r1, #-4]		)
 		bne	9f
 		ands	ip, r1, #3
 		bne	10f
 		/* Both pointers word aligned.  r2 = remaining - 32 from here. */
 1:		subs	r2, r2, #(28)
 		stmfd	sp!, {r5 - r8}
 		blt	5f
 		/*
 		 * CALGN lines are compiled out unless CALGN is redefined.  Going
 		 * downwards, r1 & 31 is itself the number of bytes to copy to bring
 		 * r1 to a 32-byte boundary.  The table at 6 is entered at offset
 		 * 32 - bytes, and the same offset is reused by the second computed
 		 * jump, so ip must be converted before jumping.  rsb leaves the
 		 * flags alone, so C is still set for the bcs after the stores.
 		 */
 	CALGN(	ands	ip, r1, #31		)
 	CALGN(	sbcnes	r4, ip, r2		)  @ C is always set here
 	CALGN(	bcs	2f			)
 	CALGN(	adr	r4, 6f			)
 	CALGN(	subs	r2, r2, ip		)  @ C is set here
 	CALGN(	rsb	ip, ip, #32		)
 	CALGN(	add	pc, r4, ip		)
 	PLD(	pld	[r1, #-4]		)
 		/* Main loop: 32 bytes per pass through 4. */
 2:	PLD(	subs	r2, r2, #96		)
 	PLD(	pld	[r1, #-32]		)
 	PLD(	blt	4f			)
 	PLD(	pld	[r1, #-64]		)
 	PLD(	pld	[r1, #-96]		)
 3:	PLD(	pld	[r1, #-128]		)
 4:		ldmdb	r1!, {r3, r4, r5, r6, r7, r8, ip, lr}
 		subs	r2, r2, #32
 		stmdb	r0!, {r3, r4, r5, r6, r7, r8, ip, lr}
 		bge	3b
 	PLD(	cmn	r2, #96			)
 	PLD(	bge	4b			)
 		/*
 		 * r2 & 28 bytes (0 - 28) remain as whole words.  ip = 32 - that
 		 * amount is added to pc to skip the unneeded loads, then again to
 		 * skip the same number of stores.  pc reads as the current
 		 * instruction + 8, which is label 6 for the addne below.
 		 */
 5:		ands	ip, r2, #28
 		rsb	ip, ip, #32
 		addne	pc, pc, ip		@ C is always clear here
 		b	7f
 6:		nop
 		ldr	r3, [r1, #-4]!
 		ldr	r4, [r1, #-4]!
 		ldr	r5, [r1, #-4]!
 		ldr	r6, [r1, #-4]!
 		ldr	r7, [r1, #-4]!
 		ldr	r8, [r1, #-4]!
 		ldr	lr, [r1, #-4]!
 		add	pc, pc, ip
 		nop
 		nop
 		str	r3, [r0, #-4]!
 		str	r4, [r0, #-4]!
 		str	r5, [r0, #-4]!
 		str	r6, [r0, #-4]!
 		str	r7, [r0, #-4]!
 		str	r8, [r0, #-4]!
 		str	lr, [r0, #-4]!
 	CALGN(	bcs	2b			)
 7:		ldmfd	sp!, {r5 - r8}
 		/*
 		 * Byte tail (0 - 3 bytes): lsl #31 leaves bit 0 of r2 as the
 		 * result (ne: one byte) and shifts bit 1 into C (cs: two bytes).
 		 * The last load and store need no writeback.
 		 */
 8:		movs	r2, r2, lsl #31
 		ldrneb	r3, [r1, #-1]!
 		ldrcsb	r4, [r1, #-1]!
 		ldrcsb	ip, [r1, #-1]
 		strneb	r3, [r0, #-1]!
 		strcsb	r4, [r0, #-1]!
 		strcsb	ip, [r0, #-1]
 		ldmfd	sp!, {r0, r4, pc}
 		/*
 		 * Destination alignment: going downwards ip = r0 & 3 is the number
 		 * of bytes (1 - 3) to copy to reach a word boundary.
 		 */
 9:		cmp	ip, #2
 		ldrgtb	r3, [r1, #-1]!
 		ldrgeb	r4, [r1, #-1]!
 		ldrb	lr, [r1, #-1]!
 		strgtb	r3, [r0, #-1]!
 		strgeb	r4, [r0, #-1]!
 		subs	r2, r2, ip
 		strb	lr, [r0, #-1]!
 		blt	8b
 		ands	ip, r1, #3
 		beq	1b
 		/*
 		 * Source misaligned relative to the destination; ip = r1 & 3.
 		 * r1 is rounded down and the word holding the topmost source bytes
 		 * is preloaded into r3.  ip == 3 falls through into the first
 		 * expansion, ip == 2 goes to 17, ip == 1 goes to 18.
 		 */
 10:		bic	r1, r1, #3
 		cmp	ip, #2
 		ldr	r3, [r1, #0]
 		beq	17f
 		blt	18f
 		/*
 		 * backward_copy_shift push pull
 		 *
 		 * Downward counterpart of forward_copy_shift in copy_template.S:
 		 * every destination word is merged from two adjacent source words
 		 * with the 'push' and 'pull' shift operators (not defined in this
 		 * file -- presumably from <asm/assembler.h>; confirm there).
 		 * r3 always holds the most recently loaded source word.
 		 */
 		.macro	backward_copy_shift push pull
 		subs	r2, r2, #28
 		blt	14f
 		/*
 		 * Cache line alignment (compiled out by default).  r1 moves down
 		 * here, so r1 & 31 is already the byte count that loop 15 must
 		 * copy to reach a 32-byte boundary.
 		 */
 	CALGN(	ands	ip, r1, #31		)
 	CALGN(	sbcnes	r4, ip, r2		)  @ C is always set here
 	CALGN(	subcc	r2, r2, ip		)
 	CALGN(	bcc	15f			)
 11:		stmfd	sp!, {r5 - r9}
 	PLD(	pld	[r1, #-4]		)
 	PLD(	subs	r2, r2, #96		)
 	PLD(	pld	[r1, #-32]		)
 	PLD(	blt	13f			)
 	PLD(	pld	[r1, #-64]		)
 	PLD(	pld	[r1, #-96]		)
 12:	PLD(	pld	[r1, #-128]		)
 		/* 32 bytes per pass: load 8 words, merge with the previous one, store 8. */
 13:		ldmdb   r1!, {r7, r8, r9, ip}
 		mov     lr, r3, push #\push
 		subs    r2, r2, #32
 		ldmdb   r1!, {r3, r4, r5, r6}
 		orr     lr, lr, ip, pull #\pull
 		mov     ip, ip, push #\push
 		orr     ip, ip, r9, pull #\pull
 		mov     r9, r9, push #\push
 		orr     r9, r9, r8, pull #\pull
 		mov     r8, r8, push #\push
 		orr     r8, r8, r7, pull #\pull
 		mov     r7, r7, push #\push
 		orr     r7, r7, r6, pull #\pull
 		mov     r6, r6, push #\push
 		orr     r6, r6, r5, pull #\pull
 		mov     r5, r5, push #\push
 		orr     r5, r5, r4, pull #\pull
 		mov     r4, r4, push #\push
 		orr     r4, r4, r3, pull #\pull
 		stmdb   r0!, {r4 - r9, ip, lr}
 		bge	12b
 	PLD(	cmn	r2, #96			)
 	PLD(	bge	13b			)
 		ldmfd	sp!, {r5 - r9}
 		/* Remaining whole words (r2 & 28 bytes), one word per pass of 15. */
 14:		ands	ip, r2, #28
 		beq	16f
 15:		mov     lr, r3, push #\push
 		ldr	r3, [r1, #-4]!
 		subs	ip, ip, #4
 		orr	lr, lr, r3, pull #\pull
 		str	lr, [r0, #-4]!
 		bgt	15b
 	CALGN(	cmp	r2, #0			)
 	CALGN(	bge	11b			)
 		/*
 		 * r3 still holds pull/8 source bytes that were loaded but not
 		 * stored.  Move r1 back up by that many so the byte tail at 8
 		 * reads them again from memory.
 		 */
 16:		add	r1, r1, #(\pull / 8)
 		b	8b
 		.endm
 		/* One expansion per source offset: ip == 3, ip == 2 (17), ip == 1 (18). */
 		backward_copy_shift	push=8	pull=24
 17:		backward_copy_shift	push=16	pull=16
 18:		backward_copy_shift	push=24	pull=8