From 5d0b7235d83eefdafda300656e97d368afcafc9a Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Tue, 12 Jan 2010 17:57:35 -0800 Subject: [PATCH 1/5] x86: clean up rwsem type system The fast version of the rwsems (the code that uses xadd) has traditionally only worked on x86-32, and as a result it mixes different kinds of types wildly - they just all happen to be 32-bit. We have "long", we have "__s32", and we have "int". To make it work on x86-64, the types suddenly matter a lot more. It can be either a 32-bit or 64-bit signed type, and both work (with the caveat that a 32-bit counter will only have 15 bits of effective write counters, so it's limited to 32767 users). But whatever type you choose, it needs to be used consistently. This makes a new 'rwsem_counter_t', that is a 32-bit signed type. For a 64-bit type, you'd need to also update the BIAS values. Signed-off-by: Linus Torvalds LKML-Reference: Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/rwsem.h | 25 +++++++++++++++---------- 1 file changed, 15 insertions(+), 10 deletions(-) diff --git a/arch/x86/include/asm/rwsem.h b/arch/x86/include/asm/rwsem.h index 413620024768..5f9af3081d66 100644 --- a/arch/x86/include/asm/rwsem.h +++ b/arch/x86/include/asm/rwsem.h @@ -55,6 +55,9 @@ extern asmregparm struct rw_semaphore * /* * the semaphore definition + * + * The bias values and the counter type needs to be extended to 64 bits + * if we want to have more than 32767 potential readers/writers */ #define RWSEM_UNLOCKED_VALUE 0x00000000 @@ -64,8 +67,10 @@ extern asmregparm struct rw_semaphore * #define RWSEM_ACTIVE_READ_BIAS RWSEM_ACTIVE_BIAS #define RWSEM_ACTIVE_WRITE_BIAS (RWSEM_WAITING_BIAS + RWSEM_ACTIVE_BIAS) +typedef signed int rwsem_count_t; + struct rw_semaphore { - signed long count; + rwsem_count_t count; spinlock_t wait_lock; struct list_head wait_list; #ifdef CONFIG_DEBUG_LOCK_ALLOC @@ -121,7 +126,7 @@ static inline void __down_read(struct rw_semaphore *sem) */ static inline int __down_read_trylock(struct rw_semaphore *sem) { - __s32 result, tmp; + rwsem_count_t result, tmp; asm volatile("# beginning __down_read_trylock\n\t" " mov %0,%1\n\t" "1:\n\t" @@ -143,7 +148,7 @@ static inline int __down_read_trylock(struct rw_semaphore *sem) */ static inline void __down_write_nested(struct rw_semaphore *sem, int subclass) { - int tmp; + rwsem_count_t tmp; tmp = RWSEM_ACTIVE_WRITE_BIAS; asm volatile("# beginning down_write\n\t" @@ -170,9 +175,9 @@ static inline void __down_write(struct rw_semaphore *sem) */ static inline int __down_write_trylock(struct rw_semaphore *sem) { - signed long ret = cmpxchg(&sem->count, - RWSEM_UNLOCKED_VALUE, - RWSEM_ACTIVE_WRITE_BIAS); + rwsem_count_t ret = cmpxchg(&sem->count, + RWSEM_UNLOCKED_VALUE, + RWSEM_ACTIVE_WRITE_BIAS); if (ret == RWSEM_UNLOCKED_VALUE) return 1; return 0; @@ -183,7 +188,7 @@ static inline int __down_write_trylock(struct rw_semaphore *sem) */ static inline void __up_read(struct rw_semaphore *sem) { - __s32 tmp = -RWSEM_ACTIVE_READ_BIAS; + rwsem_count_t tmp = -RWSEM_ACTIVE_READ_BIAS; asm volatile("# beginning __up_read\n\t" LOCK_PREFIX " xadd %1,(%2)\n\t" /* subtracts 1, returns the old value */ @@ -201,7 +206,7 @@ static inline void __up_read(struct rw_semaphore *sem) */ static inline void __up_write(struct rw_semaphore *sem) { - unsigned long tmp; + rwsem_count_t tmp; asm volatile("# beginning __up_write\n\t" LOCK_PREFIX " xadd %1,(%2)\n\t" /* tries to transition @@ -245,9 +250,9 @@ static inline void rwsem_atomic_add(int delta, struct rw_semaphore *sem) /* * implement exchange and add functionality */ -static inline int rwsem_atomic_update(int delta, struct rw_semaphore *sem) +static inline rwsem_count_t rwsem_atomic_update(int delta, struct rw_semaphore *sem) { - int tmp = delta; + rwsem_count_t tmp = delta; asm volatile(LOCK_PREFIX "xadd %0,%1" : "+r" (tmp), "+m" (sem->count) From bafaecd11df15ad5b1e598adc7736afcd38ee13d Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Tue, 12 Jan 2010 18:16:42 -0800 Subject: [PATCH 2/5] x86-64: support native xadd rwsem implementation This one is much faster than the spinlock based fallback rwsem code, with certain artifical benchmarks having shown 300%+ improvement on threaded page faults etc. Again, note the 32767-thread limit here. So this really does need that whole "make rwsem_count_t be 64-bit and fix the BIAS values to match" extension on top of it, but that is conceptually a totally independent issue. NOT TESTED! The original patch that this all was based on were tested by KAMEZAWA Hiroyuki, but maybe I screwed up something when I created the cleaned-up series, so caveat emptor.. Also note that it _may_ be a good idea to mark some more registers clobbered on x86-64 in the inline asms instead of saving/restoring them. They are inline functions, but they are only used in places where there are not a lot of live registers _anyway_, so doing for example the clobbers of %r8-%r11 in the asm wouldn't make the fast-path code any worse, and would make the slow-path code smaller. (Not that the slow-path really matters to that degree. Saving a few unnecessary registers is the _least_ of our problems when we hit the slow path. The instruction/cycle counting really only matters in the fast path). Signed-off-by: Linus Torvalds LKML-Reference: Signed-off-by: H. Peter Anvin --- arch/x86/Kconfig.cpu | 2 +- arch/x86/lib/Makefile | 1 + arch/x86/lib/rwsem_64.S | 81 +++++++++++++++++++++++++++++++++++++++++ 3 files changed, 83 insertions(+), 1 deletion(-) create mode 100644 arch/x86/lib/rwsem_64.S diff --git a/arch/x86/Kconfig.cpu b/arch/x86/Kconfig.cpu index 08e442bc3ab9..9d38a13b4ceb 100644 --- a/arch/x86/Kconfig.cpu +++ b/arch/x86/Kconfig.cpu @@ -319,7 +319,7 @@ config X86_L1_CACHE_SHIFT config X86_XADD def_bool y - depends on X86_32 && !M386 + depends on X86_64 || !M386 config X86_PPRO_FENCE bool "PentiumPro memory ordering errata workaround" diff --git a/arch/x86/lib/Makefile b/arch/x86/lib/Makefile index cffd754f3039..c80245131fdc 100644 --- a/arch/x86/lib/Makefile +++ b/arch/x86/lib/Makefile @@ -39,4 +39,5 @@ else lib-y += thunk_64.o clear_page_64.o copy_page_64.o lib-y += memmove_64.o memset_64.o lib-y += copy_user_64.o rwlock_64.o copy_user_nocache_64.o + lib-$(CONFIG_RWSEM_XCHGADD_ALGORITHM) += rwsem_64.o endif diff --git a/arch/x86/lib/rwsem_64.S b/arch/x86/lib/rwsem_64.S new file mode 100644 index 000000000000..15acecf0d7aa --- /dev/null +++ b/arch/x86/lib/rwsem_64.S @@ -0,0 +1,81 @@ +/* + * x86-64 rwsem wrappers + * + * This interfaces the inline asm code to the slow-path + * C routines. We need to save the call-clobbered regs + * that the asm does not mark as clobbered, and move the + * argument from %rax to %rdi. + * + * NOTE! We don't need to save %rax, because the functions + * will always return the semaphore pointer in %rax (which + * is also the input argument to these helpers) + * + * The following can clobber %rdx because the asm clobbers it: + * call_rwsem_down_write_failed + * call_rwsem_wake + * but %rdi, %rsi, %rcx, %r8-r11 always need saving. + */ + +#include +#include +#include +#include +#include + +#define save_common_regs \ + pushq %rdi; \ + pushq %rsi; \ + pushq %rcx; \ + pushq %r8; \ + pushq %r9; \ + pushq %r10; \ + pushq %r11 + +#define restore_common_regs \ + popq %r11; \ + popq %r10; \ + popq %r9; \ + popq %r8; \ + popq %rcx; \ + popq %rsi; \ + popq %rdi + +/* Fix up special calling conventions */ +ENTRY(call_rwsem_down_read_failed) + save_common_regs + pushq %rdx + movq %rax,%rdi + call rwsem_down_read_failed + popq %rdx + restore_common_regs + ret + ENDPROC(call_rwsem_down_read_failed) + +ENTRY(call_rwsem_down_write_failed) + save_common_regs + movq %rax,%rdi + call rwsem_down_write_failed + restore_common_regs + ret + ENDPROC(call_rwsem_down_write_failed) + +ENTRY(call_rwsem_wake) + decw %dx /* do nothing if still outstanding active readers */ + jnz 1f + save_common_regs + movq %rax,%rdi + call rwsem_wake + restore_common_regs +1: ret + ENDPROC(call_rwsem_wake) + +/* Fix up special calling conventions */ +ENTRY(call_rwsem_downgrade_wake) + save_common_regs + pushq %rdx + movq %rax,%rdi + call rwsem_downgrade_wake + popq %rdx + restore_common_regs + ret + ENDPROC(call_rwsem_downgrade_wake) From 4126faf0ab7417fbc6eb99fb0fd407e01e9e9dfe Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sun, 17 Jan 2010 10:24:07 -0800 Subject: [PATCH 3/5] x86: Fix breakage of UML from the changes in the rwsem system The patches 5d0b7235d83eefdafda300656e97d368afcafc9a and bafaecd11df15ad5b1e598adc7736afcd38ee13d broke the UML build: On Sun, 17 Jan 2010, Ingo Molnar wrote: > > FYI, -tip testing found that these changes break the UML build: > > kernel/built-in.o: In function `__up_read': > /home/mingo/tip/arch/x86/include/asm/rwsem.h:192: undefined reference to `call_rwsem_wake' > kernel/built-in.o: In function `__up_write': > /home/mingo/tip/arch/x86/include/asm/rwsem.h:210: undefined reference to `call_rwsem_wake' > kernel/built-in.o: In function `__downgrade_write': > /home/mingo/tip/arch/x86/include/asm/rwsem.h:228: undefined reference to `call_rwsem_downgrade_wake' > kernel/built-in.o: In function `__down_read': > /home/mingo/tip/arch/x86/include/asm/rwsem.h:112: undefined reference to `call_rwsem_down_read_failed' > kernel/built-in.o: In function `__down_write_nested': > /home/mingo/tip/arch/x86/include/asm/rwsem.h:154: undefined reference to `call_rwsem_down_write_failed' > collect2: ld returned 1 exit status Add lib/rwsem_64.o to the UML subarch objects to fix. LKML-Reference: Signed-off-by: H. Peter Anvin --- arch/um/sys-x86_64/Makefile | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/um/sys-x86_64/Makefile b/arch/um/sys-x86_64/Makefile index 2201e9c20e4a..c1ea9eb04466 100644 --- a/arch/um/sys-x86_64/Makefile +++ b/arch/um/sys-x86_64/Makefile @@ -8,7 +8,8 @@ obj-y = bug.o bugs.o delay.o fault.o ldt.o mem.o ptrace.o ptrace_user.o \ setjmp.o signal.o stub.o stub_segv.o syscalls.o syscall_table.o \ sysrq.o ksyms.o tls.o -subarch-obj-y = lib/csum-partial_64.o lib/memcpy_64.o lib/thunk_64.o +subarch-obj-y = lib/csum-partial_64.o lib/memcpy_64.o lib/thunk_64.o \ + lib/rwsem_64.o subarch-obj-$(CONFIG_MODULES) += kernel/module.o ldt-y = ../sys-i386/ldt.o From 1838ef1d782f7527e6defe87e180598622d2d071 Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Mon, 18 Jan 2010 14:00:34 -0800 Subject: [PATCH 4/5] x86-64, rwsem: 64-bit xadd rwsem implementation For x86-64, 32767 threads really is not enough. Change rwsem_count_t to a signed long, so that it is 64 bits on x86-64. This required the following changes to the assembly code: a) %z0 doesn't work on all versions of gcc! At least gcc 4.4.2 as shipped with Fedora 12 emits "ll" not "q" for 64 bits, even for integer operands. Newer gccs apparently do this correctly, but avoid this problem by using the _ASM_ macros instead of %z. b) 64 bits immediates are only allowed in "movq $imm,%reg" constructs... no others. Change some of the constraints to "e", and fix the one case where we would have had to use an invalid immediate -- in that case, we only care about the upper half anyway, so just access the upper half. Signed-off-by: H. Peter Anvin Cc: Linus Torvalds LKML-Reference: --- arch/x86/include/asm/rwsem.h | 53 +++++++++++++++++++++++++++--------- 1 file changed, 40 insertions(+), 13 deletions(-) diff --git a/arch/x86/include/asm/rwsem.h b/arch/x86/include/asm/rwsem.h index 5f9af3081d66..10204a25bf93 100644 --- a/arch/x86/include/asm/rwsem.h +++ b/arch/x86/include/asm/rwsem.h @@ -41,6 +41,7 @@ #include #include #include +#include struct rwsem_waiter; @@ -56,18 +57,24 @@ extern asmregparm struct rw_semaphore * /* * the semaphore definition * - * The bias values and the counter type needs to be extended to 64 bits - * if we want to have more than 32767 potential readers/writers + * The bias values and the counter type limits the number of + * potential readers/writers to 32767 for 32 bits and 2147483647 + * for 64 bits. */ -#define RWSEM_UNLOCKED_VALUE 0x00000000 -#define RWSEM_ACTIVE_BIAS 0x00000001 -#define RWSEM_ACTIVE_MASK 0x0000ffff -#define RWSEM_WAITING_BIAS (-0x00010000) +#ifdef CONFIG_X86_64 +# define RWSEM_ACTIVE_MASK 0xffffffffL +#else +# define RWSEM_ACTIVE_MASK 0x0000ffffL +#endif + +#define RWSEM_UNLOCKED_VALUE 0x00000000L +#define RWSEM_ACTIVE_BIAS 0x00000001L +#define RWSEM_WAITING_BIAS (-RWSEM_ACTIVE_MASK-1) #define RWSEM_ACTIVE_READ_BIAS RWSEM_ACTIVE_BIAS #define RWSEM_ACTIVE_WRITE_BIAS (RWSEM_WAITING_BIAS + RWSEM_ACTIVE_BIAS) -typedef signed int rwsem_count_t; +typedef signed long rwsem_count_t; struct rw_semaphore { rwsem_count_t count; @@ -110,7 +117,7 @@ do { \ static inline void __down_read(struct rw_semaphore *sem) { asm volatile("# beginning down_read\n\t" - LOCK_PREFIX " inc%z0 (%1)\n\t" + LOCK_PREFIX _ASM_INC "(%1)\n\t" /* adds 0x00000001, returns the old value */ " jns 1f\n" " call call_rwsem_down_read_failed\n" @@ -225,8 +232,25 @@ static inline void __up_write(struct rw_semaphore *sem) */ static inline void __downgrade_write(struct rw_semaphore *sem) { +#ifdef CONFIG_X86_64 +# if RWSEM_WAITING_BIAS != -0x100000000 +# error "This code assumes RWSEM_WAITING_BIAS == -2^32" +# endif + + /* 64-bit immediates are special and expensive, and not needed here */ asm volatile("# beginning __downgrade_write\n\t" - LOCK_PREFIX " add%z0 %2,(%1)\n\t" + LOCK_PREFIX "incl 4(%1)\n\t" + /* transitions 0xZZZZZZZZ00000001 -> 0xYYYYYYYY00000001 */ + " jns 1f\n\t" + " call call_rwsem_downgrade_wake\n" + "1:\n\t" + "# ending __downgrade_write\n" + : "+m" (sem->count) + : "a" (sem) + : "memory", "cc"); +#else + asm volatile("# beginning __downgrade_write\n\t" + LOCK_PREFIX _ASM_ADD "%2,(%1)\n\t" /* transitions 0xZZZZ0001 -> 0xYYYY0001 */ " jns 1f\n\t" " call call_rwsem_downgrade_wake\n" @@ -235,22 +259,25 @@ static inline void __downgrade_write(struct rw_semaphore *sem) : "+m" (sem->count) : "a" (sem), "i" (-RWSEM_WAITING_BIAS) : "memory", "cc"); +#endif } /* * implement atomic add functionality */ -static inline void rwsem_atomic_add(int delta, struct rw_semaphore *sem) +static inline void rwsem_atomic_add(rwsem_count_t delta, + struct rw_semaphore *sem) { - asm volatile(LOCK_PREFIX "add%z0 %1,%0" + asm volatile(LOCK_PREFIX _ASM_ADD "%1,%0" : "+m" (sem->count) - : "ir" (delta)); + : "er" (delta)); } /* * implement exchange and add functionality */ -static inline rwsem_count_t rwsem_atomic_update(int delta, struct rw_semaphore *sem) +static inline rwsem_count_t rwsem_atomic_update(rwsem_count_t delta, + struct rw_semaphore *sem) { rwsem_count_t tmp = delta; From 0d1622d7f526311d87d7da2ee7dd14b73e45d3fc Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Sat, 13 Feb 2010 10:33:12 +0200 Subject: [PATCH 5/5] x86-64, rwsem: Avoid store forwarding hazard in __downgrade_write The Intel Architecture Optimization Reference Manual states that a short load that follows a long store to the same object will suffer a store forwading penalty, particularly if the two accesses use different addresses. Trivially, a long load that follows a short store will also suffer a penalty. __downgrade_write() in rwsem incurs both penalties: the increment operation will not be able to reuse a recently-loaded rwsem value, and its result will not be reused by any recently-following rwsem operation. A comment in the code states that this is because 64-bit immediates are special and expensive; but while they are slightly special (only a single instruction allows them), they aren't expensive: a test shows that two loops, one loading a 32-bit immediate and one loading a 64-bit immediate, both take 1.5 cycles per iteration. Fix this by changing __downgrade_write to use the same add instruction on i386 and on x86_64, so that it uses the same operand size as all the other rwsem functions. Signed-off-by: Avi Kivity LKML-Reference: <1266049992-17419-1-git-send-email-avi@redhat.com> Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/rwsem.h | 25 +++++-------------------- 1 file changed, 5 insertions(+), 20 deletions(-) diff --git a/arch/x86/include/asm/rwsem.h b/arch/x86/include/asm/rwsem.h index 10204a25bf93..606ede126972 100644 --- a/arch/x86/include/asm/rwsem.h +++ b/arch/x86/include/asm/rwsem.h @@ -232,34 +232,19 @@ static inline void __up_write(struct rw_semaphore *sem) */ static inline void __downgrade_write(struct rw_semaphore *sem) { -#ifdef CONFIG_X86_64 -# if RWSEM_WAITING_BIAS != -0x100000000 -# error "This code assumes RWSEM_WAITING_BIAS == -2^32" -# endif - - /* 64-bit immediates are special and expensive, and not needed here */ - asm volatile("# beginning __downgrade_write\n\t" - LOCK_PREFIX "incl 4(%1)\n\t" - /* transitions 0xZZZZZZZZ00000001 -> 0xYYYYYYYY00000001 */ - " jns 1f\n\t" - " call call_rwsem_downgrade_wake\n" - "1:\n\t" - "# ending __downgrade_write\n" - : "+m" (sem->count) - : "a" (sem) - : "memory", "cc"); -#else asm volatile("# beginning __downgrade_write\n\t" LOCK_PREFIX _ASM_ADD "%2,(%1)\n\t" - /* transitions 0xZZZZ0001 -> 0xYYYY0001 */ + /* + * transitions 0xZZZZ0001 -> 0xYYYY0001 (i386) + * 0xZZZZZZZZ00000001 -> 0xYYYYYYYY00000001 (x86_64) + */ " jns 1f\n\t" " call call_rwsem_downgrade_wake\n" "1:\n\t" "# ending __downgrade_write\n" : "+m" (sem->count) - : "a" (sem), "i" (-RWSEM_WAITING_BIAS) + : "a" (sem), "er" (-RWSEM_WAITING_BIAS) : "memory", "cc"); -#endif } /*