[PATCH] optimize hweight64 for x86_64
Based on a patch from David Rientjes <rientjes@google.com>, but changed by AK.

Optimizes the 64-bit Hamming weight for x86_64 processors, assuming they have fast multiplication. Uses five fewer bitops than the generic hweight64. A benchmark on one EM64T CPU showed a ~25% speedup over 2^24 consecutive calls.

Defines a new ARCH_HAS_FAST_MULTIPLIER symbol that other architectures with fast multiplication can also set.

Signed-off-by: Andi Kleen <ak@suse.de>
parent 8380aabb99
commit 0136611c62
2 changed files with 10 additions and 2 deletions
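The fast path is the classic parallel bit-count finished by a byte-summing multiply. As background, here is a small standalone C demo, not part of the patch (the names popcount64_mul and popcount64_naive are invented for this sketch), that performs the same three masking steps and final multiply as the new ARCH_HAS_FAST_MULTIPLIER branch and checks the result against a naive loop:

/*
 * Illustrative, standalone demo -- not part of the patch.  The helper
 * names (popcount64_mul, popcount64_naive) are invented for this sketch.
 * popcount64_mul performs the same steps as the fast path of hweight64().
 */
#include <stdio.h>
#include <stdint.h>

static unsigned int popcount64_mul(uint64_t w)
{
        w -= (w >> 1) & 0x5555555555555555ull;      /* per-2-bit counts */
        w = (w & 0x3333333333333333ull) +
            ((w >> 2) & 0x3333333333333333ull);     /* per-4-bit counts */
        w = (w + (w >> 4)) & 0x0f0f0f0f0f0f0f0full; /* per-byte counts  */
        return (w * 0x0101010101010101ull) >> 56;   /* sum of all bytes */
}

static unsigned int popcount64_naive(uint64_t w)
{
        unsigned int n = 0;

        for (; w; w >>= 1)
                n += w & 1;
        return n;
}

int main(void)
{
        const uint64_t samples[] = {
                0, 1, 0xffull, 0xdeadbeefcafef00dull, ~0ull
        };
        size_t i;

        for (i = 0; i < sizeof(samples) / sizeof(samples[0]); i++)
                printf("0x%016llx -> mul %u, naive %u\n",
                       (unsigned long long)samples[i],
                       popcount64_mul(samples[i]),
                       popcount64_naive(samples[i]));
        return 0;
}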
--- a/include/asm-x86_64/bitops.h
+++ b/include/asm-x86_64/bitops.h
@@ -399,6 +399,8 @@ static __inline__ int fls(int x)
 	return r+1;
 }
 
+#define ARCH_HAS_FAST_MULTIPLIER 1
+
 #include <asm-generic/bitops/hweight.h>
 
 #endif /* __KERNEL__ */
--- a/lib/hweight.c
+++ b/lib/hweight.c
@@ -1,5 +1,6 @@
 #include <linux/module.h>
+#include <asm/types.h>
 #include <asm/bitops.h>
 
 /**
  * hweightN - returns the hamming weight of a N-bit word
@@ -40,14 +41,19 @@ unsigned long hweight64(__u64 w)
 #if BITS_PER_LONG == 32
 	return hweight32((unsigned int)(w >> 32)) + hweight32((unsigned int)w);
 #elif BITS_PER_LONG == 64
+#ifdef ARCH_HAS_FAST_MULTIPLIER
+	w -= (w >> 1) & 0x5555555555555555ul;
+	w =  (w & 0x3333333333333333ul) + ((w >> 2) & 0x3333333333333333ul);
+	w =  (w + (w >> 4)) & 0x0f0f0f0f0f0f0f0ful;
+	return (w * 0x0101010101010101ul) >> 56;
+#else
 	__u64 res = w - ((w >> 1) & 0x5555555555555555ul);
 	res = (res & 0x3333333333333333ul) + ((res >> 2) & 0x3333333333333333ul);
 	res = (res + (res >> 4)) & 0x0F0F0F0F0F0F0F0Ful;
 	res = res + (res >> 8);
 	res = res + (res >> 16);
 	return (res + (res >> 32)) & 0x00000000000000FFul;
+#endif
 #else
 #error BITS_PER_LONG not defined
 #endif
 }
 EXPORT_SYMBOL(hweight64);
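A worked example clarifies the one step that actually changes. After the three masking rounds, every byte of w holds that byte's bit count, and w * 0x0101010101010101 equals w + (w << 8) + ... + (w << 56), so the top byte accumulates the sum of all eight byte counts; the shift right by 56 extracts it. The sketch below (illustrative only, not kernel code) feeds the same packed counts through both tails, the multiply and the #else branch's shift-and-add rounds:

/*
 * Illustrative check of the two tails alone -- not kernel code.  The
 * input pretends the three masking rounds already ran, so each byte
 * holds a per-byte bit count (0..8).
 */
#include <stdio.h>
#include <stdint.h>

int main(void)
{
        /* Pretend per-byte counts 1..8; their sum is 36. */
        const uint64_t counts = 0x0102030405060708ull;

        /* Fast tail: one multiply and one shift. */
        uint64_t mul_tail = (counts * 0x0101010101010101ull) >> 56;

        /* Generic tail from the #else branch: shift-and-add rounds. */
        uint64_t res = counts;
        res = res + (res >> 8);
        res = res + (res >> 16);
        res = (res + (res >> 32)) & 0xffull;

        printf("multiply tail: %llu, shift-add tail: %llu (expected 36)\n",
               (unsigned long long)mul_tail, (unsigned long long)res);
        return 0;
}

Counting operations here shows where the commit message's "five fewer bitops" comes from: the multiply tail is one multiply plus one shift, while the generic tail needs three shifts, three adds, and a final mask.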