Remove SSE hash, it's just too much effort to maintain.

Dale Weiler 2013-12-14 17:30:51 -05:00
parent 31e13e6e64
commit f24bdced10

hash.c: 141 deletions

@@ -261,149 +261,8 @@ static GMQCC_FORCEINLINE GMQCC_USED uint32_t hash_native(const void *GMQCC_RESTR
return hash_native_result(hash, carry, length);
}
/*
* Inline-assembly SSE version, used when SSE is detected at runtime via CPUID
* or when the host compiler defines __SSE__. This is about 16 cycles faster
* than the native version at -O2 with GCC, and 11 cycles faster at -O3.
*
* Tested with -m32 on a Phenom II X4 with:
* gcc version 4.8.1 20130725 (prerelease) (GCC)
*/
#if defined(__GNUC__) && defined(__i386__)
static GMQCC_FORCEINLINE uint32_t hash_sse(const void *GMQCC_RESTRICT key, size_t length) {
uint32_t ret;
__asm__ __volatile__ (
" mov %%eax, %%ebx\n"
" mov %2, %%eax\n"
" movd %%eax, %%xmm7\n"
" shufps $0, %%xmm7, %%xmm7\n"
" mov %3, %%eax\n"
" movd %%eax, %%xmm6\n"
" shufps $0, %%xmm6, %%xmm6\n"
" lea (%%esi, %%ecx, 1), %%edi\n"
" jmp 2f\n"
"1:\n"
" movaps (%%esi), %%xmm0\n"
" pmulld %%xmm7, %%xmm0\n"
" movaps %%xmm0, %%xmm2\n"
" pslld $15, %%xmm0\n"
" psrld $17, %%xmm2\n"
" orps %%xmm2, %%xmm0\n"
" pmulld %%xmm6, %%xmm0\n"
" movd %%xmm0, %%eax\n"
" xor %%eax, %%ebx\n"
" rol $13, %%ebx\n"
" imul $5, %%ebx\n"
" add $0xE6546B64, %%ebx\n"
" shufps $0x39, %%xmm0, %%xmm0\n"
" movd %%xmm0, %%eax\n"
" xor %%eax, %%ebx\n"
" rol $13, %%ebx\n"
" imul $5, %%ebx\n"
" add $0xE6546B64, %%ebx\n"
" shufps $0x39, %%xmm0, %%xmm0\n"
" movd %%xmm0, %%eax\n"
" xor %%eax, %%ebx\n"
" rol $13, %%ebx\n"
" imul $5, %%ebx\n"
" add $0xE6546B64, %%ebx\n"
" shufps $0x39, %%xmm0, %%xmm0\n"
" movd %%xmm0, %%eax\n"
" xor %%eax, %%ebx\n"
" rol $13, %%ebx\n"
" imul $5, %%ebx\n"
" add $0xE6546B64, %%ebx\n"
" add $16, %%esi\n"
"2:\n"
" cmp %%esi, %%edi\n"
" jne 1b\n"
" xor %%ecx, %%ebx\n"
" mov %%ebx, %%eax\n"
" shr $16, %%ebx\n"
" xor %%ebx, %%eax\n"
" imul $0x85EBCA6b, %%eax\n"
" mov %%eax, %%ebx\n"
" shr $13, %%ebx\n"
" xor %%ebx, %%eax\n"
" imul $0xC2B2AE35, %%eax\n"
" mov %%eax, %%ebx\n"
" shr $16, %%ebx\n"
" xor %%ebx, %%eax\n"
: "=a" (ret)
: "a" (HASH_SEED),
"i" (HASH_MASK1),
"i" (HASH_MASK2),
"S" (key),
"c" (length)
: "%ebx",
"%edi"
);
return ret;
}
#endif
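For reference, here is a scalar C sketch of what the inline assembly above computes per 16-byte block: multiply, rotate, and multiply each 32-bit lane, fold it into the running hash with an xor/rotate/multiply-add step, then run the xor-shift/multiply avalanche finalizer that appears after the loop. This is illustrative only; the helper names hash_rotl32 and hash_scalar_sketch are not part of hash.c, and the seed/mask arguments stand in for the HASH_SEED, HASH_MASK1 and HASH_MASK2 macros the assembly receives as operands.

#include <stdint.h>
#include <stddef.h>
#include <string.h>

/* Rotate left, mirroring the pslld/psrld/orps and rol sequences. */
static uint32_t hash_rotl32(uint32_t x, int r) {
    return (x << r) | (x >> (32 - r));
}

/*
 * Scalar equivalent of the removed SSE loop for full 16-byte blocks only;
 * pass HASH_SEED, HASH_MASK1 and HASH_MASK2 from hash.c as seed/mask1/mask2.
 * Tail bytes are handled separately by hash_native in the real code.
 */
static uint32_t hash_scalar_sketch(const void *key, size_t length,
                                   uint32_t seed, uint32_t mask1, uint32_t mask2) {
    const unsigned char *data = (const unsigned char *)key;
    uint32_t h = seed;
    size_t i, lane;

    for (i = 0; i + 16 <= length; i += 16) {     /* one SSE iteration = 16 bytes */
        for (lane = 0; lane < 4; lane++) {       /* four 32-bit lanes per block  */
            uint32_t k;
            memcpy(&k, data + i + lane * 4, sizeof(k));
            k *= mask1;                          /* pmulld %%xmm7, %%xmm0        */
            k  = hash_rotl32(k, 15);             /* pslld $15 / psrld $17 / orps */
            k *= mask2;                          /* pmulld %%xmm6, %%xmm0        */
            h ^= k;                              /* xor %%eax, %%ebx             */
            h  = hash_rotl32(h, 13);             /* rol $13, %%ebx               */
            h  = h * 5 + 0xE6546B64;             /* imul $5 / add $0xE6546B64    */
        }
    }

    h ^= (uint32_t)length;                       /* xor %%ecx, %%ebx             */
    h ^= h >> 16;                                /* avalanche finalizer, as in   */
    h *= 0x85EBCA6BU;                            /* the assembly after the loop  */
    h ^= h >> 13;
    h *= 0xC2B2AE35U;
    h ^= h >> 16;
    return h;
}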
#if defined (__GNUC__) && defined(__i386__) && !defined(__SSE__)
/*
* Emulate the MSVC __cpuid intrinsic for GCC/MinGW/Clang; this is used
* to determine whether we should take the SSE route.
*/
static GMQCC_FORCEINLINE void hash_cpuid(int *lanes, int entry) {
__asm__ __volatile__ (
"cpuid"
: "=a"(lanes[0]),
"=b"(lanes[1]),
"=c"(lanes[2]),
"=d"(lanes[3])
: "a" (entry)
);
}
#endif /* defined(__GNUC__) && defined(__i386__) && !defined(__SSE__) */
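A minimal usage sketch of the wrapper above, assuming only the architecturally documented CPUID layout: leaf 0 returns the highest supported standard leaf in EAX, and leaf 1 reports feature flags in EDX with SSE at bit 25. The helper name hash_have_sse is illustrative, not part of hash.c.

#include <stdbool.h>

/* Illustrative helper: probe CPUID and report whether SSE is available. */
static bool hash_have_sse(void) {
    int lanes[4];
    hash_cpuid(lanes, 0);                /* lanes[0] = highest supported leaf   */
    if (lanes[0] < 1)
        return false;                    /* leaf 1 (feature flags) unsupported  */
    hash_cpuid(lanes, 1);                /* lanes[3] = EDX feature flags        */
    return (lanes[3] & (1 << 25)) != 0;  /* bit 25 of EDX is the SSE flag       */
}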
static uint32_t hash_entry(const void *GMQCC_RESTRICT key, size_t length) {
/*
* No host SSE instruction set is assumed, so do a runtime test instead.
* This is mostly for MinGW32, which doesn't define __SSE__.
*/
#if defined (__GNUC__) && defined(__i386__) && !defined(__SSE__)
static bool memoize = false;
static bool sse = false;
if (GMQCC_UNLIKELY(!memoize)) {
/*
* Only detect SSE support once, so it's unlikely that this branch
* is taken more than once.
*/
static int lanes[4];
hash_cpuid(lanes, 0);
/*
* lanes[0] reports the highest supported CPUID leaf; it's very likely
* to be at least 1 on any modern x86. The SSE feature flag is bit 25
* of EDX (lanes[3]) from leaf 1.
*/
if (GMQCC_LIKELY(*lanes >= 1)) {
    hash_cpuid(lanes, 1);
    sse = (lanes[3] & ((int)1 << 25)) != 0;
}
memoize = true;
}
return (GMQCC_LIKELY(sse))
? hash_sse(key, length)
: hash_native(key, length);
/*
* Same as above, but this time the host compiler defines __SSE__.
* This handles MinGW32 builds targeting i686 and newer.
*/
#elif defined (__GNUC__) && defined(__i386__) && defined(__SSE__)
return hash_sse(key, length);
#else
/*
* Go the native route, which is itself highly optimized, including
* unaligned load/store handling on little-endian hosts.
*/
return hash_native(key, length);
#endif
}
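Callers use hash_entry directly; with the runtime path above, the CPUID probe runs only on the first call and the memoized result is reused afterwards. A minimal, illustrative caller (hash_string is a hypothetical wrapper, not part of hash.c, and assumes <string.h> is available for strlen):

/* Hypothetical convenience wrapper: dispatch and CPUID memoization happen
 * transparently inside hash_entry. */
static uint32_t hash_string(const char *s) {
    return hash_entry(s, strlen(s));
}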
#define HASH_LEN_ALIGN (sizeof(size_t))