Diffstat (limited to 'lcs_sse.c')
-rw-r--r-- | lcs_sse.c | 158
1 file changed, 158 insertions, 0 deletions
@@ -0,0 +1,158 @@
+#ifndef __SSSE3__
+# ifdef __APPLE__
+#  include <libkern/OSByteOrder.h>
+#  define bswap_16 OSSwapInt16
+#  define bswap_32 OSSwapInt32
+#  define bswap_64 OSSwapInt64
+# else
+#  include <byteswap.h>
+# endif
+#endif
+
+#define LBLOCK(n) \
+  "movapd %%xmm0, %%xmm2\n\t" \
+  "pslldq $" #n ", %%xmm2\n\t" \
+  "pcmpeqb %%xmm1, %%xmm2\n\t" \
+  "psrldq $" #n ", %%xmm2\n\t" \
+  "pmovmskb %%xmm2, %%ecx\n\t" \
+  "mov %%ecx, %%edx\n\t" \
+  "shl $1, %%edx\n\t" \
+  "and %%edx, %%ecx\n\t" \
+  "shl $1, %%edx\n\t" \
+  "and %%edx, %%ecx\n\t" \
+  "or %%ecx, %0\n\t"
+
+#define RBLOCK(n) \
+  "movapd %%xmm1, %%xmm2\n\t" \
+  "pslldq $" #n ", %%xmm2\n\t" \
+  "pcmpeqb %%xmm0, %%xmm2\n\t" \
+  /* drop the result bytes that compared against the zeros pslldq shifted in */ \
+  "psrldq $" #n ", %%xmm2\n\t" \
+  "pmovmskb %%xmm2, %%ecx\n\t" \
+  "mov %%ecx, %%edx\n\t" \
+  "shl $1, %%edx\n\t" \
+  "and %%edx, %%ecx\n\t" \
+  "shl $1, %%edx\n\t" \
+  "and %%edx, %%ecx\n\t" \
+  "or %%ecx, %0\n\t"
+
+#ifdef USE_LCS5
+
+int lcs_32_rough(const uint32_t * const str1, const uint32_t * const str2) {
+  register unsigned int x;
+
+#ifndef __SSSE3__
+  static uint32_t buf1[4] __attribute__((aligned(16)));
+  static uint32_t buf2[4] __attribute__((aligned(16)));
+  size_t i;
+  for (i = 0; i < 4; i++) {
+    buf1[i^3] = bswap_32(str1[i]);
+    buf2[i^3] = bswap_32(str2[i]);
+  }
+#else
+  /* 16-byte reversal mask for the pshufb endian swap; pshufb's memory operand must be 16-byte aligned */
+  static const uint8_t mask[16] __attribute__((aligned(16))) = {0x0F, 0x0E, 0x0D,
+      0x0C, 0x0B, 0x0A, 0x09, 0x08, 0x07, 0x06, 0x05, 0x04, 0x03, 0x02, 0x01, 0x00};
+
+  const uint32_t * const buf1 = str1;
+  const uint32_t * const buf2 = str2;
+#endif
+
+  /* the pointers are used in the asm below, but there aren't enough
+     registers in 32-bit mode to make this work as a constraint on
+     the __asm__ itself.  This does the equivalent to prevent reordering,
+     while leaving two extra GPRs. */
+  /*volatile int foo1 = *str1;
+  volatile int foo2 = *str2;*/
+  __asm__ volatile("" : : "m"(*str1), "m"(*str2));
+
+  __asm__ volatile (
+    /* move the unmodified inputs into place */
+    "movapd (%1), %%xmm0\n\t"
+    "movapd (%2), %%xmm1\n\t"
+#ifdef __SSSE3__
+    /* endian swap (full byte reversal) in-register */
+    "pshufb (%3), %%xmm0\n\t"
+    "pshufb (%3), %%xmm1\n\t"
+#endif
+    /* clear the accumulator */
+    "xor %0, %0\n\t"
+
+    LBLOCK(0)
+    LBLOCK(1)
+    LBLOCK(2)
+    LBLOCK(3)
+    LBLOCK(4)
+    LBLOCK(5)
+    LBLOCK(6)
+    LBLOCK(7)
+    LBLOCK(8)
+    LBLOCK(9)
+    LBLOCK(10)
+    LBLOCK(11)
+    LBLOCK(12)
+
+    /* no RBLOCK(0) because it's equivalent to LBLOCK(0) */
+    RBLOCK(1)
+    RBLOCK(2)
+    RBLOCK(3)
+    RBLOCK(4)
+    RBLOCK(5)
+    RBLOCK(6)
+    RBLOCK(7)
+    RBLOCK(8)
+    RBLOCK(9)
+    RBLOCK(10)
+    RBLOCK(11)
+    RBLOCK(12)
+
+    /* This is crazy, but the shortest way I know of to shift xmm by 4 bits. */
+    "movapd %%xmm0, %%xmm2\n\t"
+    "psllq $4, %%xmm0\n\t"
+    "psrlq $60, %%xmm2\n\t"
+    /* Shuffle the xmm around (in quarters) -- after the shift right by 60,
+     * dwords 01 and 11 of xmm2 are zero, so either can be used for the
+     * three dwords we don't care about -- the 69 constant =
+     * [z] [0] [z] [z], where z = 01 or 11, 0b01000101 */
+    "pshufd $69, %%xmm2, %%xmm2\n\t"
+    /* most operations work here since the same bit position can only be
+     * 1 in one register: add, or, or xor would all do. */
+    "pxor %%xmm2, %%xmm0\n\t"
+
+    LBLOCK(0)
+    LBLOCK(1)
+    LBLOCK(2)
+    LBLOCK(3)
+    LBLOCK(4)
+    LBLOCK(5)
+    LBLOCK(6)
+    LBLOCK(7)
+    LBLOCK(8)
+    LBLOCK(9)
+    LBLOCK(10)
+    LBLOCK(11)
+    LBLOCK(12)
+
+    RBLOCK(1)
+    RBLOCK(2)
+    RBLOCK(3)
+    RBLOCK(4)
+    RBLOCK(5)
+    RBLOCK(6)
+    RBLOCK(7)
+    RBLOCK(8)
+    RBLOCK(9)
+    RBLOCK(10)
+    RBLOCK(11)
+    RBLOCK(12)
+    "1:"
+    : "=r"(x) : "r"(buf1), "r"(buf2)
+#ifdef __SSSE3__
+      , "r"(mask)
+#endif
+    : "xmm0", "xmm1", "xmm2", "xmm3", "edx", "ecx");
+  //printf("ret %u\n", x);
+  return ((x != 0) ? 8 : 0);
+}
+
+#endif
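As a reading aid (not part of the commit), here is a rough plain-C sketch of what one byte-aligned pass of the LBLOCK/RBLOCK ladder answers: do the two 16-byte blocks share a run of at least three consecutive equal bytes at some relative offset of up to 12 bytes? The mask test below is the same trick as the shl/and pairs in the macros. The byte reversal done by bswap_32/pshufb does not change the yes/no answer, the second pass after the 4-bit shift is not modelled, and the helper name has_3byte_run is invented for illustration.

#include <stdint.h>

/* Illustrative helper (not in lcs_sse.c): does block a share a run of at
 * least 3 consecutive equal bytes with block b at any relative offset of
 * up to 12 bytes?  Mirrors one byte-aligned LBLOCK/RBLOCK pass. */
static int has_3byte_run(const uint8_t a[16], const uint8_t b[16])
{
    int d, i;
    for (d = 0; d <= 12; d++) {
        uint32_t m1 = 0, m2 = 0;                       /* per-byte match masks */
        for (i = 0; i + d < 16; i++) {
            m1 |= (uint32_t)(a[i] == b[i + d]) << i;   /* like LBLOCK(d) */
            m2 |= (uint32_t)(b[i] == a[i + d]) << i;   /* like RBLOCK(d) */
        }
        /* same idea as the shl/and pairs: a bit survives only if it and
         * the two bits below it are set, i.e. three matches in a row */
        if ((m1 & (m1 << 1) & (m1 << 2)) || (m2 & (m2 << 1) & (m2 << 2)))
            return 1;
    }
    return 0;
}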
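The "shift xmm by 4 bits" sequence (movapd/psllq/psrlq/pshufd/pxor) is easier to see on a 128-bit value held as two 64-bit halves. The sketch below, with an invented name, shows only that step: each lane is shifted left by 4, and the low lane's top nibble is carried into the bottom of the high lane, which is what the pshufd $69 plus pxor pair arranges in the register.

#include <stdint.h>

/* Illustrative only: shift a 128-bit value (lo = low 64 bits, hi = high
 * 64 bits) left by 4 bits, as the asm does across the two xmm lanes. */
static void shl128_by_4(uint64_t *lo, uint64_t *hi)
{
    uint64_t carry = *lo >> 60;   /* psrlq $60: top nibble of the low lane */
    *hi = (*hi << 4) | carry;     /* psllq $4 on the high lane, carry folded in */
    *lo = *lo << 4;               /* psllq $4 on the low lane, zeros enter at bit 0 */
}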
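Both endianness paths reduce to the same full 16-byte reversal: the non-SSSE3 loop stores bswap_32(str[i]) at index i^3 (byte-swap each dword, reverse the dword order), and the pshufb path uses the descending 0x0F..0x00 mask so that destination byte i comes from source byte 15-i. A minimal equivalent, with an invented name:

#include <stdint.h>

/* Illustrative only: the net effect of either endian-swap path. */
static void reverse16(uint8_t dst[16], const uint8_t src[16])
{
    int i;
    for (i = 0; i < 16; i++)
        dst[i] = src[15 - i];   /* same result as bswap_32 + i^3, or the pshufb mask */
}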