Diffstat (limited to 'lcs_sse.c')
-rw-r--r--  lcs_sse.c  158
1 file changed, 158 insertions, 0 deletions
diff --git a/lcs_sse.c b/lcs_sse.c
index b748b01..487aef2 100644
--- a/lcs_sse.c
+++ b/lcs_sse.c
@@ -0,0 +1,158 @@
+#include <stdint.h>	/* uint32_t, uint8_t */
+#include <stddef.h>	/* size_t */
+
+#ifndef __SSSE3__	/* no pshufb available -- fall back to bswap */
+# ifdef __APPLE__
+# include <libkern/OSByteOrder.h>
+# define bswap_16 OSSwapInt16
+# define bswap_32 OSSwapInt32
+# define bswap_64 OSSwapInt64
+# else
+# include <byteswap.h>
+# endif
+#endif
+
+#define LBLOCK(n) \
+ "movapd %%xmm0, %%xmm2\n\t" \
+ "pslldq $" #n ", %%xmm2\n\t" \
+ "pcmpeqb %%xmm1, %%xmm2\n\t" \
+ "psrldq $" #n ", %%xmm2\n\t" \
+ "pmovmskb %%xmm2, %%ecx\n\t" \
+ "mov %%ecx, %%edx\n\t" \
+ "shl $1, %%edx\n\t" \
+ "and %%edx, %%ecx\n\t" \
+ "shl $1, %%edx\n\t" \
+ "and %%edx, %%ecx\n\t" \
+ "or %%ecx, %0\n\t" \
+
+#define RBLOCK(n) \
+ "movapd %%xmm1, %%xmm2\n\t" \
+ "pslldq $" #n ", %%xmm2\n\t" \
+ "pcmpeqb %%xmm0, %%xmm2\n\t" \
+	/* discard the low n bytes, which compared shifted-in zeros */ \
+ "psrldq $" #n ", %%xmm2\n\t" \
+ "pmovmskb %%xmm2, %%ecx\n\t" \
+ "mov %%ecx, %%edx\n\t" \
+ "shl $1, %%edx\n\t" \
+ "and %%edx, %%ecx\n\t" \
+ "shl $1, %%edx\n\t" \
+ "and %%edx, %%ecx\n\t" \
+ "or %%ecx, %0\n\t" \
+
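+/* A rough scalar sketch of what one LBLOCK(n)/RBLOCK(n) step computes
+ * (match_mask and acc stand in for the %ecx mask and the %0 accumulator):
+ *
+ *     uint32_t m = match_mask;         // pmovmskb: bit i set iff byte i matched
+ *     acc |= m & (m << 1) & (m << 2);  // bit i survives iff bytes i, i-1, i-2 all matched
+ *
+ * Each block tests one byte-shifted alignment ("diagonal") of the two
+ * 16-byte registers, so the accumulator ends up nonzero iff some diagonal
+ * holds a run of at least three consecutive matching bytes. */
+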
+#ifdef USE_LCS5
+
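+/* A rough common-substring probe on two 16-byte (4 x uint32_t) blocks:
+ * every byte-shifted alignment up to 12 bytes in either direction is
+ * tried, then tried again with the first block shifted up by one nibble,
+ * and the result is 8 if any alignment holds a run of at least three
+ * equal bytes, 0 otherwise.  In the SSSE3 path the inputs are loaded
+ * with movapd, so str1 and str2 must be 16-byte aligned; the fallback
+ * path copies into static scratch buffers and so is not reentrant. */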
+int lcs_32_rough(const uint32_t * const str1, const uint32_t * const str2) {
+ register unsigned int x;
+
+#ifndef __SSSE3__
+ static uint32_t buf1[4] __attribute__((aligned(16)));
+ static uint32_t buf2[4] __attribute__((aligned(16)));
+ size_t i;
+ for (i=0; i<4; i++) {
+ buf1[i^3] = bswap_32(str1[i]);
+ buf2[i^3] = bswap_32(str2[i]);
+ }
+#else
+	/* this is an endian swap, for use with pshufb -- it must be 16-byte
+	   aligned because pshufb takes it as a memory operand */
+	static uint8_t mask[16] __attribute__((aligned(16))) =
+		{0x0F, 0x0E, 0x0D, 0x0C, 0x0B, 0x0A, 0x09, 0x08,
+		 0x07, 0x06, 0x05, 0x04, 0x03, 0x02, 0x01, 0x00};
+
+ const uint32_t * const buf1 = str1;
+ const uint32_t * const buf2 = str2;
+#endif
+
+	/* the data behind these pointers is read by the asm below, but there
+	   aren't enough registers in 32-bit mode to express that as
+	   constraints on the __asm__ itself.  This empty asm does the
+	   equivalent -- it stops the compiler from reordering accesses to
+	   *str1 and *str2 past this point -- while leaving two extra GPRs
+	   free. */
+ /*volatile int foo1 = *str1;
+ volatile int foo2 = *str2;*/
+ __asm__ volatile("" : : "m"(*str1), "m"(*str2));
+
+ __asm__ volatile (
+ /* moving unmodified ones into place */
+ "movapd (%1), %%xmm0\n\t"
+ "movapd (%2), %%xmm1\n\t"
+#ifdef __SSSE3__
+ /* endian swap */
+ "pshufb (%3), %%xmm0\n\t"
+ "pshufb (%3), %%xmm1\n\t"
+#endif
+ /* clear the accumulator */
+ "xor %0, %0\n\t"
+
+ LBLOCK(0)
+ LBLOCK(1)
+ LBLOCK(2)
+ LBLOCK(3)
+ LBLOCK(4)
+ LBLOCK(5)
+ LBLOCK(6)
+ LBLOCK(7)
+ LBLOCK(8)
+ LBLOCK(9)
+ LBLOCK(10)
+ LBLOCK(11)
+ LBLOCK(12)
+
+ /* no RBLOCK(0) because it's equivalent to LBLOCK(0) */
+ RBLOCK(1)
+ RBLOCK(2)
+ RBLOCK(3)
+ RBLOCK(4)
+ RBLOCK(5)
+ RBLOCK(6)
+ RBLOCK(7)
+ RBLOCK(8)
+ RBLOCK(9)
+ RBLOCK(10)
+ RBLOCK(11)
+ RBLOCK(12)
+
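+	/* At this point every byte-aligned diagonal (shifts of 0-12 bytes
+	 * in either direction) has been folded into the accumulator.  The
+	 * code below shifts the first block up by one nibble and runs the
+	 * same diagonals again, so runs that only line up at an odd 4-bit
+	 * offset are caught as well. */
+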
+ /* This is crazy, but the shortest way I know of to shift xmm by 4 bits. */
+ "movapd %%xmm0, %%xmm2\n\t"
+ "psllq $4, %%xmm0\n\t"
+ "psrlq $60, %%xmm2\n\t"
+	/* Shuffle xmm2 around (in dwords) -- after the shift right by 60,
+	 * dwords 1 and 3 of xmm2 are zero, so they can fill the three slots
+	 * we don't care about.  The 69 constant (0b01000101) selects
+	 * [z] [0] [z] [z] with z = 01 (11 would work too): the carried
+	 * nibble moves from dword 0 to dword 2, the bottom of the high
+	 * qword. */
+ "pshufd $69, %%xmm2, %%xmm2\n\t"
+	/* add, or, and xor all work equally well here, since no bit
+	 * position can be set in both registers. */
+ "pxor %%xmm2, %%xmm0\n\t"
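+	/* The same 128-bit shift-left-by-4, written as an intrinsics sketch
+	 * for reference (v is a stand-in for xmm0; not used by this code):
+	 *
+	 *     __m128i hi = _mm_slli_epi64(v, 4);    // per-qword shift, drops the carry
+	 *     __m128i lo = _mm_srli_epi64(v, 60);   // the 4 bits that fell off each qword
+	 *     lo = _mm_shuffle_epi32(lo, 69);       // low qword's carry -> bottom of high qword
+	 *     v  = _mm_xor_si128(hi, lo);           // recombine; no bit is set in both
+	 */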
+
+ LBLOCK(0)
+ LBLOCK(1)
+ LBLOCK(2)
+ LBLOCK(3)
+ LBLOCK(4)
+ LBLOCK(5)
+ LBLOCK(6)
+ LBLOCK(7)
+ LBLOCK(8)
+ LBLOCK(9)
+ LBLOCK(10)
+ LBLOCK(11)
+ LBLOCK(12)
+
+ RBLOCK(1)
+ RBLOCK(2)
+ RBLOCK(3)
+ RBLOCK(4)
+ RBLOCK(5)
+ RBLOCK(6)
+ RBLOCK(7)
+ RBLOCK(8)
+ RBLOCK(9)
+ RBLOCK(10)
+ RBLOCK(11)
+ RBLOCK(12)
+ "1:"
+ : "=r"(x) : "r"(buf1), "r"(buf2)
+#ifdef __SSSE3__
+, "r"(mask)
+#endif
+: "xmm0", "xmm1", "xmm2", "xmm3", "edx", "ecx");
+ //printf("ret %u\n", x);
+ return ((x != 0) ? 8 : 0);
+}
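+
+/* Example use (a sketch; a and b are hypothetical inputs, kept 16-byte
+ * aligned as the SSSE3 path requires):
+ *
+ *     static uint32_t a[4] __attribute__((aligned(16))) = { ... };
+ *     static uint32_t b[4] __attribute__((aligned(16))) = { ... };
+ *
+ *     if (lcs_32_rough(a, b))
+ *         ;  // some alignment of a and b shares a 3+ byte run
+ */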
+
+#endif