Diffstat (limited to 'lcs_sse.c')
-rw-r--r-- | lcs_sse.c | 158
1 file changed, 158 insertions, 0 deletions
@@ -0,0 +1,158 @@
+#ifndef __SSSE3__
+# ifdef __APPLE__
+#  include <libkern/OSByteOrder.h>
+#  define bswap_16 OSSwapInt16
+#  define bswap_32 OSSwapInt32
+#  define bswap_64 OSSwapInt64
+# else
+#  include <byteswap.h>
+# endif
+#endif
+
+#define LBLOCK(n) \
+  "movapd %%xmm0, %%xmm2\n\t" \
+  "pslldq $" #n ", %%xmm2\n\t" \
+  "pcmpeqb %%xmm1, %%xmm2\n\t" \
+  "psrldq $" #n ", %%xmm2\n\t" \
+  "pmovmskb %%xmm2, %%ecx\n\t" \
+  "mov %%ecx, %%edx\n\t" \
+  "shl $1, %%edx\n\t" \
+  "and %%edx, %%ecx\n\t" \
+  "shl $1, %%edx\n\t" \
+  "and %%edx, %%ecx\n\t" \
+  "or %%ecx, %0\n\t"
+
+#define RBLOCK(n) \
+  "movapd %%xmm1, %%xmm2\n\t" \
+  "pslldq $" #n ", %%xmm2\n\t" \
+  "pcmpeqb %%xmm0, %%xmm2\n\t" \
+  /* drop the result bytes that compared against the zeros pslldq shifted in */ \
+  "psrldq $" #n ", %%xmm2\n\t" \
+  "pmovmskb %%xmm2, %%ecx\n\t" \
+  "mov %%ecx, %%edx\n\t" \
+  "shl $1, %%edx\n\t" \
+  "and %%edx, %%ecx\n\t" \
+  "shl $1, %%edx\n\t" \
+  "and %%edx, %%ecx\n\t" \
+  "or %%ecx, %0\n\t"
+
+#ifdef USE_LCS5
+
+int lcs_32_rough(const uint32_t * const str1, const uint32_t * const str2) {
+  register unsigned int x;
+
+#ifndef __SSSE3__
+  static uint32_t buf1[4] __attribute__((aligned(16)));
+  static uint32_t buf2[4] __attribute__((aligned(16)));
+  size_t i;
+  for (i = 0; i < 4; i++) {
+    buf1[i^3] = bswap_32(str1[i]);
+    buf2[i^3] = bswap_32(str2[i]);
+  }
+#else
+  /* 16-byte reversal mask for the pshufb endian swap; pshufb's memory operand must be 16-byte aligned */
+  static const uint8_t mask[16] __attribute__((aligned(16))) = {0x0F, 0x0E, 0x0D,
+      0x0C, 0x0B, 0x0A, 0x09, 0x08, 0x07, 0x06, 0x05, 0x04, 0x03, 0x02, 0x01, 0x00};
+
+  const uint32_t * const buf1 = str1;
+  const uint32_t * const buf2 = str2;
+#endif
+
+  /* the pointers are used in the asm below, but there aren't enough
+     registers in 32-bit mode to make this work as a constraint on
+     the __asm__ itself.  This does the equivalent to prevent reordering,
+     while leaving two extra GPRs. */
+  /*volatile int foo1 = *str1;
+  volatile int foo2 = *str2;*/
+  __asm__ volatile("" : : "m"(*str1), "m"(*str2));
+
+  __asm__ volatile (
+    /* move the unmodified inputs into place */
+    "movapd (%1), %%xmm0\n\t"
+    "movapd (%2), %%xmm1\n\t"
+#ifdef __SSSE3__
+    /* endian swap (full byte reversal) in-register */
+    "pshufb (%3), %%xmm0\n\t"
+    "pshufb (%3), %%xmm1\n\t"
+#endif
+    /* clear the accumulator */
+    "xor %0, %0\n\t"
+
+    LBLOCK(0)
+    LBLOCK(1)
+    LBLOCK(2)
+    LBLOCK(3)
+    LBLOCK(4)
+    LBLOCK(5)
+    LBLOCK(6)
+    LBLOCK(7)
+    LBLOCK(8)
+    LBLOCK(9)
+    LBLOCK(10)
+    LBLOCK(11)
+    LBLOCK(12)
+
+    /* no RBLOCK(0) because it's equivalent to LBLOCK(0) */
+    RBLOCK(1)
+    RBLOCK(2)
+    RBLOCK(3)
+    RBLOCK(4)
+    RBLOCK(5)
+    RBLOCK(6)
+    RBLOCK(7)
+    RBLOCK(8)
+    RBLOCK(9)
+    RBLOCK(10)
+    RBLOCK(11)
+    RBLOCK(12)
+
+    /* This is crazy, but the shortest way I know of to shift xmm by 4 bits. */
+    "movapd %%xmm0, %%xmm2\n\t"
+    "psllq $4, %%xmm0\n\t"
+    "psrlq $60, %%xmm2\n\t"
+    /* Shuffle the xmm around (in quarters) -- after the shift right by 60,
+     * dwords 01 and 11 of xmm2 are zero, so either can be used for the
+     * three dwords we don't care about -- the 69 constant =
+     * [z] [0] [z] [z], where z = 01 or 11, 0b01000101 */
+    "pshufd $69, %%xmm2, %%xmm2\n\t"
+    /* most operations work here since the same bit position can only be
+     * 1 in one register: add, or, or xor would all do. */
+    "pxor %%xmm2, %%xmm0\n\t"
+
+    LBLOCK(0)
+    LBLOCK(1)
+    LBLOCK(2)
+    LBLOCK(3)
+    LBLOCK(4)
+    LBLOCK(5)
+    LBLOCK(6)
+    LBLOCK(7)
+    LBLOCK(8)
+    LBLOCK(9)
+    LBLOCK(10)
+    LBLOCK(11)
+    LBLOCK(12)
+
+    RBLOCK(1)
+    RBLOCK(2)
+    RBLOCK(3)
+    RBLOCK(4)
+    RBLOCK(5)
+    RBLOCK(6)
+    RBLOCK(7)
+    RBLOCK(8)
+    RBLOCK(9)
+    RBLOCK(10)
+    RBLOCK(11)
+    RBLOCK(12)
+    "1:"
+    : "=r"(x) : "r"(buf1), "r"(buf2)
+#ifdef __SSSE3__
+      , "r"(mask)
+#endif
+    : "xmm0", "xmm1", "xmm2", "xmm3", "edx", "ecx");
+  //printf("ret %u\n", x);
+  return ((x != 0) ? 8 : 0);
+}
+
+#endif
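As a reading aid (not part of the commit), here is a rough plain-C sketch of what one byte-aligned pass of the LBLOCK/RBLOCK ladder answers: do the two 16-byte blocks share a run of at least three consecutive equal bytes at some relative offset of up to 12 bytes? The mask test below is the same trick as the shl/and pairs in the macros. The byte reversal done by bswap_32/pshufb does not change the yes/no answer, the second pass after the 4-bit shift is not modelled, and the helper name has_3byte_run is invented for illustration.

#include <stdint.h>

/* Illustrative helper (not in lcs_sse.c): does block a share a run of at
 * least 3 consecutive equal bytes with block b at any relative offset of
 * up to 12 bytes?  Mirrors one byte-aligned LBLOCK/RBLOCK pass. */
static int has_3byte_run(const uint8_t a[16], const uint8_t b[16])
{
    int d, i;
    for (d = 0; d <= 12; d++) {
        uint32_t m1 = 0, m2 = 0;                       /* per-byte match masks */
        for (i = 0; i + d < 16; i++) {
            m1 |= (uint32_t)(a[i] == b[i + d]) << i;   /* like LBLOCK(d) */
            m2 |= (uint32_t)(b[i] == a[i + d]) << i;   /* like RBLOCK(d) */
        }
        /* same idea as the shl/and pairs: a bit survives only if it and
         * the two bits below it are set, i.e. three matches in a row */
        if ((m1 & (m1 << 1) & (m1 << 2)) || (m2 & (m2 << 1) & (m2 << 2)))
            return 1;
    }
    return 0;
}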
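The "shift xmm by 4 bits" sequence (movapd/psllq/psrlq/pshufd/pxor) is easier to see on a 128-bit value held as two 64-bit halves. The sketch below, with an invented name, shows only that step: each lane is shifted left by 4, and the low lane's top nibble is carried into the bottom of the high lane, which is what the pshufd $69 plus pxor pair arranges in the register.

#include <stdint.h>

/* Illustrative only: shift a 128-bit value (lo = low 64 bits, hi = high
 * 64 bits) left by 4 bits, as the asm does across the two xmm lanes. */
static void shl128_by_4(uint64_t *lo, uint64_t *hi)
{
    uint64_t carry = *lo >> 60;   /* psrlq $60: top nibble of the low lane */
    *hi = (*hi << 4) | carry;     /* psllq $4 on the high lane, carry folded in */
    *lo = *lo << 4;               /* psllq $4 on the low lane, zeros enter at bit 0 */
}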
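Both endianness paths reduce to the same full 16-byte reversal: the non-SSSE3 loop stores bswap_32(str[i]) at index i^3 (byte-swap each dword, reverse the dword order), and the pshufb path uses the descending 0x0F..0x00 mask so that destination byte i comes from source byte 15-i. A minimal equivalent, with an invented name:

#include <stdint.h>

/* Illustrative only: the net effect of either endian-swap path. */
static void reverse16(uint8_t dst[16], const uint8_t src[16])
{
    int i;
    for (i = 0; i < 16; i++)
        dst[i] = src[15 - i];   /* same result as bswap_32 + i^3, or the pshufb mask */
}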