#ifndef __SSSE3__
# ifdef __APPLE__
# include <libkern/OSByteOrder.h>
# define bswap_16 OSSwapInt16
# define bswap_32 OSSwapInt32
# define bswap_64 OSSwapInt64
# else
# include <byteswap.h>
# endif
#endif
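/* LBLOCK(n): compare the two operands at a relative offset of n bytes
 * (byte j of xmm0 against byte j+n of xmm1).  pslldq shifts a copy of
 * xmm0 up by n bytes, pcmpeqb builds a per-byte equality mask, and
 * psrldq shifts the mask back down, discarding the n entries that only
 * compared against the shifted-in zero padding.  pmovmskb turns the mask
 * into a 16-bit integer m; m & (m<<1) & (m<<2) keeps only runs of at
 * least three consecutive matching bytes, and any survivors are ORed
 * into the accumulator %0.  RBLOCK(n) is the mirror image, covering the
 * negative offsets (byte j of xmm1 against byte j+n of xmm0). */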
#define LBLOCK(n) \
"movapd %%xmm0, %%xmm2\n\t" \
"pslldq $" #n ", %%xmm2\n\t" \
"pcmpeqb %%xmm1, %%xmm2\n\t" \
"psrldq $" #n ", %%xmm2\n\t" \
"pmovmskb %%xmm2, %%ecx\n\t" \
"mov %%ecx, %%edx\n\t" \
"shl $1, %%edx\n\t" \
"and %%edx, %%ecx\n\t" \
"shl $1, %%edx\n\t" \
"and %%edx, %%ecx\n\t" \
"or %%ecx, %0\n\t" \
#define RBLOCK(n) \
"movapd %%xmm1, %%xmm2\n\t" \
"pslldq $" #n ", %%xmm2\n\t" \
"pcmpeqb %%xmm0, %%xmm2\n\t" \
/* shift the mask back, discarding the n bytes that compared against the shifted-in zero padding */ \
"psrldq $" #n ", %%xmm2\n\t" \
"pmovmskb %%xmm2, %%ecx\n\t" \
"mov %%ecx, %%edx\n\t" \
"shl $1, %%edx\n\t" \
"and %%edx, %%ecx\n\t" \
"shl $1, %%edx\n\t" \
"and %%edx, %%ecx\n\t" \
"or %%ecx, %0\n\t" \
int lcs_32_rough_5(const uint32_t * const str1, const uint32_t * const str2) {
register unsigned int x;
#ifndef __SSSE3__
static uint32_t buf1[4] __attribute__((aligned(16)));
static uint32_t buf2[4] __attribute__((aligned(16)));
size_t i;
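/* No pshufb available: reverse all 16 bytes in software by storing the
   words in reverse order (i^3) and byte-swapping each one, matching the
   full-register reversal the pshufb path performs. */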
for (i=0; i<4; i++) {
buf1[i^3] = bswap_32(str1[i]);
buf2[i^3] = bswap_32(str2[i]);
}
#else
/* pshufb mask that reverses all 16 bytes of the register (the "endian swap");
   it must be 16-byte aligned because pshufb takes it as an m128 operand */
static const uint8_t mask[16] __attribute__((aligned(16))) =
    {0x0F, 0x0E, 0x0D, 0x0C, 0x0B, 0x0A, 0x09, 0x08,
     0x07, 0x06, 0x05, 0x04, 0x03, 0x02, 0x01, 0x00};
const uint32_t * const buf1 = str1;
const uint32_t * const buf2 = str2;
#endif
/* The pointers are dereferenced by the asm below, but in 32-bit mode there
   aren't enough GPRs left to also pass the data as constraints on that
   __asm__ itself.  This empty asm marks the memory as read instead, which
   prevents the compiler from reordering around it while leaving two extra
   GPRs free. */
__asm__ volatile("" : : "m"(*str1), "m"(*str2));
__asm__ volatile (
/* moving unmodified ones into place */
"movapd (%1), %%xmm0\n\t"
"movapd (%2), %%xmm1\n\t"
#ifdef __SSSE3__
/* reverse all 16 bytes of each register */
"pshufb (%3), %%xmm0\n\t"
"pshufb (%3), %%xmm1\n\t"
#endif
/* clear the accumulator */
"xor %0, %0\n\t"
LBLOCK(0)
LBLOCK(1)
LBLOCK(2)
LBLOCK(3)
LBLOCK(4)
LBLOCK(5)
LBLOCK(6)
LBLOCK(7)
LBLOCK(8)
LBLOCK(9)
LBLOCK(10)
LBLOCK(11)
LBLOCK(12)
/* no RBLOCK(0) because it's equivalent to LBLOCK(0) */
RBLOCK(1)
RBLOCK(2)
RBLOCK(3)
RBLOCK(4)
RBLOCK(5)
RBLOCK(6)
RBLOCK(7)
RBLOCK(8)
RBLOCK(9)
RBLOCK(10)
RBLOCK(11)
RBLOCK(12)
/* This is crazy, but the shortest way I know of to shift xmm by 4 bits. */
"movapd %%xmm0, %%xmm2\n\t"
"psllq $4, %%xmm0\n\t"
"psrlq $60, %%xmm2\n\t"
/* Shuffle xmm2 around (in dwords) -- after the shift right by 60, dwords
 * 1 and 3 of xmm2 are zero, so they can fill the three lanes we don't
 * care about.  The constant 69 = 0b01_00_01_01 selects [zero] [carry]
 * [zero] [zero] (high dword to low), placing the bits carried out of the
 * low qword at the bottom of the high qword. */
"pshufd $69, %%xmm2, %%xmm2\n\t"
/* Any of padd, por, or pxor works to merge here, since a given bit
 * position can be set in at most one of the two registers. */
"pxor %%xmm2, %%xmm0\n\t"
LBLOCK(0)
LBLOCK(1)
LBLOCK(2)
LBLOCK(3)
LBLOCK(4)
LBLOCK(5)
LBLOCK(6)
LBLOCK(7)
LBLOCK(8)
LBLOCK(9)
LBLOCK(10)
LBLOCK(11)
LBLOCK(12)
RBLOCK(1)
RBLOCK(2)
RBLOCK(3)
RBLOCK(4)
RBLOCK(5)
RBLOCK(6)
RBLOCK(7)
RBLOCK(8)
RBLOCK(9)
RBLOCK(10)
RBLOCK(11)
RBLOCK(12)
"1:"
: "=r"(x) : "r"(buf1), "r"(buf2)
#ifdef __SSSE3__
, "r"(mask)
#endif
: "xmm0", "xmm1", "xmm2", "xmm3", "edx", "ecx");
//printf("ret %u\n", x);
return ((x != 0) ? 8 : 0);
}
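/* A minimal usage sketch (hypothetical, not part of the original interface):
 * each input is 16 bytes and must be 16-byte aligned, because the asm loads
 * it with movapd.  Build with something like
 *   gcc -O2 -mssse3 -DLCS_32_ROUGH_5_DEMO file.c
 * where LCS_32_ROUGH_5_DEMO is a made-up guard that keeps the demo out of
 * normal builds. */
#ifdef LCS_32_ROUGH_5_DEMO
#include <stdio.h>
int main(void) {
    static const uint32_t a[4] __attribute__((aligned(16))) =
        {0x01234567, 0x89abcdef, 0x02468ace, 0x13579bdf};
    static const uint32_t b[4] __attribute__((aligned(16))) =
        {0x55555555, 0x01234567, 0x55555555, 0x55555555};
    static const uint32_t c[4] __attribute__((aligned(16))) =
        {0x55555555, 0x55555555, 0x55555555, 0x55555555};
    /* b contains one of a's words, so four consecutive bytes match: prints 8 */
    printf("a vs b: %d\n", lcs_32_rough_5(a, b));
    /* a has no 0x55 bytes (or adjacent 5,5 nibbles), so this should print 0 */
    printf("a vs c: %d\n", lcs_32_rough_5(a, c));
    return 0;
}
#endif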