summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorEric Anderson <ejona86@gmail.com>2010-12-26 08:47:02 -0600
committerEric Anderson <ejona86@gmail.com>2010-12-26 08:47:02 -0600
commitfbe623251cc4c8ebcab2cd7c6a4e8e2c9a5f6ed7 (patch)
tree23ced470bd9f9b4357429ba5b1a134337a2f6d1d
parentb20c3c2c8468cacc6b0b73b0c15cf1cbb7ced23a (diff)
downloadmd5game-fbe623251cc4c8ebcab2cd7c6a4e8e2c9a5f6ed7.tar.gz
md5game-fbe623251cc4c8ebcab2cd7c6a4e8e2c9a5f6ed7.zip
Add lcs_upper_bound check before lcs_32_rough for added speed
-rw-r--r--lcs.c217
-rw-r--r--runner.c291
2 files changed, 508 insertions, 0 deletions
diff --git a/lcs.c b/lcs.c
index 2a9de47..3b1f7b3 100644
--- a/lcs.c
+++ b/lcs.c
@@ -0,0 +1,217 @@
+/* From wikibooks, longest common substring with fixed size = 32.
+ Don't think curr and prev need to be ints.
+ */
+#include <stdlib.h>
+#include <string.h>
+
+
+unsigned char tbuf[32] __attribute__((aligned(32)));
+#if 0
+unsigned char tbuf2[32] __attribute__((aligned(32)));
+#endif
+
+/* TYPE must be a single word; use typedef to make it so */
+typedef unsigned int uint32;
+/* LENGTH1 must be >= LENGTH2 */
+
+#define LCS_MEMDEF(LENGTH) \
+unsigned char counts##LENGTH##_1[LENGTH] __attribute__((aligned(32))); \
+unsigned char counts##LENGTH##_2[LENGTH] __attribute__((aligned(32)));
+
+#define LCS(TYPE, LENGTH1, LENGTH2) \
+int lcs_##TYPE##_##LENGTH1##_##LENGTH2(TYPE *str1, TYPE *str2) { \
+ unsigned char *curr = counts##LENGTH1##_1; \
+ unsigned char *prev = counts##LENGTH1##_2; \
+ unsigned char *t = NULL; \
+ unsigned char maxSubstr = 0; \
+ \
+ unsigned char i, j; \
+ /* i index into str1, j index into str2 */ \
+ /* i=0 unrolled from the main loop */ \
+ TYPE c1 = str1[0]; \
+ for(j = 0; j<LENGTH2/sizeof(TYPE); ++j) \
+ { \
+ if (c1 == str2[j]){ \
+ prev[j] = 1; \
+ maxSubstr = 1; \
+ } else { \
+ prev[j] = 0; \
+ } \
+ } \
+ for(i = 1; i<LENGTH1/sizeof(TYPE); ++i) \
+ { \
+ TYPE c = str1[i]; \
+ if(c == str2[0]){ /* j=0 unrolled from loop */ \
+ curr[0] = 1; \
+ if (!maxSubstr) { \
+ maxSubstr = 1; \
+ } \
+ } else { \
+ curr[0] = 0; \
+ } \
+ for(j = 1; j<LENGTH2/sizeof(TYPE); ++j) \
+ { \
+ if(c != str2[j]) \
+ { \
+ curr[j] = 0; \
+ } \
+ else \
+ { \
+ curr[j] = 1 + prev[j-1]; \
+ /* The next if can be replaced with: */ \
+ /* maxSubstr = max(maxSubstr, table[i][j]); */ \
+ /* (You need algorithm.h library for using max()) */ \
+ if(maxSubstr < curr[j]) \
+ { \
+ maxSubstr = curr[j]; \
+ } \
+ } \
+ } \
+ t = prev; \
+ prev = curr; \
+ curr = t; \
+ } \
+ return maxSubstr; \
+}
+
+/* simplistic MAX macro, easily misused */
+#define MAX(x,y) ((x>y)?x:y)
+
+LCS_MEMDEF(32)
+
+/* Select one of the rough lcs implementations below */
+
+#if 0 // Yields similar perf to the unsigned int version, but has a much larger error
+#define LCS_ERROR 14
+typedef unsigned long long ull;
+LCS(ull, 32, 32)
+LCS(ull, 32, 24)
+int lcs_32_rough(unsigned char *str1, unsigned char *str2) {
+ int max_length = 0;
+ int max_length_next = 0;
+ int offset;
+ max_length = lcs_ull_32_32((unsigned long long*)str1, (unsigned long long*)str2);
+ for (offset=1; offset < sizeof(unsigned long long); offset++) {
+ memcpy(tbuf, str2+offset, 24);
+ max_length_next = lcs_ull_32_24((unsigned long long*)str1, (unsigned long long*)tbuf);
+ max_length = MAX(max_length, max_length_next);
+ }
+ return max_length;
+}
+#endif
+
+#if 1 // Yields 1120K/s
+#define LCS_ERROR 6
+LCS(uint32, 32, 32)
+LCS(uint32, 32, 28)
+
+
+#ifdef USE_LCS4
+int lcs_32_rough(unsigned char *str1, unsigned char *str2) {
+ int max_length = 0;
+ int max_length_next = 0;
+ int offset;
+ max_length = lcs_uint32_32_32((unsigned int*)str1, (unsigned int*)str2);
+ for (offset=1; offset < sizeof(unsigned int); offset++) {
+ memcpy(tbuf, str2+offset, 28);
+ max_length_next = lcs_uint32_32_28((unsigned int*)str1, (unsigned int*)tbuf);
+ max_length = MAX(max_length, max_length_next);
+ }
+ return max_length;
+}
+#endif
+
+#endif
+
+#if 0 // 645K/s w/error=1, 859K/s w/error=2
+typedef unsigned short ushort;
+LCS_MEMDEF(30)
+LCS(ushort, 32, 32)
+LCS(ushort, 32, 30)
+LCS(ushort, 30, 30)
+
+/* May under-count by 2 */
+int lcs_32_rough(unsigned char *str1, unsigned char *str2) {
+ int max_length = 0;
+ int max_length_next = 0;
+ int offset;
+ max_length = lcs_ushort_32_32((unsigned short*)str1, (unsigned short*)str2);
+
+ memcpy(tbuf, str2+1, 30);
+ max_length_next = lcs_ushort_32_30((unsigned short*)str1, (unsigned short*)tbuf);
+ max_length = MAX(max_length, max_length_next);
+
+#if 0 /* Removes the off-by-two errors, yields Benchmark: 645/s */
+#define LCS_ERROR 1
+ memcpy(tbuf2, str1+1, 30);
+ max_length_next = lcs_ushort_32_30((unsigned short*)str2, (unsigned short*)tbuf2);
+ max_length = MAX(max_length, max_length_next);
+
+ max_length_next = lcs_ushort_32_30((unsigned short*)tbuf, (unsigned short*)tbuf2);
+ max_length = MAX(max_length, max_length_next);
+#else /* yields Benchmark: 859K/s */
+#define LCS_ERROR 2
+#endif
+
+ return max_length;
+}
+#endif
+
+#ifdef USE_PRE_LCS
+#include "arpa/inet.h"
+int lcs_upper_bound(const uint32_t * const str1_int, const uint32_t * const str2_int) {
+ static char contains[256] = {0};
+ uint32_t bo_str2_int[4];
+ uint32_t shifted_str2_int[4];
+ const unsigned char * const str1 = (const unsigned char * const)str1_int;
+ const unsigned char * const str2 = (const unsigned char * const)str2_int;
+ const unsigned char * const str2_r = (const unsigned char * const)shifted_str2_int;
+ int first_sum, second_sum;
+
+ bo_str2_int[0] = ntohl(str2_int[0]);
+ bo_str2_int[1] = ntohl(str2_int[1]);
+ bo_str2_int[2] = ntohl(str2_int[2]);
+ bo_str2_int[3] = ntohl(str2_int[3]);
+
+ shifted_str2_int[0] = (bo_str2_int[0] << 4) | (bo_str2_int[1] >> 28);
+ shifted_str2_int[1] = (bo_str2_int[1] << 4) | (bo_str2_int[2] >> 28);
+ shifted_str2_int[2] = (bo_str2_int[2] << 4) | (bo_str2_int[3] >> 28);
+ shifted_str2_int[3] = (bo_str2_int[3] << 4) | (bo_str2_int[0] >> 28);
+
+ contains[str1[0]] = contains[str1[1]] = contains[str1[2]]
+ = contains[str1[3]] = contains[str1[4]] = contains[str1[5]]
+ = contains[str1[6]] = contains[str1[7]] = contains[str1[8]]
+ = contains[str1[9]] = contains[str1[10]] = contains[str1[11]]
+ = contains[str1[12]] = contains[str1[13]] = contains[str1[14]]
+ = contains[str1[15]] = 1;
+
+ first_sum = contains[str2[0]] + contains[str2[1]] + contains[str2[2]]
+ + contains[str2[3]] + contains[str2[4]] + contains[str2[5]]
+ + contains[str2[6]] + contains[str2[7]] + contains[str2[8]]
+ + contains[str2[9]] + contains[str2[10]] + contains[str2[11]]
+ + contains[str2[12]] + contains[str2[13]] + contains[str2[14]]
+ + contains[str2[15]];
+
+ second_sum = contains[str2_r[0]] + contains[str2_r[1]] + contains[str2_r[2]]
+ + contains[str2_r[3]] + contains[str2_r[4]] + contains[str2_r[5]]
+ + contains[str2_r[6]] + contains[str2_r[7]] + contains[str2_r[8]]
+ + contains[str2_r[9]] + contains[str2_r[10]] + contains[str2_r[11]]
+ + contains[str2_r[12]] + contains[str2_r[13]] + contains[str2_r[14]]
+ + contains[str2_r[15]];
+
+ contains[str1[0]] = contains[str1[1]] = contains[str1[2]]
+ = contains[str1[3]] = contains[str1[4]] = contains[str1[5]]
+ = contains[str1[6]] = contains[str1[7]] = contains[str1[8]]
+ = contains[str1[9]] = contains[str1[10]] = contains[str1[11]]
+ = contains[str1[12]] = contains[str1[13]] = contains[str1[14]]
+ = contains[str1[15]] = 0;
+
+ // can be off up to two nibbles
+ return MAX(first_sum, second_sum) * 2 + 2;
+}
+#endif
+
+/* Exact lcs implementation */
+typedef unsigned char uchar;
+LCS(uchar, 32, 32)
+
diff --git a/runner.c b/runner.c
index 163a4ac..889fe5b 100644
--- a/runner.c
+++ b/runner.c
@@ -0,0 +1,291 @@
+#include <stdlib.h>
+#include <assert.h>
+#include <stdio.h>
+#include <stdint.h>
+#include <string.h>
+
+#include "tsc.c"
+#include "md5.c"
+#define MD5_DIGEST_LENGTH 16
+
+#define USE_ROUGH
+//#define TESTING
+//#define SINGLERUN
+//#define PEDANTIC
+#ifndef BENCH_VAL
+#define BENCH_VAL 8000000
+#endif
+
+/* With MSF 11, we may miss 10's, but we don't have the false positives.
+ * But we only get ~1110K/s instead of the ~1115K/s we get with MSF 8. Whahuh?
+ */
+#define MSF 8
+
+
+unsigned char tbl[512];
+md5_state_t global_md5;
+
+#if 0
+__inline__ void MD5(unsigned char *input, unsigned int input_len,
+ unsigned char *out) {
+ md5_init(&global_md5);
+ md5_append(&global_md5, (const md5_byte_t *)input, input_len);
+ md5_finish(&global_md5, out);
+}
+#endif
+
+void raw_MD5_64(unsigned char* input, unsigned char* digest) {
+ /* initialize what we need in the md5_state_t */
+ global_md5.abcd[0] = 0x67452301;
+ global_md5.abcd[1] = 0xefcdab89;
+ global_md5.abcd[2] = 0x98badcfe;
+ global_md5.abcd[3] = 0x10325476;
+
+ /* process the fixed-sized input, pre-padded with the length */
+ md5_process(&global_md5, input);
+
+ /* and finally store the digest
+ * TODO: convert straight to hex?
+ */
+ unsigned int i;
+ for (i = 0; i < 16; ++i) {
+#if BYTE_ORDER == LITTLE_ENDIAN
+ memcpy(digest, tbl + ((unsigned char*)(global_md5.abcd))[i] * 2, 2);
+#elif BYTE_ORDER == BIG_ENDIAN
+ memcpy(digest, tbl + ((unsigned char*)(global_md5.abcd))[i^0x3] * 2, 2);
+#else
+#error Unknown endianness
+#endif
+ digest += 2;
+ }
+}
+
+#ifndef TIGER
+#include "srandomdev.c"
+#endif
+
+#include "lcs.c"
+#include "lcs_sse.c"
+
+/* timing functions */
+struct timeval stopwatch_start;
+struct timeval stopwatch_stop;
+
+void start_stopwatch() {
+ gettimeofday(&stopwatch_start, NULL);
+}
+
+unsigned long stop_stopwatch() {
+ int seconds, usecs;
+ gettimeofday(&stopwatch_stop, NULL);
+ seconds = stopwatch_stop.tv_sec - stopwatch_start.tv_sec;
+ usecs = stopwatch_stop.tv_usec - stopwatch_start.tv_usec;
+ if (usecs < 0) {
+ seconds -= 1;
+ usecs += 1000000;
+ }
+ return seconds * 1000000 + usecs;
+}
+
+const char *hexchar = "0123456789abcdef";
+
+unsigned char desttohex[64];
+
+unsigned char test_input[] = {
+// 0x00, 0x04, 0x89, 0x4e, 0xf8, 0x11, 0x06, 0xbf, 0x87, 0xe3, 0x35, 0x6f, 0x68, 0xa5, 0xfa, 0xb3,
+// 0xcf, 0xa2, 0x0e, 0x2a, 0xda, 0xfc, 0x8e, 0x06, 0x66, 0x7b, 0x99, 0xa4, 0x06, 0x21, 0x95, 0x63,
+/* Eli's standard */
+0x22, 0x2b, 0x5b, 0x3c, 0xf7, 0xb9, 0xc2, 0x65, 0xc0, 0x4d, 0xab, 0x6d, 0xc6, 0xb9, 0x56, 0xeb,
+// 0xc4, 0x48, 0x13, 0xf8, 0x8e, 0xff, 0xd3, 0x03, 0x4f, 0x22, 0x0b, 0xe7, 0xe5, 0xe3, 0x35, 0xa0,
+/* One that fails with FALSE NEGATIVE2 when using non-byteswapped sse lcs */
+/* 0xee, 0xf5, 0x67, 0x7f, 0x61, 0x88, 0xcb, 0xea, 0x8f, 0x21, 0x13, 0x85, 0x1f, 0xac, 0x4d, 0xab */
+};
+
+unsigned char input_raw[MD5_DIGEST_LENGTH];
+
+unsigned char input_hex[64] __attribute__((aligned(64)));
+
+/* Writes hexadecimal form of argument to desttohex, no terminating null
+ * character.
+ */
+void tohex(unsigned char *md5val) {
+ unsigned char *c = desttohex;
+ unsigned int i;
+
+ for(i=0; i<MD5_DIGEST_LENGTH; i++) {
+ int x = md5val[i] * 2;
+ memcpy(c, tbl+x, 2);
+ c += 2;
+ }
+}
+
+void inc_input() {
+ unsigned int i=0;
+ do {
+ unsigned char val = input_raw[i] = input_raw[i] + 1;
+ memcpy(input_hex + 2*i, tbl + 2*val, 2);
+ if (val) {
+ /* if val is now 0, it overflowed, so we need to go to the next digit;
+ * but if non-zero, we didn't overflow, so we're done. */
+ break;
+ }
+ i++;
+ } while (i<MD5_DIGEST_LENGTH);
+}
+
+void print_input_hex() {
+ unsigned int x;
+ for (x=0; x<32; x++) {
+ printf("%c", input_hex[x]);
+ }
+}
+
+int main() {
+ /* setup code */
+ int x;
+ unsigned char *ct = tbl;
+ for (x=0; x<256; x++) {
+ *ct++ = hexchar[(x & 0xf0) >> 4];
+ *ct++ = hexchar[(x & 0xf)];
+ }
+ desttohex[32] = '\0';
+ for (x=0; x<64; x++) {
+ input_hex[x] = '\0';
+ }
+ input_hex[32] = 0x80; // start of md5 pad
+ input_hex[57] = 0x01; // hardcoded length of 256 bits
+
+ int n;
+ unsigned int i = 0;
+ unsigned int matches_found = 0;
+ unsigned int possible_matches = 0;
+
+ printf("%p %p %p\n", desttohex, input_hex, tbl);
+ /* Generate a random start point */
+#ifdef TESTING
+ memcpy(input_raw, test_input, MD5_DIGEST_LENGTH);
+#else
+ srandomdev();
+ for(i=0; i<16; i++) {
+ input_raw[i] = random() & 0xff;
+ }
+ i = 0;
+#endif
+ tohex(input_raw);
+ memcpy((char *)input_hex, (char *)desttohex, 32);
+ printf("Starting at ");
+ print_input_hex();
+ printf("\n");
+ fflush(stdout);
+
+#ifdef TIMINGS
+ unsigned long long t0, t1;
+#endif
+ /* Loop */
+ start_stopwatch();
+ while (1) {
+#ifdef TIMINGS
+ t0 = rdtsc();
+#endif
+ raw_MD5_64(input_hex, desttohex);
+#ifdef TIMINGS
+ t1 = rdtsc();
+ if (i % 1000000 == 0) printf("md5=%llu\n", (t1-t0-63));
+#endif
+
+#ifdef USE_PRE_LCS
+ int pre_n = lcs_upper_bound((uint32_t*)input_raw, (uint32_t*)global_md5.abcd);
+ if (pre_n >= MSF) {
+#endif
+
+#ifdef USE_ROUGH
+#ifdef TIMINGS
+ t0 = rdtsc();
+#endif
+#ifdef USE_LCS4
+ n = lcs_32_rough(input_hex, desttohex);
+#else
+ n = lcs_32_rough((uint32_t*)input_raw, (uint32_t*)global_md5.abcd);
+#endif
+#ifdef TIMINGS
+ t1 = rdtsc();
+ if (i % 1000000 == 0) printf("rough=%llu\n", (t1-t0-63));
+#endif
+ /* Based on testing, the 'rough' match may undercount by up to 6
+ * characters (for an int32-based implementation). It now returns
+ * 1/4th the length of the match it found, so it will return >0 if
+ * there is a match of 7 or greater, and sometimes return >0 if there
+ * is a match of 4 or greater.
+ */
+ if (n > 0) {
+ possible_matches++;
+#ifdef TIMINGS
+ t0 = rdtsc();
+#endif
+ n = lcs_uchar_32_32(input_hex, desttohex); /* get the exact number */
+#ifdef TIMINGS
+ t1 = rdtsc();
+#endif
+ if (n >= MSF) {
+#ifndef SINGLERUN
+#ifdef TIMINGS
+ printf("full=%llu\n", (t1-t0-63));
+#endif
+ print_input_hex();
+ printf(" %s %d\n", desttohex, n);
+ fflush(stdout);
+#endif
+ matches_found++;
+#ifdef PEDANTIC
+ } else {
+ n = lcs_uchar_32_32(input_hex, desttohex);
+ if(n >= MSF) {
+ print_input_hex();
+ printf(" %d FALSE NEGATIVE1\n", n);
+ }
+#endif
+ }
+
+#ifdef PEDANTIC
+ } else {
+ n = lcs_uchar_32_32(input_hex, desttohex);
+ if(n >= MSF) {
+ printf("%s %d FALSE NEGATIVE2\n", input_hex, n);
+ }
+#endif
+ }
+#else
+ n = lcs_32(input_hex, desttohex);
+ if (n >= MSF) {
+ print_input_hex();
+ printf(" %s %d\n", desttohex, n);
+ fflush(stdout);
+ }
+#endif
+
+#ifdef USE_PRE_LCS
+ }
+#endif
+
+ /* keep us updated on speed */
+ i++;
+ if (i == BENCH_VAL) {
+ unsigned long elapsed = stop_stopwatch();
+ printf("Benchmark: %dK/s (found %d matches and %d false matches in %d processed)\n", (int) ((float)BENCH_VAL * 1000 / elapsed), matches_found, possible_matches - matches_found, i);
+#ifdef SINGLERUN
+ printf("%d processed in %ld us\n", BENCH_VAL, elapsed);
+#endif
+ fflush(stdout);
+ /* don't include the time it takes us to print out the benchmark */
+ start_stopwatch();
+ i = 0;
+ matches_found = 0;
+ possible_matches = 0;
+#ifdef SINGLERUN
+ return 0;
+#endif
+ }
+ inc_input();
+ }
+ return 0;
+}