diff options
author | Eric Anderson <ejona86@gmail.com> | 2010-12-26 08:47:02 -0600 |
---|---|---|
committer | Eric Anderson <ejona86@gmail.com> | 2010-12-26 08:47:02 -0600 |
commit | fbe623251cc4c8ebcab2cd7c6a4e8e2c9a5f6ed7 (patch) | |
tree | 23ced470bd9f9b4357429ba5b1a134337a2f6d1d | |
parent | b20c3c2c8468cacc6b0b73b0c15cf1cbb7ced23a (diff) | |
download | md5game-fbe623251cc4c8ebcab2cd7c6a4e8e2c9a5f6ed7.tar.gz md5game-fbe623251cc4c8ebcab2cd7c6a4e8e2c9a5f6ed7.zip |
Add lcs_upper_bound check before lcs_32_rough for added speed
-rw-r--r-- | lcs.c | 217 | ||||
-rw-r--r-- | runner.c | 291 |
2 files changed, 508 insertions, 0 deletions
@@ -0,0 +1,217 @@ +/* From wikibooks, longest common substring with fixed size = 32. + Don't think curr and prev need to be ints. + */ +#include <stdlib.h> +#include <string.h> + + +unsigned char tbuf[32] __attribute__((aligned(32))); +#if 0 +unsigned char tbuf2[32] __attribute__((aligned(32))); +#endif + +/* TYPE must be a single word; use typedef to make it so */ +typedef unsigned int uint32; +/* LENGTH1 must be >= LENGTH2 */ + +#define LCS_MEMDEF(LENGTH) \ +unsigned char counts##LENGTH##_1[LENGTH] __attribute__((aligned(32))); \ +unsigned char counts##LENGTH##_2[LENGTH] __attribute__((aligned(32))); + +#define LCS(TYPE, LENGTH1, LENGTH2) \ +int lcs_##TYPE##_##LENGTH1##_##LENGTH2(TYPE *str1, TYPE *str2) { \ + unsigned char *curr = counts##LENGTH1##_1; \ + unsigned char *prev = counts##LENGTH1##_2; \ + unsigned char *t = NULL; \ + unsigned char maxSubstr = 0; \ + \ + unsigned char i, j; \ + /* i index into str1, j index into str2 */ \ + /* i=0 unrolled from the main loop */ \ + TYPE c1 = str1[0]; \ + for(j = 0; j<LENGTH2/sizeof(TYPE); ++j) \ + { \ + if (c1 == str2[j]){ \ + prev[j] = 1; \ + maxSubstr = 1; \ + } else { \ + prev[j] = 0; \ + } \ + } \ + for(i = 1; i<LENGTH1/sizeof(TYPE); ++i) \ + { \ + TYPE c = str1[i]; \ + if(c == str2[0]){ /* j=0 unrolled from loop */ \ + curr[0] = 1; \ + if (!maxSubstr) { \ + maxSubstr = 1; \ + } \ + } else { \ + curr[0] = 0; \ + } \ + for(j = 1; j<LENGTH2/sizeof(TYPE); ++j) \ + { \ + if(c != str2[j]) \ + { \ + curr[j] = 0; \ + } \ + else \ + { \ + curr[j] = 1 + prev[j-1]; \ + /* The next if can be replaced with: */ \ + /* maxSubstr = max(maxSubstr, table[i][j]); */ \ + /* (You need algorithm.h library for using max()) */ \ + if(maxSubstr < curr[j]) \ + { \ + maxSubstr = curr[j]; \ + } \ + } \ + } \ + t = prev; \ + prev = curr; \ + curr = t; \ + } \ + return maxSubstr; \ +} + +/* simplistic MAX macro, easily misused */ +#define MAX(x,y) ((x>y)?x:y) + +LCS_MEMDEF(32) + +/* Select one of the rough lcs implementations below */ + +#if 0 // Yields similar perf to the unsigned int version, but has a much larger error +#define LCS_ERROR 14 +typedef unsigned long long ull; +LCS(ull, 32, 32) +LCS(ull, 32, 24) +int lcs_32_rough(unsigned char *str1, unsigned char *str2) { + int max_length = 0; + int max_length_next = 0; + int offset; + max_length = lcs_ull_32_32((unsigned long long*)str1, (unsigned long long*)str2); + for (offset=1; offset < sizeof(unsigned long long); offset++) { + memcpy(tbuf, str2+offset, 24); + max_length_next = lcs_ull_32_24((unsigned long long*)str1, (unsigned long long*)tbuf); + max_length = MAX(max_length, max_length_next); + } + return max_length; +} +#endif + +#if 1 // Yields 1120K/s +#define LCS_ERROR 6 +LCS(uint32, 32, 32) +LCS(uint32, 32, 28) + + +#ifdef USE_LCS4 +int lcs_32_rough(unsigned char *str1, unsigned char *str2) { + int max_length = 0; + int max_length_next = 0; + int offset; + max_length = lcs_uint32_32_32((unsigned int*)str1, (unsigned int*)str2); + for (offset=1; offset < sizeof(unsigned int); offset++) { + memcpy(tbuf, str2+offset, 28); + max_length_next = lcs_uint32_32_28((unsigned int*)str1, (unsigned int*)tbuf); + max_length = MAX(max_length, max_length_next); + } + return max_length; +} +#endif + +#endif + +#if 0 // 645K/s w/error=1, 859K/s w/error=2 +typedef unsigned short ushort; +LCS_MEMDEF(30) +LCS(ushort, 32, 32) +LCS(ushort, 32, 30) +LCS(ushort, 30, 30) + +/* May under-count by 2 */ +int lcs_32_rough(unsigned char *str1, unsigned char *str2) { + int max_length = 0; + int max_length_next = 0; + int offset; + max_length = lcs_ushort_32_32((unsigned short*)str1, (unsigned short*)str2); + + memcpy(tbuf, str2+1, 30); + max_length_next = lcs_ushort_32_30((unsigned short*)str1, (unsigned short*)tbuf); + max_length = MAX(max_length, max_length_next); + +#if 0 /* Removes the off-by-two errors, yields Benchmark: 645/s */ +#define LCS_ERROR 1 + memcpy(tbuf2, str1+1, 30); + max_length_next = lcs_ushort_32_30((unsigned short*)str2, (unsigned short*)tbuf2); + max_length = MAX(max_length, max_length_next); + + max_length_next = lcs_ushort_32_30((unsigned short*)tbuf, (unsigned short*)tbuf2); + max_length = MAX(max_length, max_length_next); +#else /* yields Benchmark: 859K/s */ +#define LCS_ERROR 2 +#endif + + return max_length; +} +#endif + +#ifdef USE_PRE_LCS +#include "arpa/inet.h" +int lcs_upper_bound(const uint32_t * const str1_int, const uint32_t * const str2_int) { + static char contains[256] = {0}; + uint32_t bo_str2_int[4]; + uint32_t shifted_str2_int[4]; + const unsigned char * const str1 = (const unsigned char * const)str1_int; + const unsigned char * const str2 = (const unsigned char * const)str2_int; + const unsigned char * const str2_r = (const unsigned char * const)shifted_str2_int; + int first_sum, second_sum; + + bo_str2_int[0] = ntohl(str2_int[0]); + bo_str2_int[1] = ntohl(str2_int[1]); + bo_str2_int[2] = ntohl(str2_int[2]); + bo_str2_int[3] = ntohl(str2_int[3]); + + shifted_str2_int[0] = (bo_str2_int[0] << 4) | (bo_str2_int[1] >> 28); + shifted_str2_int[1] = (bo_str2_int[1] << 4) | (bo_str2_int[2] >> 28); + shifted_str2_int[2] = (bo_str2_int[2] << 4) | (bo_str2_int[3] >> 28); + shifted_str2_int[3] = (bo_str2_int[3] << 4) | (bo_str2_int[0] >> 28); + + contains[str1[0]] = contains[str1[1]] = contains[str1[2]] + = contains[str1[3]] = contains[str1[4]] = contains[str1[5]] + = contains[str1[6]] = contains[str1[7]] = contains[str1[8]] + = contains[str1[9]] = contains[str1[10]] = contains[str1[11]] + = contains[str1[12]] = contains[str1[13]] = contains[str1[14]] + = contains[str1[15]] = 1; + + first_sum = contains[str2[0]] + contains[str2[1]] + contains[str2[2]] + + contains[str2[3]] + contains[str2[4]] + contains[str2[5]] + + contains[str2[6]] + contains[str2[7]] + contains[str2[8]] + + contains[str2[9]] + contains[str2[10]] + contains[str2[11]] + + contains[str2[12]] + contains[str2[13]] + contains[str2[14]] + + contains[str2[15]]; + + second_sum = contains[str2_r[0]] + contains[str2_r[1]] + contains[str2_r[2]] + + contains[str2_r[3]] + contains[str2_r[4]] + contains[str2_r[5]] + + contains[str2_r[6]] + contains[str2_r[7]] + contains[str2_r[8]] + + contains[str2_r[9]] + contains[str2_r[10]] + contains[str2_r[11]] + + contains[str2_r[12]] + contains[str2_r[13]] + contains[str2_r[14]] + + contains[str2_r[15]]; + + contains[str1[0]] = contains[str1[1]] = contains[str1[2]] + = contains[str1[3]] = contains[str1[4]] = contains[str1[5]] + = contains[str1[6]] = contains[str1[7]] = contains[str1[8]] + = contains[str1[9]] = contains[str1[10]] = contains[str1[11]] + = contains[str1[12]] = contains[str1[13]] = contains[str1[14]] + = contains[str1[15]] = 0; + + // can be off up to two nibbles + return MAX(first_sum, second_sum) * 2 + 2; +} +#endif + +/* Exact lcs implementation */ +typedef unsigned char uchar; +LCS(uchar, 32, 32) + @@ -0,0 +1,291 @@ +#include <stdlib.h> +#include <assert.h> +#include <stdio.h> +#include <stdint.h> +#include <string.h> + +#include "tsc.c" +#include "md5.c" +#define MD5_DIGEST_LENGTH 16 + +#define USE_ROUGH +//#define TESTING +//#define SINGLERUN +//#define PEDANTIC +#ifndef BENCH_VAL +#define BENCH_VAL 8000000 +#endif + +/* With MSF 11, we may miss 10's, but we don't have the false positives. + * But we only get ~1110K/s instead of the ~1115K/s we get with MSF 8. Whahuh? + */ +#define MSF 8 + + +unsigned char tbl[512]; +md5_state_t global_md5; + +#if 0 +__inline__ void MD5(unsigned char *input, unsigned int input_len, + unsigned char *out) { + md5_init(&global_md5); + md5_append(&global_md5, (const md5_byte_t *)input, input_len); + md5_finish(&global_md5, out); +} +#endif + +void raw_MD5_64(unsigned char* input, unsigned char* digest) { + /* initialize what we need in the md5_state_t */ + global_md5.abcd[0] = 0x67452301; + global_md5.abcd[1] = 0xefcdab89; + global_md5.abcd[2] = 0x98badcfe; + global_md5.abcd[3] = 0x10325476; + + /* process the fixed-sized input, pre-padded with the length */ + md5_process(&global_md5, input); + + /* and finally store the digest + * TODO: convert straight to hex? + */ + unsigned int i; + for (i = 0; i < 16; ++i) { +#if BYTE_ORDER == LITTLE_ENDIAN + memcpy(digest, tbl + ((unsigned char*)(global_md5.abcd))[i] * 2, 2); +#elif BYTE_ORDER == BIG_ENDIAN + memcpy(digest, tbl + ((unsigned char*)(global_md5.abcd))[i^0x3] * 2, 2); +#else +#error Unknown endianness +#endif + digest += 2; + } +} + +#ifndef TIGER +#include "srandomdev.c" +#endif + +#include "lcs.c" +#include "lcs_sse.c" + +/* timing functions */ +struct timeval stopwatch_start; +struct timeval stopwatch_stop; + +void start_stopwatch() { + gettimeofday(&stopwatch_start, NULL); +} + +unsigned long stop_stopwatch() { + int seconds, usecs; + gettimeofday(&stopwatch_stop, NULL); + seconds = stopwatch_stop.tv_sec - stopwatch_start.tv_sec; + usecs = stopwatch_stop.tv_usec - stopwatch_start.tv_usec; + if (usecs < 0) { + seconds -= 1; + usecs += 1000000; + } + return seconds * 1000000 + usecs; +} + +const char *hexchar = "0123456789abcdef"; + +unsigned char desttohex[64]; + +unsigned char test_input[] = { +// 0x00, 0x04, 0x89, 0x4e, 0xf8, 0x11, 0x06, 0xbf, 0x87, 0xe3, 0x35, 0x6f, 0x68, 0xa5, 0xfa, 0xb3, +// 0xcf, 0xa2, 0x0e, 0x2a, 0xda, 0xfc, 0x8e, 0x06, 0x66, 0x7b, 0x99, 0xa4, 0x06, 0x21, 0x95, 0x63, +/* Eli's standard */ +0x22, 0x2b, 0x5b, 0x3c, 0xf7, 0xb9, 0xc2, 0x65, 0xc0, 0x4d, 0xab, 0x6d, 0xc6, 0xb9, 0x56, 0xeb, +// 0xc4, 0x48, 0x13, 0xf8, 0x8e, 0xff, 0xd3, 0x03, 0x4f, 0x22, 0x0b, 0xe7, 0xe5, 0xe3, 0x35, 0xa0, +/* One that fails with FALSE NEGATIVE2 when using non-byteswapped sse lcs */ +/* 0xee, 0xf5, 0x67, 0x7f, 0x61, 0x88, 0xcb, 0xea, 0x8f, 0x21, 0x13, 0x85, 0x1f, 0xac, 0x4d, 0xab */ +}; + +unsigned char input_raw[MD5_DIGEST_LENGTH]; + +unsigned char input_hex[64] __attribute__((aligned(64))); + +/* Writes hexadecimal form of argument to desttohex, no terminating null + * character. + */ +void tohex(unsigned char *md5val) { + unsigned char *c = desttohex; + unsigned int i; + + for(i=0; i<MD5_DIGEST_LENGTH; i++) { + int x = md5val[i] * 2; + memcpy(c, tbl+x, 2); + c += 2; + } +} + +void inc_input() { + unsigned int i=0; + do { + unsigned char val = input_raw[i] = input_raw[i] + 1; + memcpy(input_hex + 2*i, tbl + 2*val, 2); + if (val) { + /* if val is now 0, it overflowed, so we need to go to the next digit; + * but if non-zero, we didn't overflow, so we're done. */ + break; + } + i++; + } while (i<MD5_DIGEST_LENGTH); +} + +void print_input_hex() { + unsigned int x; + for (x=0; x<32; x++) { + printf("%c", input_hex[x]); + } +} + +int main() { + /* setup code */ + int x; + unsigned char *ct = tbl; + for (x=0; x<256; x++) { + *ct++ = hexchar[(x & 0xf0) >> 4]; + *ct++ = hexchar[(x & 0xf)]; + } + desttohex[32] = '\0'; + for (x=0; x<64; x++) { + input_hex[x] = '\0'; + } + input_hex[32] = 0x80; // start of md5 pad + input_hex[57] = 0x01; // hardcoded length of 256 bits + + int n; + unsigned int i = 0; + unsigned int matches_found = 0; + unsigned int possible_matches = 0; + + printf("%p %p %p\n", desttohex, input_hex, tbl); + /* Generate a random start point */ +#ifdef TESTING + memcpy(input_raw, test_input, MD5_DIGEST_LENGTH); +#else + srandomdev(); + for(i=0; i<16; i++) { + input_raw[i] = random() & 0xff; + } + i = 0; +#endif + tohex(input_raw); + memcpy((char *)input_hex, (char *)desttohex, 32); + printf("Starting at "); + print_input_hex(); + printf("\n"); + fflush(stdout); + +#ifdef TIMINGS + unsigned long long t0, t1; +#endif + /* Loop */ + start_stopwatch(); + while (1) { +#ifdef TIMINGS + t0 = rdtsc(); +#endif + raw_MD5_64(input_hex, desttohex); +#ifdef TIMINGS + t1 = rdtsc(); + if (i % 1000000 == 0) printf("md5=%llu\n", (t1-t0-63)); +#endif + +#ifdef USE_PRE_LCS + int pre_n = lcs_upper_bound((uint32_t*)input_raw, (uint32_t*)global_md5.abcd); + if (pre_n >= MSF) { +#endif + +#ifdef USE_ROUGH +#ifdef TIMINGS + t0 = rdtsc(); +#endif +#ifdef USE_LCS4 + n = lcs_32_rough(input_hex, desttohex); +#else + n = lcs_32_rough((uint32_t*)input_raw, (uint32_t*)global_md5.abcd); +#endif +#ifdef TIMINGS + t1 = rdtsc(); + if (i % 1000000 == 0) printf("rough=%llu\n", (t1-t0-63)); +#endif + /* Based on testing, the 'rough' match may undercount by up to 6 + * characters (for an int32-based implementation). It now returns + * 1/4th the length of the match it found, so it will return >0 if + * there is a match of 7 or greater, and sometimes return >0 if there + * is a match of 4 or greater. + */ + if (n > 0) { + possible_matches++; +#ifdef TIMINGS + t0 = rdtsc(); +#endif + n = lcs_uchar_32_32(input_hex, desttohex); /* get the exact number */ +#ifdef TIMINGS + t1 = rdtsc(); +#endif + if (n >= MSF) { +#ifndef SINGLERUN +#ifdef TIMINGS + printf("full=%llu\n", (t1-t0-63)); +#endif + print_input_hex(); + printf(" %s %d\n", desttohex, n); + fflush(stdout); +#endif + matches_found++; +#ifdef PEDANTIC + } else { + n = lcs_uchar_32_32(input_hex, desttohex); + if(n >= MSF) { + print_input_hex(); + printf(" %d FALSE NEGATIVE1\n", n); + } +#endif + } + +#ifdef PEDANTIC + } else { + n = lcs_uchar_32_32(input_hex, desttohex); + if(n >= MSF) { + printf("%s %d FALSE NEGATIVE2\n", input_hex, n); + } +#endif + } +#else + n = lcs_32(input_hex, desttohex); + if (n >= MSF) { + print_input_hex(); + printf(" %s %d\n", desttohex, n); + fflush(stdout); + } +#endif + +#ifdef USE_PRE_LCS + } +#endif + + /* keep us updated on speed */ + i++; + if (i == BENCH_VAL) { + unsigned long elapsed = stop_stopwatch(); + printf("Benchmark: %dK/s (found %d matches and %d false matches in %d processed)\n", (int) ((float)BENCH_VAL * 1000 / elapsed), matches_found, possible_matches - matches_found, i); +#ifdef SINGLERUN + printf("%d processed in %ld us\n", BENCH_VAL, elapsed); +#endif + fflush(stdout); + /* don't include the time it takes us to print out the benchmark */ + start_stopwatch(); + i = 0; + matches_found = 0; + possible_matches = 0; +#ifdef SINGLERUN + return 0; +#endif + } + inc_input(); + } + return 0; +} |