brute forcer

Kevin McGuire · Post by **Kevin McGuire** » Tue Jul 03, 2007 8:18 pm

/// generate next sequence from current sequence.
// returns zero if no more sequences exist, or one for success.
int32_t sequence_next(uint8_t *sequence, uint32_t charset_length, uint32_t length)
{
   uint32_t x;
   for(x = 0; sequence[x] == (charset_length-1); ++x)
   {
      if(x == (length-1))
      {
         return 0;
      }
      sequence[x] = 0;
   }
   ++sequence[x];
   return 1;
}

struct sSequenceBatch
{
	uint8_t	*seq_offset;
	uint8_t	*seq_base;
	uint32_t	len_offset;
	uint32_t	len_base;
	uint32_t	state;
	uint32_t	charset_length;
};


struct sSequenceBatch *g_sequence_batch;
uint32_t g_thread_stop = 0;
uint32_t g_thread_active_cnt = 0;
uint32_t g_sequence_batch_cnt;

#define CHARSET_LENGTH		100
#define SEQUENCE_LENGTH		3
#define THREAD_COUNT		5

void* threaded_worker(uint32_t arg)
{
	uint8_t *sequence = 0;
	uint32_t state, x, y;
	for(x = 0; x < g_sequence_batch_cnt; ++x)
	{
		if(g_sequence_batch[x].state == 0x0)
		{
			// you can use zero instead of arg, but I thought it would leave a trace for debugging.
			state = arg;
			asm("xchg %0, %1" : "=m" (g_sequence_batch[x].state) : "r" (state));
			if(state == arg)
			{
				/// we have aquired a lock and may work this batch.
				if(sequence != 0)
				{
					free(sequence);
				}
				sequence = (uint8_t*)malloc(g_sequence_batch[x].len_offset + g_sequence_batch[x].len_base);
				// combining the offset and base provide a complete sequence.
				memcpy(sequence, g_sequence_batch[x].seq_offset, g_sequence_batch[x].len_offset);
				memcpy(&sequence[g_sequence_batch[x].len_offset], g_sequence_batch[x].seq_base, g_sequence_batch[x].len_base);
				// we only iliterate the offset, not the base!
				do_md5_hash_check_with_sequence_before_while_loop;
				if(we_find_the_answer_stop_everything)
				{
					g_thread_stop = 1;
				}
				while(sequence_next(sequence, g_sequence_batch[x].charset_length, g_sequence_batch[x].len_offset) == 1){
					// our sequence consists of [offset][base], with the base being the higher part.
					// the 'sequence_next' function will only modify the [offset] part since we
					// only told it the length of that part, once it tries to wrap around we exit and
					// get another batch..
					/// ----- do md5 hash stuff -----
					printf("id:%x --", arg);
					do_md5_hash_check_with_sequence_during_while_loop;
					if(we_find_the_answer_stop_everything)
					{
						g_thread_stop = 1;
					}
					for(y = 0; y < g_sequence_batch[x].len_offset + g_sequence_batch[x].len_base; ++y)
					{
						printf("%02x", sequence[y]);
					}
					printf("\n");
					/// check global state, do we need to stop now?
					if(g_thread_stop != 0)
					{
						// another thread found a collision.
						asm("lock decl %0" : "=m" (g_thread_active_cnt));
						printf("g_thread_active_cnt:%x\n", g_thread_active_cnt);
						return 0;		
					}
				}
			}
		}
	}
	/// we were unable to aquire another batch.
	asm("lock decl %0" : "=m" (g_thread_active_cnt));
	printf("g_thread_active_cnt:%x\n", g_thread_active_cnt);
	return 0;
}
int main(int argc, char *argv[])
{	
	pthread_t *thread_workers;
	uint32_t x;
	/// prepare sequence batches for threads.
	uint8_t *seq_offset = (uint8_t*)malloc(SEQUENCE_LENGTH - 1);
	uint8_t *seq_base = (uint8_t*)malloc(CHARSET_LENGTH);
	memset(seq_offset, 0, SEQUENCE_LENGTH - 1);
	g_sequence_batch = (struct sSequenceBatch*)malloc(sizeof(struct sSequenceBatch) * CHARSET_LENGTH);
	// divide the entire sequence space into parts of number CHARSET_LENGTH
	g_sequence_batch_cnt = CHARSET_LENGTH;
	for(x = 0; x < CHARSET_LENGTH; ++x)
	{	
		g_sequence_batch[x].seq_offset = seq_offset;	// a starting offset (common)
		seq_base[x] = x;
		g_sequence_batch[x].seq_base = &seq_base[x];
		g_sequence_batch[x].len_offset = SEQUENCE_LENGTH - 1;
		g_sequence_batch[x].len_base = 1;
		g_sequence_batch[x].state = 0x0;
		g_sequence_batch[x].charset_length = CHARSET_LENGTH;
	}
	/// create threads
	thread_workers = (pthread_t*)malloc(sizeof(pthread_t) * (THREAD_COUNT-1));
	g_thread_active_cnt = THREAD_COUNT;
	for(x = 0; x < (THREAD_COUNT-1); ++x)
	{
		pthread_create(&thread_workers[x], NULL, (void*(*)(void*))&threaded_worker, (void*)x);
	}
	// become a worker ourselves?
	threaded_worker(0);
	// keep waiting until all have returned.
	while(g_thread_active_cnt != 0);
	for(x = 0; x < CHARSET_LENGTH; ++x)
	{
		printf("batch number %x was completed by thread number %x.\n", x, g_sequence_batch[x].state);
	}
	return 1;
}

Kevin McGuire · Post by **Kevin McGuire** » Tue Jul 03, 2007 8:39 pm

I forgot that you need to run different lengths of the sequence. I figure what you might be able to do is change the name of the main function to something such as do_for_length(x). So that it creates enough threads to handle doing a length of say 7, then all the threads finish and it starts all over but with a length 6 instead - or how ever you may go (up or down) and what ever you may start and finish with.

As for competing with software such as Cain in the number of hashes checked per second.. The reason it's per second value is higher is because they exploit the MD5 hash in ways that cause it to reveal if a message will produce the correct hash quicker during computation.

For a hunch they may perform a few addition computations during hashing that may produce certain values that are used to tell if they need to exit the hashing routine earlier. In contrast with the routine you are using which goes from start to finish when hashing a message no matter if it will produce a correct hash or not. They check hashes faster.

os64dev · Post by **os64dev** » Wed Jul 04, 2007 12:37 am

I've checked the compiler output with -O3 under gcc and it does the loop unrolling. so it's rather redundant to that. but not bad

Also about the precomputing:

Code: Select all

uint32_t __s1[] = {7, 12, 17, 22}; 
uint32_t __s2[] = {5, 9, 14, 20}; 
uint32_t __s3[] = {4, 11, 16, 23}; 
uint32_t __s4[] = {6, 10, 15, 21}; 
uint32_t __k1[] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}; 
uint32_t __k2[] = {1, 6, 11, 0, 5, 10, 15, 4, 9, 14, 3, 8, 13, 2, 7, 12}; 
uint32_t __k3[] = {5, 8, 11, 14, 1, 4, 7, 10, 13, 0, 3, 6, 9, 12, 15, 2}; 
uint32_t __k4[] = {0, 7, 14, 5, 12, 3, 10, 1, 8, 15, 6, 13, 4, 11, 2, 9};

if you change the declaration to static const uint32_t then the compiler also precomputes.

Code: Select all

#include <math.h> 
#include <time.h> 
#include <stdio.h> 
#include <stdlib.h> 
#include <string.h> 

#define ROTATE_LEFT(x, n) (((x) << (n)) | ((x) >> (32-(n)))) 
#define F(x,y,z) (((x) & (y)) | ((~(x)) & (z))) 
#define G(x,y,z) (((x) & (z)) | ((y) & (~(z)))) 
#define H(x,y,z) ((x) ^ (y) ^ (z)) 
#define I(x,y,z) ((y) ^ ((x) | (~(z)))) 

void md5_genT(uint32_t *T) 
{ 
   uint_fast32_t i; 
   for(i = 0; i < 64; ++i) { 
      T[i] = (uint32_t)(4294967296.0 * fabs(sin((float)(i+1)))); 
      //printf("%08x\n", T[i]); 
      //sleep(1); 
   } 
} 

#define unroll

void md5_hash(uint8_t *message, uint_fast32_t mlength, uint32_t *T, unsigned char * digest) 
{ 
   uint8_t  nmlength; 
   static const uint32_t __s1[] = {7, 12, 17, 22}; 
   static const uint32_t __s2[] = {5, 9, 14, 20}; 
   static const uint32_t __s3[] = {4, 11, 16, 23}; 
   static const uint32_t __s4[] = {6, 10, 15, 21}; 
   static const uint32_t __k1[] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}; 
   static const uint32_t __k2[] = {1, 6, 11, 0, 5, 10, 15, 4, 9, 14, 3, 8, 13, 2, 7, 12}; 
   static const uint32_t __k3[] = {5, 8, 11, 14, 1, 4, 7, 10, 13, 0, 3, 6, 9, 12, 15, 2}; 
   static const uint32_t __k4[] = {0, 7, 14, 5, 12, 3, 10, 1, 8, 15, 6, 13, 4, 11, 2, 9}; 

//    uint32_t X[16]; 
    uint32_t *X; 
   uint32_t i, j, ii, k; 
   uint32_t AA, BB, CC, DD; 
   uint32_t A, B, C, D; 
   // do padding so message is a multiple of 512 bits (64 bytes) 
   if((mlength%64) == 0) 
   { 
      nmlength = mlength + 64; 
   }else{ 
      nmlength = mlength + (64 - (mlength%64)); 
   } 
   uint8_t nm[nmlength+1];
   //nm = (uint8_t*)malloc(nmlength); 
   memcpy(nm, message, mlength); 
   memset(&nm[mlength], 0, nmlength-mlength); 
   nm[mlength] = 0x80; 
   ((uint32_t*)&nm[nmlength - 8])[0] = mlength * 8; 
   ((uint32_t*)&nm[nmlength - 8])[1] = 0; 
    
   // hash padded message (nmessage) 
   AA = 0x67452301; BB = 0xefcdab89; CC = 0x98badcfe; DD = 0x10325476; 
   for(i = 0; i < (nmlength/64); ++i) 
   { 
      // create table 
//      for(j = 0; j < 16; j++) { 
//           X[j] = *(int *)&nm[i * 64 + j * 4];
//      } 
        X = (uint32_t *)&nm[i * 64];

      A = AA; 
      B = BB; 
      C = CC; 
      D = DD; 

#ifdef unroll
      /// round one (unrolled) 
      A = B + ROTATE_LEFT((A + F(B, C, D) + X[0] + 0xd76aa478), __s1[0]); 
      D = A + ROTATE_LEFT((D + F(A, B, C) + X[1] + 0xe8c7b756), __s1[1]); 
      C = D + ROTATE_LEFT((C + F(D, A, B) + X[2] + 0x242070db), __s1[2]); 
      B = C + ROTATE_LEFT((B + F(C, D, A) + X[3] + 0xc1bdceee), __s1[3]); 
      A = B + ROTATE_LEFT((A + F(B, C, D) + X[4] + 0xf57c0faf), __s1[0]); 
      D = A + ROTATE_LEFT((D + F(A, B, C) + X[5] + 0x4787c62a), __s1[1]); 
      C = D + ROTATE_LEFT((C + F(D, A, B) + X[6] + 0xa8304613), __s1[2]); 
      B = C + ROTATE_LEFT((B + F(C, D, A) + X[7] + 0xfd469501), __s1[3]); 
      A = B + ROTATE_LEFT((A + F(B, C, D) + X[8] + 0x698098d8), __s1[0]); 
      D = A + ROTATE_LEFT((D + F(A, B, C) + X[9] + 0x8b44f7af), __s1[1]); 
      C = D + ROTATE_LEFT((C + F(D, A, B) + X[10] + 0xffff5bb1), __s1[2]); 
      B = C + ROTATE_LEFT((B + F(C, D, A) + X[11] + 0x895cd7be), __s1[3]); 
      A = B + ROTATE_LEFT((A + F(B, C, D) + X[12] + 0x6b901122), __s1[0]); 
      D = A + ROTATE_LEFT((D + F(A, B, C) + X[13] + 0xfd987193), __s1[1]); 
      C = D + ROTATE_LEFT((C + F(D, A, B) + X[14] + 0xa679438e), __s1[2]); 
      B = C + ROTATE_LEFT((B + F(C, D, A) + X[15] + 0x49b40821), __s1[3]); 
      /// round two (unrolled) 
      A = B + ROTATE_LEFT((A + G(B, C, D) + X[1] + 0xf61e2562), __s2[0]); 
      D = A + ROTATE_LEFT((D + G(A, B, C) + X[6] + 0xc040b340), __s2[1]); 
      C = D + ROTATE_LEFT((C + G(D, A, B) + X[11] + 0x265e5a51), __s2[2]); 
      B = C + ROTATE_LEFT((B + G(C, D, A) + X[0] + 0xe9b6c7aa), __s2[3]); 
      A = B + ROTATE_LEFT((A + G(B, C, D) + X[5] + 0xd62f105d), __s2[0]); 
      D = A + ROTATE_LEFT((D + G(A, B, C) + X[10] + 0x02441453), __s2[1]); 
      C = D + ROTATE_LEFT((C + G(D, A, B) + X[15] + 0xd8a1e681), __s2[2]); 
      B = C + ROTATE_LEFT((B + G(C, D, A) + X[4] + 0xe7d3fbc8), __s2[3]); 
      A = B + ROTATE_LEFT((A + G(B, C, D) + X[9] + 0x21e1cde6), __s2[0]); 
      D = A + ROTATE_LEFT((D + G(A, B, C) + X[14] + 0xc33707d6), __s2[1]); 
      C = D + ROTATE_LEFT((C + G(D, A, B) + X[3] + 0xf4d50d87), __s2[2]); 
      B = C + ROTATE_LEFT((B + G(C, D, A) + X[8] + 0x455a14ed), __s2[3]); 
      A = B + ROTATE_LEFT((A + G(B, C, D) + X[13] + 0xa9e3e905), __s2[0]); 
      D = A + ROTATE_LEFT((D + G(A, B, C) + X[2] + 0xfcefa3f8), __s2[1]); 
      C = D + ROTATE_LEFT((C + G(D, A, B) + X[7] + 0x676f02d9), __s2[2]); 
      B = C + ROTATE_LEFT((B + G(C, D, A) + X[12] + 0x8d2a4c8a), __s2[3]); 
      /// round three (unrolled) 
      A = B + ROTATE_LEFT((A + H(B, C, D) + X[5] + 0xfffa3942), __s3[0]); 
      D = A + ROTATE_LEFT((D + H(A, B, C) + X[8] + 0x8771f681), __s3[1]); 
      C = D + ROTATE_LEFT((C + H(D, A, B) + X[11] + 0x6d9d6122), __s3[2]); 
      B = C + ROTATE_LEFT((B + H(C, D, A) + X[14] + 0xfde5380c), __s3[3]); 
      A = B + ROTATE_LEFT((A + H(B, C, D) + X[1] + 0xa4beea44), __s3[0]); 
      D = A + ROTATE_LEFT((D + H(A, B, C) + X[4] + 0x4bdecfa9), __s3[1]); 
      C = D + ROTATE_LEFT((C + H(D, A, B) + X[7] + 0xf6bb4b60), __s3[2]); 
      B = C + ROTATE_LEFT((B + H(C, D, A) + X[10] + 0xbebfbc70), __s3[3]); 
      A = B + ROTATE_LEFT((A + H(B, C, D) + X[13] + 0x289b7ec6), __s3[0]); 
      D = A + ROTATE_LEFT((D + H(A, B, C) + X[0] + 0xeaa127fa), __s3[1]); 
      C = D + ROTATE_LEFT((C + H(D, A, B) + X[3] + 0xd4ef3085), __s3[2]); 
      B = C + ROTATE_LEFT((B + H(C, D, A) + X[6] + 0x04881d05), __s3[3]); 
      A = B + ROTATE_LEFT((A + H(B, C, D) + X[9] + 0xd9d4d039), __s3[0]); 
      D = A + ROTATE_LEFT((D + H(A, B, C) + X[12] + 0xe6db99e5), __s3[1]); 
      C = D + ROTATE_LEFT((C + H(D, A, B) + X[15] + 0x1fa27cf8), __s3[2]); 
      B = C + ROTATE_LEFT((B + H(C, D, A) + X[2] + 0xc4ac5665), __s3[3]); 
      /// round four (unrolled) 
      A = B + ROTATE_LEFT((A + I(B, C, D) + X[0] + 0xf4292244), __s4[0]); 
      D = A + ROTATE_LEFT((D + I(A, B, C) + X[7] + 0x432aff97), __s4[1]); 
      C = D + ROTATE_LEFT((C + I(D, A, B) + X[14] + 0xab9423a7), __s4[2]); 
      B = C + ROTATE_LEFT((B + I(C, D, A) + X[5] + 0xfc93a039), __s4[3]); 
      A = B + ROTATE_LEFT((A + I(B, C, D) + X[12] + 0x655b59c3), __s4[0]); 
      D = A + ROTATE_LEFT((D + I(A, B, C) + X[3] + 0x8f0ccc92), __s4[1]); 
      C = D + ROTATE_LEFT((C + I(D, A, B) + X[10] + 0xffeff47d), __s4[2]); 
      B = C + ROTATE_LEFT((B + I(C, D, A) + X[1] + 0x85845dd1), __s4[3]); 
      A = B + ROTATE_LEFT((A + I(B, C, D) + X[8] + 0x6fa87e4f), __s4[0]); 
      D = A + ROTATE_LEFT((D + I(A, B, C) + X[15] + 0xfe2ce6e0), __s4[1]); 
      C = D + ROTATE_LEFT((C + I(D, A, B) + X[6] + 0xa3014314), __s4[2]); 
      B = C + ROTATE_LEFT((B + I(C, D, A) + X[13] + 0x4e0811a1), __s4[3]); 
      A = B + ROTATE_LEFT((A + I(B, C, D) + X[4] + 0xf7537e82), __s4[0]); 
      D = A + ROTATE_LEFT((D + I(A, B, C) + X[11] + 0xbd3af235), __s4[1]); 
      C = D + ROTATE_LEFT((C + I(D, A, B) + X[2] + 0x2ad7d2bb), __s4[2]); 
      B = C + ROTATE_LEFT((B + I(C, D, A) + X[9] + 0xeb86d391), __s4[3]); 
#else
      ii = 0; 
      // round one 
      for(k = 0; k < 4; ++k) 
      { 
         A = B + ROTATE_LEFT((A + F(B, C, D) + X[__k1[k*4+0]] + T[ii+0]), __s1[0]); 
         D = A + ROTATE_LEFT((D + F(A, B, C) + X[__k1[k*4+1]] + T[ii+1]), __s1[1]); 
         C = D + ROTATE_LEFT((C + F(D, A, B) + X[__k1[k*4+2]] + T[ii+2]), __s1[2]); 
         B = C + ROTATE_LEFT((B + F(C, D, A) + X[__k1[k*4+3]] + T[ii+3]), __s1[3]); 
         ii += 4; 
      } 
      // round two 
      for(k = 0; k < 4; ++k) 
      { 
         A = B + ROTATE_LEFT((A + G(B, C, D) + X[__k2[k*4+0]] + T[ii+0]), __s2[0]); 
         D = A + ROTATE_LEFT((D + G(A, B, C) + X[__k2[k*4+1]] + T[ii+1]), __s2[1]); 
         C = D + ROTATE_LEFT((C + G(D, A, B) + X[__k2[k*4+2]] + T[ii+2]), __s2[2]); 
         B = C + ROTATE_LEFT((B + G(C, D, A) + X[__k2[k*4+3]] + T[ii+3]), __s2[3]); 
         ii += 4; 
      } 
      // round three 
      for(k = 0; k < 4; ++k) 
      { 
         A = B + ROTATE_LEFT((A + H(B, C, D) + X[__k3[k*4+0]] + T[ii+0]), __s3[0]); 
         D = A + ROTATE_LEFT((D + H(A, B, C) + X[__k3[k*4+1]] + T[ii+1]), __s3[1]); 
         C = D + ROTATE_LEFT((C + H(D, A, B) + X[__k3[k*4+2]] + T[ii+2]), __s3[2]); 
         B = C + ROTATE_LEFT((B + H(C, D, A) + X[__k3[k*4+3]] + T[ii+3]), __s3[3]); 
         ii += 4; 
      } 
      // round four 
      for(k = 0; k < 4; ++k) 
      { 
         A = B + ROTATE_LEFT((A + I(B, C, D) + X[__k4[k*4+0]] + T[ii+0]), __s4[0]); 
         D = A + ROTATE_LEFT((D + I(A, B, C) + X[__k4[k*4+1]] + T[ii+1]), __s4[1]); 
         C = D + ROTATE_LEFT((C + I(D, A, B) + X[__k4[k*4+2]] + T[ii+2]), __s4[2]); 
         B = C + ROTATE_LEFT((B + I(C, D, A) + X[__k4[k*4+3]] + T[ii+3]), __s4[3]); 
         ii += 4; 
      } 
#endif
      AA = AA + A; 
      BB = BB + B; 
      CC = CC + C; 
      DD = DD + D; 
   } 
   *((int*)(&digest[ 0])) = AA;
   *((int*)(&digest[ 4])) = BB;
   *((int*)(&digest[ 8])) = CC;
   *((int*)(&digest[12])) = DD;
   return; 
} 

int do_next_sequence(char *sequence, int charset_length, int length); 

int main(int argc, char *argv[]) 
{ 
    int x, len, t; 
    clock_t start,end; 
    float dif; 
    int loops = 0; 
    char *sequence; 
    char *message; 
    char temp[2]; 
    char charset[] = {'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k',  
                        'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 
                         'w', 'x', 'y', 'z'};  // lower_alpha 
    
   uint32_t T[64]; 
   uint32_t Ti[64]; 
        md5_genT(&Ti[0]); 
unsigned char digest[16];
   unsigned char raw_inhash[16]; 
    
   if( argc < 3 ) 
        return 1; 
    
    len = atoi( argv[2] ); 
    sequence = (char *)malloc( len ); 
    message = (char *)malloc( len ); 
    memset(sequence, 0, len); 
    
    for(t = 0; t < 16; t++) 
    { 
        sscanf(&argv[1][t*2], "%2x", &raw_inhash[t]); 
    } 
    
    start = clock(); 
    while( do_next_sequence(sequence, sizeof(charset), len) ) 
    { 
        for(x = len-1; x >= 0; --x) { 
             message[x] = charset[sequence[x]]; 
        } 
        message[len] = 0;
        //   START md5 string: input message 
        memcpy(&T[0], &Ti[0], 256);
        md5_hash((uint8_t*)message, len, &T[0], digest); 
        //   END md5 string: output digest 
        
        loops++; 
        if(*(int *)raw_inhash == *(int *)digest) {
            if( memcmp( raw_inhash, digest, 16 ) == 0 ) 
            { 
                end = clock(); 
                dif = (end - start); 
                dif /= CLOCKS_PER_SEC; 
                printf("Collision Found!: %s is : %s\n  Cracking took: %.2lfs\n  Average h/s: %.2f h/s\n", argv[1], message, dif, (loops / dif)); 
                return 0; 
            }    
        }
    } 
    puts("No collision Found\n"); 
    return 1; 
} 

int do_next_sequence(char *sequence, int charset_length, int length) 
{ 
   int x; 
   for(x = 0; sequence[x] == (charset_length-1); ++x) 
   { 
      if(x == (length-1)) 
      { 
         return 0; 
      } 
      sequence[x] = 0; 
   } 
   ++sequence[x]; 
   return 1; 
}

gives on my 2.13 GHz without loop unroll defined

$ g++ brute.cc -o brute.exe -O3 -mno-cygwin
$ ./brute e80b5017098950fc58aad83c8c14978e 6
Collision Found!: e80b5017098950fc58aad83c8c14978e is : abcdef
Cracking took: 25.64s
Average h/s: 2390362.32 h/s

and with loop unroll defined

$ ./brute e80b5017098950fc58aad83c8c14978e 6
Collision Found!: e80b5017098950fc58aad83c8c14978e is : abcdef
Cracking took: 21.81s
Average h/s: 2809870.25 h/s

we are getting there still looking for optimisations

os64dev · Post by **os64dev** » Wed Jul 04, 2007 2:25 am

ok, i have reached my peek now. Well performance might be increased through threading if you have more then one core ore processor but for single cpu this is fast. Note this is on a intel core 2 duo of 2.13 GHz with only one core running.

$ ./brute e80b5017098950fc58aad83c8c14978e 6
Collision Found!: e80b5017098950fc58aad83c8c14978e is : abcdef
Cracking took: 18.66s
Average h/s: 3285386.76 h/s

compile with: g++ brute.cc -o brute.exe -O3 -mno-cygwin -Wall

Brynet-Inc · Post by **Brynet-Inc** » Wed Jul 04, 2007 5:11 am

This is pretty impressive, It can "crack" simplistic hashes on a Pentium 2 @ 400 Mhz in a couple of seconds.

So what licence is this code under btw? lol

GLneo · Post by **GLneo** » Wed Jul 04, 2007 8:36 am

hmm... you seem to still be working with dated version that i posted ( i need CVS

) so I've included the current build.

as for license hmm... maybe I'll BSD it for now, but i might PD it later. i do like wacky licenses so maybe the pet a cat license or beer license

p.s. there is something weird happening on my computer, it says digest is an unused variable ( and it is ) but if i remove it it fails to crack the md5's ??????????

Kevin McGuire · Post by **Kevin McGuire** » Wed Jul 04, 2007 11:09 am

I put the threading together with the collision detection for hashes. I tested it a little and it seemed to work fine with one or twenty-six threads. You can find it here (since it is rather large) with syntax highlighting.
http://kmcguire.jouleos.galekus.com/dok ... _threading

Brynet-Inc · Post by **Brynet-Inc** » Wed Jul 04, 2007 11:38 am

Wouldn't this handle more possible hashes?

Code: Select all

char charset[] = {'0', '1', '2', '3', '4', '5', '6', '7', '8', '9',
                  'a', 'A', 'b', 'B', 'c', 'C', 'd', 'D', 'e', 'E',
                  'f', 'F', 'g', 'G', 'h', 'H', 'i', 'I', 'j', 'J',
                  'k', 'K', 'l', 'L', 'm', 'M', 'n', 'N', 'o', 'O',
                  'p', 'P', 'q', 'Q', 'r', 'R', 's', 'S', 't', 'T',
                  'u', 'U', 'v', 'V', 'w', 'W', 'x', 'X', 'y', 'Y',
                  'z', 'Z'};

With this extended "charset", cracking a 4 letter hash (without numbers or caps) takes around 61.17s on a P2 at 400Mhz..

Recompiling with -Os and -O3 lowered that to 45.22s

Kevin McGuire · Post by **Kevin McGuire** » Wed Jul 04, 2007 11:48 am

Yes. You could also do complete bytes.

sequence_next(sequence, 256, length);
Then do not use a character map to translate.
message[x] = charmap[sequence[x]];

It will generate every possible combination of length length, since the MD5 algorithm can work more than just alphanumerical.

GLneo · Post by **GLneo** » Wed Jul 04, 2007 11:49 am

that batch processing is very interesting, but wouldn't it be easier to just give each thread a range: a - zzz, aaaa - zzzzz, aaaaaa - zzzzzzz? to compute?

Kevin McGuire · Post by **Kevin McGuire** » Wed Jul 04, 2007 12:03 pm

I did.

I was going to do something more advanced, but realized there was a easier way to do it - and code it.

The batches are created like this:

You want to generate all possible combination for a length of say six.
123456
xxxxxx

With a character map of abc

When building batches you just create batches like:
axx
bxx
cxx

Where xx is passed into the sequence_next. Here in the code I copy the base (which is set for the batch) and the offset (which the thread generates all possible combinations for).
// combining the offset and base provide a complete sequence.
memcpy(sequence, g_sequence_batch[x].seq_offset,g_sequence_batch[x].len_offset);
memcpy(&sequence[g_sequence_batch[x].len_offset], g_sequence_batch[x].seq_base, g_sequence_batch[x].len_base);

Then when calling sequence_next I do:
sequence_next(sequence, g_sequence_batch[x].charset_length, g_sequence_batch[x].len_offset)

Since .len_offset might have been 5, and .len_base might have been 1. This effectively creates ranges when using a length of 6 and a character map of abc.

aaaaaa - acccccc
baaaaa - bcccccc
caaaaa - ccccccc

The sequences are actually reversed so that: acccccc is really ccccca. This is why I can do:
sequence_next(sequence, g_sequence_batch[x].charset_length, g_sequence_batch[x].len_offset)

Since g_sequence_batch[x].len_offset, when using a length of 6, is the number 5.
xxxxx
cccccca

Only the part with the xs above it is processed by sequence_next, the last character a is ignored because of g_sequence_batch[x].len_offset being 5 and the entire sequence of ccccca being 6 in length.

Once sequence_next returns a zero we know we have exhausted this range and we look for another batch - we as in the thread.

I think this would be using a natural boundary instead of a arbitrary one, which might require keeping track of the count of sequence advancements , or the last sequence and doing a compare to check this. So we prolly gain a little extra performance by keeping it pretty simple like I have done when splitting them into batches.

I imagine the entire batch array could be gotton rid of and instead just use a global variable that holds the current charmap position. So that the threads lock this, increment and use the value, then go back to computing. I do not think this would cause a big problem unless we had thousands of threads running..

GLneo · Post by **GLneo** » Wed Jul 04, 2007 12:22 pm

thats very intresting, i like "global variable that holds the current charmap position" idea.

p.s. does any one have testing results for the newer code i posted?

Ninjarider · Post by **Ninjarider** » Wed Jul 04, 2007 1:16 pm

could someone either comment the code commpletely or let me know what all the special characters mean.

got an idea to make it faster. have to double check, but if what i think is right we can make it 4 times faster per thread.

os64dev · Post by **os64dev** » Wed Jul 04, 2007 1:18 pm

hmm... you seem to still be working with dated version that i posted ( i need CVS ) so I've included the current build.

Man i have posted the code to prove that it can be done faster then 1 million per second and boosting it to 3.3 million per second is proof enough. If you wish to integrate it in your code branch be my guest, but i am not going to do it for you.

i am interested in how my code would work in the multithreaded version from Kevin. The number of threads though confuse me because in general you would need 1 thread per cpu or per core anything more will just add overhead with context switching. If the outcome is faster it would just mean that a workset was closer to the collision.

so on a dual core the max. should be around 6.5 million per second and a quad core roughly 12 million. assuming my cpu speed and full cpu availabilty.

interesting is also the idea of using this in a BOINC like environment.

i did profile the code and look at the assembly output. The MD5 function is as fast as it can get, unless it can be done with an other algorithm, and is roughly responsible for 98 % of the time. so improving the speed in a major way can only be done by another MD5 hash function.

i think it can achieve higher speeds at 64 bit code because the A, B, C, D or digest can be held in registers, removing the memory mov instructions. but some one should test that because my harddisk crashed on my 64 machine.

GLneo · Post by **GLneo** » Wed Jul 04, 2007 1:58 pm

well, i tried your code and it is about half as fast a my current build, i dont care what you do with the code, i just thought what ever you were doing you could do it better with the newer code.

p.s. i really like the BOINC idea!