Page 5 of 13

Posted: Wed Jul 04, 2007 2:37 pm
by Kevin McGuire
I made a few changes and took it from 1.5 million per second to 2.1 - 2.2 million per second on my processor, which seems like a good jump.

Code: Select all

#include <asm/processor.h>
void __attribute__((__fastcall__)) __attribute__((__always_inline__))
	 md5_hash(uint8_t *message, uint32_t mlength, uint32_t *digest)
{
	uint32_t 	*nm32;
	uint8_t	*nm8;
	uint32_t 	nmlength;
	uint32_t 	i, j;
	uint32_t 	           AA, BB, CC, DD;
	register uint32_t 	A, B, C, D;
	uint32_t 	X[16];
	uint8_t	TMP[mlength%64 == 0 ? mlength + 64 : mlength + (64 - (mlength%64))];
	// do padding so message is a multiple of 512 bits (64 bytes)
	nmlength = mlength%64 == 0 ? mlength + 64 : mlength + (64 - (mlength%64));
	//if((mlength%64) == 0)
	//{
	//	nmlength = mlength + 64;
	//}else{
	//	nmlength = mlength + (64 - (mlength%64));
	//}
	//nm8 = (uint8_t*)malloc(nmlength);	
        prefetchw(&TMP[0]);
	nm8 = &TMP[0];
	nm32 = (uint32_t*)nm8;
	for(i = 0; i < (nmlength/4); ++i)
	{
		nm32[i] = 0;
	}
	for(i = 0; i < mlength; ++i)
	{
		nm8[i] = message[i];
	}	
	// add a bit one after message, pad remaining with zeros.
	nm8[mlength] = 0x80;
	// append length in bits to message (low word first)
	nm32[(nmlength/4) - 2] = mlength * 8;
	nm32[(nmlength/4) - 1] = 0;
	// hash padded message (nmessage)
	AA = 0x67452301; BB = 0xefcdab89; CC = 0x98badcfe; DD = 0x10325476;
	prefetch(&X[0]);
	prefetchw(&X[0]);
	for(i = 0; i < (nmlength/64); ++i)
	{
		// move block from arbitrary memory location to stack. (could gain speed changing this)
		for(j = 0; j < 16; ++j)
		{
			X[j] = nm32[i*64+j];
		}
		A = AA;
		B = BB;
		C = CC;
		D = DD;
		/// round one (unrolled)
		A = B + ROTATE_LEFT((A + F(B, C, D) + X[0] + 0xd76aa478), md5_s1_0);
		D = A + ROTATE_LEFT((D + F(A, B, C) + X[1] + 0xe8c7b756), md5_s1_1);
		C = D + ROTATE_LEFT((C + F(D, A, B) + X[2] + 0x242070db), md5_s1_2);
		B = C + ROTATE_LEFT((B + F(C, D, A) + X[3] + 0xc1bdceee), md5_s1_3);
		A = B + ROTATE_LEFT((A + F(B, C, D) + X[4] + 0xf57c0faf), md5_s1_0);
		D = A + ROTATE_LEFT((D + F(A, B, C) + X[5] + 0x4787c62a), md5_s1_1);
		C = D + ROTATE_LEFT((C + F(D, A, B) + X[6] + 0xa8304613), md5_s1_2);
		B = C + ROTATE_LEFT((B + F(C, D, A) + X[7] + 0xfd469501), md5_s1_3);
		A = B + ROTATE_LEFT((A + F(B, C, D) + X[8] + 0x698098d8), md5_s1_0);
		D = A + ROTATE_LEFT((D + F(A, B, C) + X[9] + 0x8b44f7af), md5_s1_1);
		C = D + ROTATE_LEFT((C + F(D, A, B) + X[10] + 0xffff5bb1), md5_s1_2);
		B = C + ROTATE_LEFT((B + F(C, D, A) + X[11] + 0x895cd7be), md5_s1_3);
		A = B + ROTATE_LEFT((A + F(B, C, D) + X[12] + 0x6b901122), md5_s1_0);
		D = A + ROTATE_LEFT((D + F(A, B, C) + X[13] + 0xfd987193), md5_s1_1);
		C = D + ROTATE_LEFT((C + F(D, A, B) + X[14] + 0xa679438e), md5_s1_2);
		B = C + ROTATE_LEFT((B + F(C, D, A) + X[15] + 0x49b40821), md5_s1_3);
		/// round two (unrolled)
		A = B + ROTATE_LEFT((A + G(B, C, D) + X[1] + 0xf61e2562), md5_s2_0);
		D = A + ROTATE_LEFT((D + G(A, B, C) + X[6] + 0xc040b340), md5_s2_1);
		C = D + ROTATE_LEFT((C + G(D, A, B) + X[11] + 0x265e5a51), md5_s2_2);
		B = C + ROTATE_LEFT((B + G(C, D, A) + X[0] + 0xe9b6c7aa), md5_s2_3);
		A = B + ROTATE_LEFT((A + G(B, C, D) + X[5] + 0xd62f105d), md5_s2_0);
		D = A + ROTATE_LEFT((D + G(A, B, C) + X[10] + 0x02441453), md5_s2_1);
		C = D + ROTATE_LEFT((C + G(D, A, B) + X[15] + 0xd8a1e681), md5_s2_2);
		B = C + ROTATE_LEFT((B + G(C, D, A) + X[4] + 0xe7d3fbc8), md5_s2_3);
		A = B + ROTATE_LEFT((A + G(B, C, D) + X[9] + 0x21e1cde6), md5_s2_0);
		D = A + ROTATE_LEFT((D + G(A, B, C) + X[14] + 0xc33707d6), md5_s2_1);
		C = D + ROTATE_LEFT((C + G(D, A, B) + X[3] + 0xf4d50d87), md5_s2_2);
		B = C + ROTATE_LEFT((B + G(C, D, A) + X[8] + 0x455a14ed), md5_s2_3);
		A = B + ROTATE_LEFT((A + G(B, C, D) + X[13] + 0xa9e3e905), md5_s2_0);
		D = A + ROTATE_LEFT((D + G(A, B, C) + X[2] + 0xfcefa3f8), md5_s2_1);
		C = D + ROTATE_LEFT((C + G(D, A, B) + X[7] + 0x676f02d9), md5_s2_2);
		B = C + ROTATE_LEFT((B + G(C, D, A) + X[12] + 0x8d2a4c8a), md5_s2_3);
		/// round three (unrolled)
		A = B + ROTATE_LEFT((A + H(B, C, D) + X[5] + 0xfffa3942), md5_s3_0);
		D = A + ROTATE_LEFT((D + H(A, B, C) + X[8] + 0x8771f681), md5_s3_1);
		C = D + ROTATE_LEFT((C + H(D, A, B) + X[11] + 0x6d9d6122), md5_s3_2);
		B = C + ROTATE_LEFT((B + H(C, D, A) + X[14] + 0xfde5380c), md5_s3_3);
		A = B + ROTATE_LEFT((A + H(B, C, D) + X[1] + 0xa4beea44), md5_s3_0);
		D = A + ROTATE_LEFT((D + H(A, B, C) + X[4] + 0x4bdecfa9), md5_s3_1);
		C = D + ROTATE_LEFT((C + H(D, A, B) + X[7] + 0xf6bb4b60), md5_s3_2);
		B = C + ROTATE_LEFT((B + H(C, D, A) + X[10] + 0xbebfbc70), md5_s3_3);
		A = B + ROTATE_LEFT((A + H(B, C, D) + X[13] + 0x289b7ec6), md5_s3_0);
		D = A + ROTATE_LEFT((D + H(A, B, C) + X[0] + 0xeaa127fa), md5_s3_1);
		C = D + ROTATE_LEFT((C + H(D, A, B) + X[3] + 0xd4ef3085), md5_s3_2);
		B = C + ROTATE_LEFT((B + H(C, D, A) + X[6] + 0x04881d05), md5_s3_3);
		A = B + ROTATE_LEFT((A + H(B, C, D) + X[9] + 0xd9d4d039), md5_s3_0);
		D = A + ROTATE_LEFT((D + H(A, B, C) + X[12] + 0xe6db99e5), md5_s3_1);
		C = D + ROTATE_LEFT((C + H(D, A, B) + X[15] + 0x1fa27cf8), md5_s3_2);
		B = C + ROTATE_LEFT((B + H(C, D, A) + X[2] + 0xc4ac5665), md5_s3_3);
		/// round four (unrolled)
		A = B + ROTATE_LEFT((A + I(B, C, D) + X[0] + 0xf4292244), md5_s4_0);
		D = A + ROTATE_LEFT((D + I(A, B, C) + X[7] + 0x432aff97), md5_s4_1);
		C = D + ROTATE_LEFT((C + I(D, A, B) + X[14] + 0xab9423a7), md5_s4_2);
		B = C + ROTATE_LEFT((B + I(C, D, A) + X[5] + 0xfc93a039), md5_s4_3);
		A = B + ROTATE_LEFT((A + I(B, C, D) + X[12] + 0x655b59c3), md5_s4_0);
		D = A + ROTATE_LEFT((D + I(A, B, C) + X[3] + 0x8f0ccc92), md5_s4_1);
		C = D + ROTATE_LEFT((C + I(D, A, B) + X[10] + 0xffeff47d), md5_s4_2);
		B = C + ROTATE_LEFT((B + I(C, D, A) + X[1] + 0x85845dd1), md5_s4_3);
		A = B + ROTATE_LEFT((A + I(B, C, D) + X[8] + 0x6fa87e4f), md5_s4_0);
		D = A + ROTATE_LEFT((D + I(A, B, C) + X[15] + 0xfe2ce6e0), md5_s4_1);
		C = D + ROTATE_LEFT((C + I(D, A, B) + X[6] + 0xa3014314), md5_s4_2);
		B = C + ROTATE_LEFT((B + I(C, D, A) + X[13] + 0x4e0811a1), md5_s4_3);
		A = B + ROTATE_LEFT((A + I(B, C, D) + X[4] + 0xf7537e82), md5_s4_0);
		D = A + ROTATE_LEFT((D + I(A, B, C) + X[11] + 0xbd3af235), md5_s4_1);
		C = D + ROTATE_LEFT((C + I(D, A, B) + X[2] + 0x2ad7d2bb), md5_s4_2);
		B = C + ROTATE_LEFT((B + I(C, D, A) + X[9] + 0xeb86d391), md5_s4_3);

		AA = AA + A;
		BB = BB + B;
		CC = CC + C;
		DD = DD + D;
	}

	// release allocated memory.
	//free(nm8);

	digest[0] = AA;
	digest[1] = BB;
	digest[2] = CC;
	digest[3] = DD;
	return;
}

Posted: Wed Jul 04, 2007 2:49 pm
by GLneo
what is

Code: Select all

#include <asm/processor.h>
?

Posted: Wed Jul 04, 2007 2:59 pm
by Ninjarider
optimizing idea. temporarly diassable interupts.

Posted: Wed Jul 04, 2007 3:00 pm
by Kevin McGuire
It generates the assembly instruction for a prefetch. If you do not have it just delete the prefetch..(..) calls from the function.

I noticed a small speed improvement when using them, but the major improvement came from placing more data on the stack, and removing memcpy and memset.
optimizing idea. temporarly diassable interupts.
Yea. You could write a small free standing program and load it with GRUB to see how much faster it would compute with out any interrupts or task switching.

Posted: Wed Jul 04, 2007 3:10 pm
by Brynet-Inc
Those functions don't seem to be present on OpenBSD, Neither is that include...

Apparently, They are not standard... :wink:

And __attribute__((__fastcall__)) isn't in GCC 3x either..

Posted: Wed Jul 04, 2007 3:24 pm
by GLneo
~1,450,000 h/s on my 1.4GHz

the idea of a bootable cracker is great!, I'll have to try that. I don't know how much speed you'll get though because Windows/Linux gives 99% CPU usage :P

Posted: Wed Jul 04, 2007 3:36 pm
by Ninjarider
could someone explain this to me.
#define ROTATE_LEFT(x, n) (((x) << (n)) | ((x) >> (32-(n))))
#define F(x,y,z) (((x) & (y)) | ((~(x)) & (z)))
#define G(x,y,z) (((x) & (z)) | ((y) & (~(z))))
#define H(x,y,z) ((x) ^ (y) ^ (z))
#define I(x,y,z) ((y) ^ ((x) | (~(z))))
what does the |, ~, >>, <<, and ^ mean? i didn't study c long enough to learn them.

Posted: Wed Jul 04, 2007 3:39 pm
by Alboin
Ninjarider wrote:could someone explain this to me.
#define ROTATE_LEFT(x, n) (((x) << (n)) | ((x) >> (32-(n))))
#define F(x,y,z) (((x) & (y)) | ((~(x)) & (z)))
#define G(x,y,z) (((x) & (z)) | ((y) & (~(z))))
#define H(x,y,z) ((x) ^ (y) ^ (z))
#define I(x,y,z) ((y) ^ ((x) | (~(z))))
what does the |, ~, >>, <<, and ^ mean? i didn't study c long enough to learn them.
They're bitwise operators. This has a lot more about them.

Posted: Wed Jul 04, 2007 4:02 pm
by Ninjarider
after the refresher i need to clarify 1 of the bit operators. the ~ means to xor a {variable} by 1.

Posted: Wed Jul 04, 2007 4:09 pm
by os64dev
GLneo wrote:well, i tried your code and it is about half as fast a my current build, i dont care what you do with the code, i just thought what ever you were doing you could do it better with the newer code.

p.s. i really like the BOINC idea! :D
We agree on the BOINC idea but not yet on the code:

i ran your latest build and got:
cocaine 5b9104dc42e2f8fdd3f6a801f7656d65 5
Collision Found!: 5b9104dc42e2f8fdd3f6a801f7656d65 is : glich
Cracking took: 2.38s
Average h/s: 1364077.47 h/s
however my latest build gave
$ ./brute.exe 5b9104dc42e2f8fdd3f6a801f7656d65 5
Collision Found!
hash[5b9104dc42e2f8fdd3f6a801f7656d65] = 'glich'
- time: 1.49s
- avg. hash/s: 2181605.39 h/s
so did you test with my latest version?

PS. test was run on a centrino laptop instead of the earlier mentioned core 2

@ kevin

could you test your thread model with my md5 implementation or adapt your application like my MD5, otherwise i can do it but that takes another day (i am going to bed).

Posted: Wed Jul 04, 2007 4:17 pm
by os64dev
it above your last post ;-)

edit: hmm.. a previous post got deleted :?

Posted: Wed Jul 04, 2007 4:19 pm
by GLneo
the post that was deleted was me asking for the code ( i didn't see you posted it :P )

Posted: Wed Jul 04, 2007 4:24 pm
by GLneo
OK I've just retested it and yours is ~1,160,000 and mine is ~1,430,000, but yours has optimizations mine doesn't and vise versa so i'm going to combine them and see what i get.

Posted: Wed Jul 04, 2007 4:47 pm
by GLneo
OK, I've combined optimizations and I'm now getting ~1,980,000 h/s !!! the product is attached.

Posted: Wed Jul 04, 2007 4:58 pm
by Brynet-Inc
I compared both of your implementations on a Pentium 2 @ 400Mhz..

os64dev's implementation with the following compilation optimizations:
-O3 -march=pentium2 -mmmx -ffast-math -funsafe-math-optimizations -ffinite-math-only

It had a time between 7.32s and 7.40s, each of the 3 times I ran it..
Collision Found!
hash[5b9104dc42e2f8fdd3f6a801f7656d65] = 'glich'
- time: 7.32s
- avg. hash/s: 442579.79 h/s


For GLneo's implementation with the same compile optimizations:
-O3 -march=pentium2 -mmmx -ffast-math -funsafe-math-optimizations -ffinite-math-only

It had a time between 7.18s and 7.20s.. I also ran it 3 times..
Collision Found!: 5b9104dc42e2f8fdd3f6a801f7656d65 is : glich
Cracking took: 7.18s
Average h/s: 451209.48 h/s


Still pretty good, Kevin McGuire's implementation takes 26 seconds..

I'm bored now.. Off to read another topic :P