OSDev.org

Posted: **Wed Jul 04, 2007 2:37 pm**

I made a few changes and took it from 1.5 million per second to 2.1 - 2.2 million per second on my processor, which seems like a good jump.

Code: Select all

#include <asm/processor.h>
void __attribute__((__fastcall__)) __attribute__((__always_inline__))
	 md5_hash(uint8_t *message, uint32_t mlength, uint32_t *digest)
{
	uint32_t 	*nm32;
	uint8_t	*nm8;
	uint32_t 	nmlength;
	uint32_t 	i, j;
	uint32_t 	           AA, BB, CC, DD;
	register uint32_t 	A, B, C, D;
	uint32_t 	X[16];
	uint8_t	TMP[mlength%64 == 0 ? mlength + 64 : mlength + (64 - (mlength%64))];
	// do padding so message is a multiple of 512 bits (64 bytes)
	nmlength = mlength%64 == 0 ? mlength + 64 : mlength + (64 - (mlength%64));
	//if((mlength%64) == 0)
	//{
	//	nmlength = mlength + 64;
	//}else{
	//	nmlength = mlength + (64 - (mlength%64));
	//}
	//nm8 = (uint8_t*)malloc(nmlength);	
        prefetchw(&TMP[0]);
	nm8 = &TMP[0];
	nm32 = (uint32_t*)nm8;
	for(i = 0; i < (nmlength/4); ++i)
	{
		nm32[i] = 0;
	}
	for(i = 0; i < mlength; ++i)
	{
		nm8[i] = message[i];
	}	
	// add a bit one after message, pad remaining with zeros.
	nm8[mlength] = 0x80;
	// append length in bits to message (low word first)
	nm32[(nmlength/4) - 2] = mlength * 8;
	nm32[(nmlength/4) - 1] = 0;
	// hash padded message (nmessage)
	AA = 0x67452301; BB = 0xefcdab89; CC = 0x98badcfe; DD = 0x10325476;
	prefetch(&X[0]);
	prefetchw(&X[0]);
	for(i = 0; i < (nmlength/64); ++i)
	{
		// move block from arbitrary memory location to stack. (could gain speed changing this)
		for(j = 0; j < 16; ++j)
		{
			X[j] = nm32[i*64+j];
		}
		A = AA;
		B = BB;
		C = CC;
		D = DD;
		/// round one (unrolled)
		A = B + ROTATE_LEFT((A + F(B, C, D) + X[0] + 0xd76aa478), md5_s1_0);
		D = A + ROTATE_LEFT((D + F(A, B, C) + X[1] + 0xe8c7b756), md5_s1_1);
		C = D + ROTATE_LEFT((C + F(D, A, B) + X[2] + 0x242070db), md5_s1_2);
		B = C + ROTATE_LEFT((B + F(C, D, A) + X[3] + 0xc1bdceee), md5_s1_3);
		A = B + ROTATE_LEFT((A + F(B, C, D) + X[4] + 0xf57c0faf), md5_s1_0);
		D = A + ROTATE_LEFT((D + F(A, B, C) + X[5] + 0x4787c62a), md5_s1_1);
		C = D + ROTATE_LEFT((C + F(D, A, B) + X[6] + 0xa8304613), md5_s1_2);
		B = C + ROTATE_LEFT((B + F(C, D, A) + X[7] + 0xfd469501), md5_s1_3);
		A = B + ROTATE_LEFT((A + F(B, C, D) + X[8] + 0x698098d8), md5_s1_0);
		D = A + ROTATE_LEFT((D + F(A, B, C) + X[9] + 0x8b44f7af), md5_s1_1);
		C = D + ROTATE_LEFT((C + F(D, A, B) + X[10] + 0xffff5bb1), md5_s1_2);
		B = C + ROTATE_LEFT((B + F(C, D, A) + X[11] + 0x895cd7be), md5_s1_3);
		A = B + ROTATE_LEFT((A + F(B, C, D) + X[12] + 0x6b901122), md5_s1_0);
		D = A + ROTATE_LEFT((D + F(A, B, C) + X[13] + 0xfd987193), md5_s1_1);
		C = D + ROTATE_LEFT((C + F(D, A, B) + X[14] + 0xa679438e), md5_s1_2);
		B = C + ROTATE_LEFT((B + F(C, D, A) + X[15] + 0x49b40821), md5_s1_3);
		/// round two (unrolled)
		A = B + ROTATE_LEFT((A + G(B, C, D) + X[1] + 0xf61e2562), md5_s2_0);
		D = A + ROTATE_LEFT((D + G(A, B, C) + X[6] + 0xc040b340), md5_s2_1);
		C = D + ROTATE_LEFT((C + G(D, A, B) + X[11] + 0x265e5a51), md5_s2_2);
		B = C + ROTATE_LEFT((B + G(C, D, A) + X[0] + 0xe9b6c7aa), md5_s2_3);
		A = B + ROTATE_LEFT((A + G(B, C, D) + X[5] + 0xd62f105d), md5_s2_0);
		D = A + ROTATE_LEFT((D + G(A, B, C) + X[10] + 0x02441453), md5_s2_1);
		C = D + ROTATE_LEFT((C + G(D, A, B) + X[15] + 0xd8a1e681), md5_s2_2);
		B = C + ROTATE_LEFT((B + G(C, D, A) + X[4] + 0xe7d3fbc8), md5_s2_3);
		A = B + ROTATE_LEFT((A + G(B, C, D) + X[9] + 0x21e1cde6), md5_s2_0);
		D = A + ROTATE_LEFT((D + G(A, B, C) + X[14] + 0xc33707d6), md5_s2_1);
		C = D + ROTATE_LEFT((C + G(D, A, B) + X[3] + 0xf4d50d87), md5_s2_2);
		B = C + ROTATE_LEFT((B + G(C, D, A) + X[8] + 0x455a14ed), md5_s2_3);
		A = B + ROTATE_LEFT((A + G(B, C, D) + X[13] + 0xa9e3e905), md5_s2_0);
		D = A + ROTATE_LEFT((D + G(A, B, C) + X[2] + 0xfcefa3f8), md5_s2_1);
		C = D + ROTATE_LEFT((C + G(D, A, B) + X[7] + 0x676f02d9), md5_s2_2);
		B = C + ROTATE_LEFT((B + G(C, D, A) + X[12] + 0x8d2a4c8a), md5_s2_3);
		/// round three (unrolled)
		A = B + ROTATE_LEFT((A + H(B, C, D) + X[5] + 0xfffa3942), md5_s3_0);
		D = A + ROTATE_LEFT((D + H(A, B, C) + X[8] + 0x8771f681), md5_s3_1);
		C = D + ROTATE_LEFT((C + H(D, A, B) + X[11] + 0x6d9d6122), md5_s3_2);
		B = C + ROTATE_LEFT((B + H(C, D, A) + X[14] + 0xfde5380c), md5_s3_3);
		A = B + ROTATE_LEFT((A + H(B, C, D) + X[1] + 0xa4beea44), md5_s3_0);
		D = A + ROTATE_LEFT((D + H(A, B, C) + X[4] + 0x4bdecfa9), md5_s3_1);
		C = D + ROTATE_LEFT((C + H(D, A, B) + X[7] + 0xf6bb4b60), md5_s3_2);
		B = C + ROTATE_LEFT((B + H(C, D, A) + X[10] + 0xbebfbc70), md5_s3_3);
		A = B + ROTATE_LEFT((A + H(B, C, D) + X[13] + 0x289b7ec6), md5_s3_0);
		D = A + ROTATE_LEFT((D + H(A, B, C) + X[0] + 0xeaa127fa), md5_s3_1);
		C = D + ROTATE_LEFT((C + H(D, A, B) + X[3] + 0xd4ef3085), md5_s3_2);
		B = C + ROTATE_LEFT((B + H(C, D, A) + X[6] + 0x04881d05), md5_s3_3);
		A = B + ROTATE_LEFT((A + H(B, C, D) + X[9] + 0xd9d4d039), md5_s3_0);
		D = A + ROTATE_LEFT((D + H(A, B, C) + X[12] + 0xe6db99e5), md5_s3_1);
		C = D + ROTATE_LEFT((C + H(D, A, B) + X[15] + 0x1fa27cf8), md5_s3_2);
		B = C + ROTATE_LEFT((B + H(C, D, A) + X[2] + 0xc4ac5665), md5_s3_3);
		/// round four (unrolled)
		A = B + ROTATE_LEFT((A + I(B, C, D) + X[0] + 0xf4292244), md5_s4_0);
		D = A + ROTATE_LEFT((D + I(A, B, C) + X[7] + 0x432aff97), md5_s4_1);
		C = D + ROTATE_LEFT((C + I(D, A, B) + X[14] + 0xab9423a7), md5_s4_2);
		B = C + ROTATE_LEFT((B + I(C, D, A) + X[5] + 0xfc93a039), md5_s4_3);
		A = B + ROTATE_LEFT((A + I(B, C, D) + X[12] + 0x655b59c3), md5_s4_0);
		D = A + ROTATE_LEFT((D + I(A, B, C) + X[3] + 0x8f0ccc92), md5_s4_1);
		C = D + ROTATE_LEFT((C + I(D, A, B) + X[10] + 0xffeff47d), md5_s4_2);
		B = C + ROTATE_LEFT((B + I(C, D, A) + X[1] + 0x85845dd1), md5_s4_3);
		A = B + ROTATE_LEFT((A + I(B, C, D) + X[8] + 0x6fa87e4f), md5_s4_0);
		D = A + ROTATE_LEFT((D + I(A, B, C) + X[15] + 0xfe2ce6e0), md5_s4_1);
		C = D + ROTATE_LEFT((C + I(D, A, B) + X[6] + 0xa3014314), md5_s4_2);
		B = C + ROTATE_LEFT((B + I(C, D, A) + X[13] + 0x4e0811a1), md5_s4_3);
		A = B + ROTATE_LEFT((A + I(B, C, D) + X[4] + 0xf7537e82), md5_s4_0);
		D = A + ROTATE_LEFT((D + I(A, B, C) + X[11] + 0xbd3af235), md5_s4_1);
		C = D + ROTATE_LEFT((C + I(D, A, B) + X[2] + 0x2ad7d2bb), md5_s4_2);
		B = C + ROTATE_LEFT((B + I(C, D, A) + X[9] + 0xeb86d391), md5_s4_3);

		AA = AA + A;
		BB = BB + B;
		CC = CC + C;
		DD = DD + D;
	}

	// release allocated memory.
	//free(nm8);

	digest[0] = AA;
	digest[1] = BB;
	digest[2] = CC;
	digest[3] = DD;
	return;
}

Posted: **Wed Jul 04, 2007 2:49 pm**

what is

Code: Select all

#include <asm/processor.h>

?

Posted: **Wed Jul 04, 2007 2:59 pm**

optimizing idea. temporarly diassable interupts.

Posted: **Wed Jul 04, 2007 3:00 pm**

It generates the assembly instruction for a prefetch. If you do not have it just delete the prefetch..(..) calls from the function.

I noticed a small speed improvement when using them, but the major improvement came from placing more data on the stack, and removing memcpy and memset.

optimizing idea. temporarly diassable interupts.

Yea. You could write a small free standing program and load it with GRUB to see how much faster it would compute with out any interrupts or task switching.

Posted: **Wed Jul 04, 2007 3:10 pm**

Those functions don't seem to be present on OpenBSD, Neither is that include...

Apparently, They are not standard...

And __attribute__((__fastcall__)) isn't in GCC 3x either..

Posted: **Wed Jul 04, 2007 3:24 pm**

~1,450,000 h/s on my 1.4GHz

the idea of a bootable cracker is great!, I'll have to try that. I don't know how much speed you'll get though because Windows/Linux gives 99% CPU usage

Posted: **Wed Jul 04, 2007 3:36 pm**

could someone explain this to me.

#define ROTATE_LEFT(x, n) (((x) << (n)) | ((x) >> (32-(n))))
#define F(x,y,z) (((x) & (y)) | ((~(x)) & (z)))
#define G(x,y,z) (((x) & (z)) | ((y) & (~(z))))
#define H(x,y,z) ((x) ^ (y) ^ (z))
#define I(x,y,z) ((y) ^ ((x) | (~(z))))

what does the |, ~, >>, <<, and ^ mean? i didn't study c long enough to learn them.

Posted: **Wed Jul 04, 2007 3:39 pm**

Ninjarider wrote:could someone explain this to me.

#define ROTATE_LEFT(x, n) (((x) << (n)) | ((x) >> (32-(n))))
#define F(x,y,z) (((x) & (y)) | ((~(x)) & (z)))
#define G(x,y,z) (((x) & (z)) | ((y) & (~(z))))
#define H(x,y,z) ((x) ^ (y) ^ (z))
#define I(x,y,z) ((y) ^ ((x) | (~(z))))

what does the |, ~, >>, <<, and ^ mean? i didn't study c long enough to learn them.

They're bitwise operators. This has a lot more about them.

Posted: **Wed Jul 04, 2007 4:02 pm**

after the refresher i need to clarify 1 of the bit operators. the ~ means to xor a {variable} by 1.

Posted: **Wed Jul 04, 2007 4:09 pm**

GLneo wrote:well, i tried your code and it is about half as fast a my current build, i dont care what you do with the code, i just thought what ever you were doing you could do it better with the newer code.

p.s. i really like the BOINC idea!

We agree on the BOINC idea but not yet on the code:

i ran your latest build and got:

cocaine 5b9104dc42e2f8fdd3f6a801f7656d65 5
Collision Found!: 5b9104dc42e2f8fdd3f6a801f7656d65 is : glich
Cracking took: 2.38s
Average h/s: 1364077.47 h/s

however my latest build gave

$ ./brute.exe 5b9104dc42e2f8fdd3f6a801f7656d65 5
Collision Found!
hash[5b9104dc42e2f8fdd3f6a801f7656d65] = 'glich'
- time: 1.49s
- avg. hash/s: 2181605.39 h/s

so did you test with my latest version?

PS. test was run on a centrino laptop instead of the earlier mentioned core 2

@ kevin

could you test your thread model with my md5 implementation or adapt your application like my MD5, otherwise i can do it but that takes another day (i am going to bed).

Posted: **Wed Jul 04, 2007 4:17 pm**

it above your last post

edit: hmm.. a previous post got deleted

Posted: **Wed Jul 04, 2007 4:19 pm**

the post that was deleted was me asking for the code ( i didn't see you posted it

)

Posted: **Wed Jul 04, 2007 4:24 pm**

OK I've just retested it and yours is ~1,160,000 and mine is ~1,430,000, but yours has optimizations mine doesn't and vise versa so i'm going to combine them and see what i get.

Posted: **Wed Jul 04, 2007 4:47 pm**

OK, I've combined optimizations and I'm now getting ~1,980,000 h/s !!! the product is attached.

Posted: **Wed Jul 04, 2007 4:58 pm**

I compared both of your implementations on a Pentium 2 @ 400Mhz..

os64dev's implementation with the following compilation optimizations:
-O3 -march=pentium2 -mmmx -ffast-math -funsafe-math-optimizations -ffinite-math-only

It had a time between 7.32s and 7.40s, each of the 3 times I ran it..
Collision Found!
hash[5b9104dc42e2f8fdd3f6a801f7656d65] = 'glich'
- time: 7.32s
- avg. hash/s: 442579.79 h/s

For GLneo's implementation with the same compile optimizations:
-O3 -march=pentium2 -mmmx -ffast-math -funsafe-math-optimizations -ffinite-math-only

It had a time between 7.18s and 7.20s.. I also ran it 3 times..
Collision Found!: 5b9104dc42e2f8fdd3f6a801f7656d65 is : glich
Cracking took: 7.18s
Average h/s: 451209.48 h/s

Still pretty good, Kevin McGuire's implementation takes 26 seconds..

I'm bored now.. Off to read another topic

OSDev.org

brute forcer