brute forcer

Programming, for all ages and all languages.
Post Reply
User avatar
Kevin McGuire
Member
Member
Posts: 843
Joined: Tue Nov 09, 2004 12:00 am
Location: United States
Contact:

Post by Kevin McGuire »

I made a few changes and took it from 1.5 million per second to 2.1 - 2.2 million per second on my processor, which seems like a good jump.

Code: Select all

#include <asm/processor.h>
void __attribute__((__fastcall__)) __attribute__((__always_inline__))
	 md5_hash(uint8_t *message, uint32_t mlength, uint32_t *digest)
{
	uint32_t 	*nm32;
	uint8_t	*nm8;
	uint32_t 	nmlength;
	uint32_t 	i, j;
	uint32_t 	           AA, BB, CC, DD;
	register uint32_t 	A, B, C, D;
	uint32_t 	X[16];
	uint8_t	TMP[mlength%64 == 0 ? mlength + 64 : mlength + (64 - (mlength%64))];
	// do padding so message is a multiple of 512 bits (64 bytes)
	nmlength = mlength%64 == 0 ? mlength + 64 : mlength + (64 - (mlength%64));
	//if((mlength%64) == 0)
	//{
	//	nmlength = mlength + 64;
	//}else{
	//	nmlength = mlength + (64 - (mlength%64));
	//}
	//nm8 = (uint8_t*)malloc(nmlength);	
        prefetchw(&TMP[0]);
	nm8 = &TMP[0];
	nm32 = (uint32_t*)nm8;
	for(i = 0; i < (nmlength/4); ++i)
	{
		nm32[i] = 0;
	}
	for(i = 0; i < mlength; ++i)
	{
		nm8[i] = message[i];
	}	
	// add a bit one after message, pad remaining with zeros.
	nm8[mlength] = 0x80;
	// append length in bits to message (low word first)
	nm32[(nmlength/4) - 2] = mlength * 8;
	nm32[(nmlength/4) - 1] = 0;
	// hash padded message (nmessage)
	AA = 0x67452301; BB = 0xefcdab89; CC = 0x98badcfe; DD = 0x10325476;
	prefetch(&X[0]);
	prefetchw(&X[0]);
	for(i = 0; i < (nmlength/64); ++i)
	{
		// move block from arbitrary memory location to stack. (could gain speed changing this)
		for(j = 0; j < 16; ++j)
		{
			X[j] = nm32[i*64+j];
		}
		A = AA;
		B = BB;
		C = CC;
		D = DD;
		/// round one (unrolled)
		A = B + ROTATE_LEFT((A + F(B, C, D) + X[0] + 0xd76aa478), md5_s1_0);
		D = A + ROTATE_LEFT((D + F(A, B, C) + X[1] + 0xe8c7b756), md5_s1_1);
		C = D + ROTATE_LEFT((C + F(D, A, B) + X[2] + 0x242070db), md5_s1_2);
		B = C + ROTATE_LEFT((B + F(C, D, A) + X[3] + 0xc1bdceee), md5_s1_3);
		A = B + ROTATE_LEFT((A + F(B, C, D) + X[4] + 0xf57c0faf), md5_s1_0);
		D = A + ROTATE_LEFT((D + F(A, B, C) + X[5] + 0x4787c62a), md5_s1_1);
		C = D + ROTATE_LEFT((C + F(D, A, B) + X[6] + 0xa8304613), md5_s1_2);
		B = C + ROTATE_LEFT((B + F(C, D, A) + X[7] + 0xfd469501), md5_s1_3);
		A = B + ROTATE_LEFT((A + F(B, C, D) + X[8] + 0x698098d8), md5_s1_0);
		D = A + ROTATE_LEFT((D + F(A, B, C) + X[9] + 0x8b44f7af), md5_s1_1);
		C = D + ROTATE_LEFT((C + F(D, A, B) + X[10] + 0xffff5bb1), md5_s1_2);
		B = C + ROTATE_LEFT((B + F(C, D, A) + X[11] + 0x895cd7be), md5_s1_3);
		A = B + ROTATE_LEFT((A + F(B, C, D) + X[12] + 0x6b901122), md5_s1_0);
		D = A + ROTATE_LEFT((D + F(A, B, C) + X[13] + 0xfd987193), md5_s1_1);
		C = D + ROTATE_LEFT((C + F(D, A, B) + X[14] + 0xa679438e), md5_s1_2);
		B = C + ROTATE_LEFT((B + F(C, D, A) + X[15] + 0x49b40821), md5_s1_3);
		/// round two (unrolled)
		A = B + ROTATE_LEFT((A + G(B, C, D) + X[1] + 0xf61e2562), md5_s2_0);
		D = A + ROTATE_LEFT((D + G(A, B, C) + X[6] + 0xc040b340), md5_s2_1);
		C = D + ROTATE_LEFT((C + G(D, A, B) + X[11] + 0x265e5a51), md5_s2_2);
		B = C + ROTATE_LEFT((B + G(C, D, A) + X[0] + 0xe9b6c7aa), md5_s2_3);
		A = B + ROTATE_LEFT((A + G(B, C, D) + X[5] + 0xd62f105d), md5_s2_0);
		D = A + ROTATE_LEFT((D + G(A, B, C) + X[10] + 0x02441453), md5_s2_1);
		C = D + ROTATE_LEFT((C + G(D, A, B) + X[15] + 0xd8a1e681), md5_s2_2);
		B = C + ROTATE_LEFT((B + G(C, D, A) + X[4] + 0xe7d3fbc8), md5_s2_3);
		A = B + ROTATE_LEFT((A + G(B, C, D) + X[9] + 0x21e1cde6), md5_s2_0);
		D = A + ROTATE_LEFT((D + G(A, B, C) + X[14] + 0xc33707d6), md5_s2_1);
		C = D + ROTATE_LEFT((C + G(D, A, B) + X[3] + 0xf4d50d87), md5_s2_2);
		B = C + ROTATE_LEFT((B + G(C, D, A) + X[8] + 0x455a14ed), md5_s2_3);
		A = B + ROTATE_LEFT((A + G(B, C, D) + X[13] + 0xa9e3e905), md5_s2_0);
		D = A + ROTATE_LEFT((D + G(A, B, C) + X[2] + 0xfcefa3f8), md5_s2_1);
		C = D + ROTATE_LEFT((C + G(D, A, B) + X[7] + 0x676f02d9), md5_s2_2);
		B = C + ROTATE_LEFT((B + G(C, D, A) + X[12] + 0x8d2a4c8a), md5_s2_3);
		/// round three (unrolled)
		A = B + ROTATE_LEFT((A + H(B, C, D) + X[5] + 0xfffa3942), md5_s3_0);
		D = A + ROTATE_LEFT((D + H(A, B, C) + X[8] + 0x8771f681), md5_s3_1);
		C = D + ROTATE_LEFT((C + H(D, A, B) + X[11] + 0x6d9d6122), md5_s3_2);
		B = C + ROTATE_LEFT((B + H(C, D, A) + X[14] + 0xfde5380c), md5_s3_3);
		A = B + ROTATE_LEFT((A + H(B, C, D) + X[1] + 0xa4beea44), md5_s3_0);
		D = A + ROTATE_LEFT((D + H(A, B, C) + X[4] + 0x4bdecfa9), md5_s3_1);
		C = D + ROTATE_LEFT((C + H(D, A, B) + X[7] + 0xf6bb4b60), md5_s3_2);
		B = C + ROTATE_LEFT((B + H(C, D, A) + X[10] + 0xbebfbc70), md5_s3_3);
		A = B + ROTATE_LEFT((A + H(B, C, D) + X[13] + 0x289b7ec6), md5_s3_0);
		D = A + ROTATE_LEFT((D + H(A, B, C) + X[0] + 0xeaa127fa), md5_s3_1);
		C = D + ROTATE_LEFT((C + H(D, A, B) + X[3] + 0xd4ef3085), md5_s3_2);
		B = C + ROTATE_LEFT((B + H(C, D, A) + X[6] + 0x04881d05), md5_s3_3);
		A = B + ROTATE_LEFT((A + H(B, C, D) + X[9] + 0xd9d4d039), md5_s3_0);
		D = A + ROTATE_LEFT((D + H(A, B, C) + X[12] + 0xe6db99e5), md5_s3_1);
		C = D + ROTATE_LEFT((C + H(D, A, B) + X[15] + 0x1fa27cf8), md5_s3_2);
		B = C + ROTATE_LEFT((B + H(C, D, A) + X[2] + 0xc4ac5665), md5_s3_3);
		/// round four (unrolled)
		A = B + ROTATE_LEFT((A + I(B, C, D) + X[0] + 0xf4292244), md5_s4_0);
		D = A + ROTATE_LEFT((D + I(A, B, C) + X[7] + 0x432aff97), md5_s4_1);
		C = D + ROTATE_LEFT((C + I(D, A, B) + X[14] + 0xab9423a7), md5_s4_2);
		B = C + ROTATE_LEFT((B + I(C, D, A) + X[5] + 0xfc93a039), md5_s4_3);
		A = B + ROTATE_LEFT((A + I(B, C, D) + X[12] + 0x655b59c3), md5_s4_0);
		D = A + ROTATE_LEFT((D + I(A, B, C) + X[3] + 0x8f0ccc92), md5_s4_1);
		C = D + ROTATE_LEFT((C + I(D, A, B) + X[10] + 0xffeff47d), md5_s4_2);
		B = C + ROTATE_LEFT((B + I(C, D, A) + X[1] + 0x85845dd1), md5_s4_3);
		A = B + ROTATE_LEFT((A + I(B, C, D) + X[8] + 0x6fa87e4f), md5_s4_0);
		D = A + ROTATE_LEFT((D + I(A, B, C) + X[15] + 0xfe2ce6e0), md5_s4_1);
		C = D + ROTATE_LEFT((C + I(D, A, B) + X[6] + 0xa3014314), md5_s4_2);
		B = C + ROTATE_LEFT((B + I(C, D, A) + X[13] + 0x4e0811a1), md5_s4_3);
		A = B + ROTATE_LEFT((A + I(B, C, D) + X[4] + 0xf7537e82), md5_s4_0);
		D = A + ROTATE_LEFT((D + I(A, B, C) + X[11] + 0xbd3af235), md5_s4_1);
		C = D + ROTATE_LEFT((C + I(D, A, B) + X[2] + 0x2ad7d2bb), md5_s4_2);
		B = C + ROTATE_LEFT((B + I(C, D, A) + X[9] + 0xeb86d391), md5_s4_3);

		AA = AA + A;
		BB = BB + B;
		CC = CC + C;
		DD = DD + D;
	}

	// release allocated memory.
	//free(nm8);

	digest[0] = AA;
	digest[1] = BB;
	digest[2] = CC;
	digest[3] = DD;
	return;
}
GLneo
Member
Member
Posts: 237
Joined: Wed Dec 20, 2006 7:56 pm

Post by GLneo »

what is

Code: Select all

#include <asm/processor.h>
?
Ninjarider
Member
Member
Posts: 62
Joined: Fri Jun 29, 2007 8:36 pm

Post by Ninjarider »

optimizing idea. temporarly diassable interupts.
User avatar
Kevin McGuire
Member
Member
Posts: 843
Joined: Tue Nov 09, 2004 12:00 am
Location: United States
Contact:

Post by Kevin McGuire »

It generates the assembly instruction for a prefetch. If you do not have it just delete the prefetch..(..) calls from the function.

I noticed a small speed improvement when using them, but the major improvement came from placing more data on the stack, and removing memcpy and memset.
optimizing idea. temporarly diassable interupts.
Yea. You could write a small free standing program and load it with GRUB to see how much faster it would compute with out any interrupts or task switching.
User avatar
Brynet-Inc
Member
Member
Posts: 2426
Joined: Tue Oct 17, 2006 9:29 pm
Libera.chat IRC: brynet
Location: Canada
Contact:

Post by Brynet-Inc »

Those functions don't seem to be present on OpenBSD, Neither is that include...

Apparently, They are not standard... :wink:

And __attribute__((__fastcall__)) isn't in GCC 3x either..
Image
Twitter: @canadianbryan. Award by smcerm, I stole it. Original was larger.
GLneo
Member
Member
Posts: 237
Joined: Wed Dec 20, 2006 7:56 pm

Post by GLneo »

~1,450,000 h/s on my 1.4GHz

the idea of a bootable cracker is great!, I'll have to try that. I don't know how much speed you'll get though because Windows/Linux gives 99% CPU usage :P
Attachments
cocaine.zip
Current code.
(3.71 KiB) Downloaded 135 times
Ninjarider
Member
Member
Posts: 62
Joined: Fri Jun 29, 2007 8:36 pm

Post by Ninjarider »

could someone explain this to me.
#define ROTATE_LEFT(x, n) (((x) << (n)) | ((x) >> (32-(n))))
#define F(x,y,z) (((x) & (y)) | ((~(x)) & (z)))
#define G(x,y,z) (((x) & (z)) | ((y) & (~(z))))
#define H(x,y,z) ((x) ^ (y) ^ (z))
#define I(x,y,z) ((y) ^ ((x) | (~(z))))
what does the |, ~, >>, <<, and ^ mean? i didn't study c long enough to learn them.
User avatar
Alboin
Member
Member
Posts: 1466
Joined: Thu Jan 04, 2007 3:29 pm
Location: Noricum and Pannonia

Post by Alboin »

Ninjarider wrote:could someone explain this to me.
#define ROTATE_LEFT(x, n) (((x) << (n)) | ((x) >> (32-(n))))
#define F(x,y,z) (((x) & (y)) | ((~(x)) & (z)))
#define G(x,y,z) (((x) & (z)) | ((y) & (~(z))))
#define H(x,y,z) ((x) ^ (y) ^ (z))
#define I(x,y,z) ((y) ^ ((x) | (~(z))))
what does the |, ~, >>, <<, and ^ mean? i didn't study c long enough to learn them.
They're bitwise operators. This has a lot more about them.
C8H10N4O2 | #446691 | Trust the nodes.
Ninjarider
Member
Member
Posts: 62
Joined: Fri Jun 29, 2007 8:36 pm

Post by Ninjarider »

after the refresher i need to clarify 1 of the bit operators. the ~ means to xor a {variable} by 1.
User avatar
os64dev
Member
Member
Posts: 553
Joined: Sat Jan 27, 2007 3:21 pm
Location: Best, Netherlands

Post by os64dev »

GLneo wrote:well, i tried your code and it is about half as fast a my current build, i dont care what you do with the code, i just thought what ever you were doing you could do it better with the newer code.

p.s. i really like the BOINC idea! :D
We agree on the BOINC idea but not yet on the code:

i ran your latest build and got:
cocaine 5b9104dc42e2f8fdd3f6a801f7656d65 5
Collision Found!: 5b9104dc42e2f8fdd3f6a801f7656d65 is : glich
Cracking took: 2.38s
Average h/s: 1364077.47 h/s
however my latest build gave
$ ./brute.exe 5b9104dc42e2f8fdd3f6a801f7656d65 5
Collision Found!
hash[5b9104dc42e2f8fdd3f6a801f7656d65] = 'glich'
- time: 1.49s
- avg. hash/s: 2181605.39 h/s
so did you test with my latest version?

PS. test was run on a centrino laptop instead of the earlier mentioned core 2

@ kevin

could you test your thread model with my md5 implementation or adapt your application like my MD5, otherwise i can do it but that takes another day (i am going to bed).
Attachments

[The extension cc has been deactivated and can no longer be displayed.]

Author of COBOS
User avatar
os64dev
Member
Member
Posts: 553
Joined: Sat Jan 27, 2007 3:21 pm
Location: Best, Netherlands

Post by os64dev »

it above your last post ;-)

edit: hmm.. a previous post got deleted :?
Author of COBOS
GLneo
Member
Member
Posts: 237
Joined: Wed Dec 20, 2006 7:56 pm

Post by GLneo »

the post that was deleted was me asking for the code ( i didn't see you posted it :P )
GLneo
Member
Member
Posts: 237
Joined: Wed Dec 20, 2006 7:56 pm

Post by GLneo »

OK I've just retested it and yours is ~1,160,000 and mine is ~1,430,000, but yours has optimizations mine doesn't and vise versa so i'm going to combine them and see what i get.
GLneo
Member
Member
Posts: 237
Joined: Wed Dec 20, 2006 7:56 pm

Post by GLneo »

OK, I've combined optimizations and I'm now getting ~1,980,000 h/s !!! the product is attached.
Attachments
cocaine.c
(8.33 KiB) Downloaded 141 times
User avatar
Brynet-Inc
Member
Member
Posts: 2426
Joined: Tue Oct 17, 2006 9:29 pm
Libera.chat IRC: brynet
Location: Canada
Contact:

Post by Brynet-Inc »

I compared both of your implementations on a Pentium 2 @ 400Mhz..

os64dev's implementation with the following compilation optimizations:
-O3 -march=pentium2 -mmmx -ffast-math -funsafe-math-optimizations -ffinite-math-only

It had a time between 7.32s and 7.40s, each of the 3 times I ran it..
Collision Found!
hash[5b9104dc42e2f8fdd3f6a801f7656d65] = 'glich'
- time: 7.32s
- avg. hash/s: 442579.79 h/s


For GLneo's implementation with the same compile optimizations:
-O3 -march=pentium2 -mmmx -ffast-math -funsafe-math-optimizations -ffinite-math-only

It had a time between 7.18s and 7.20s.. I also ran it 3 times..
Collision Found!: 5b9104dc42e2f8fdd3f6a801f7656d65 is : glich
Cracking took: 7.18s
Average h/s: 451209.48 h/s


Still pretty good, Kevin McGuire's implementation takes 26 seconds..

I'm bored now.. Off to read another topic :P
Image
Twitter: @canadianbryan. Award by smcerm, I stole it. Original was larger.
Post Reply