Page 11 of 13

Posted: Fri Jul 06, 2007 4:09 pm
by Kevin McGuire
I got it to h/s:1165755.132529, the barrel rolls were holding it back. ..

[edit]
Oh. I am sorry I mean 4663020.530116 hashes per second, but that is still right along with my thinking of maybe reaching twice the speed with rigorous optimizing..

Code: Select all

/*
 * Four-way (SIMD-style) MD5 compression of a single 64-byte block.
 *
 * NOTE(review): the posted version reused the round-1 T constants, the
 * round-1 shift amounts and message word 0 for every one of the 64 steps,
 * and its MD5_OP4 macro clobbered the live b/c state arrays by using them
 * as rotate scratch space — so it never computed real MD5. The tables and
 * word schedule below are the ones from RFC 1321.
 */
#define ROTATE_LEFT(x, n) (((x) << (n)) | ((x) >> (32-(n))))
#define F(x,y,z) (((x) & (y)) | ((~(x)) & (z)))
#define G(x,y,z) (((x) & (z)) | ((y) & (~(z))))
#define H(x,y,z) ((x) ^ (y) ^ (z))
#define I(x,y,z) ((y) ^ ((x) | (~(z))))

/* Per-round left-rotation amounts (RFC 1321, section 3.4). */
#define md5_s1_0 7
#define md5_s1_1 12
#define md5_s1_2 17
#define md5_s1_3 22
#define md5_s2_0 5
#define md5_s2_1 9
#define md5_s2_2 14
#define md5_s2_3 20
#define md5_s3_0 4
#define md5_s3_1 11
#define md5_s3_2 16
#define md5_s3_3 23
#define md5_s4_0 6
#define md5_s4_1 10
#define md5_s4_2 15
#define md5_s4_3 21

/*
 * One MD5 step applied to all four lanes at once:
 *
 *     a = b + ((a + f(b,c,d) + X[k] + t) <<< s)
 *
 * X is laid out word-major so the four lanes read adjacent memory (the
 * SSE-friendly interleaving discussed in the thread): X[(k)*4 + lane] is
 * message word k of message `lane`. Requires uint32_t q and uint32_t *X
 * in the enclosing scope. No caller-visible array is used as scratch
 * space, so b/c/d survive intact for the next step.
 */
#define MD5_STEP4(f, a, b, c, d, k, t, s) \
	do { \
		for (q = 0; q < 4; ++q) \
		{ \
			a[q] = b[q] + ROTATE_LEFT(a[q] + f(b[q], c[q], d[q]) + X[(k)*4+q] + (t), (s)); \
		} \
	} while (0)

/*
 * Hash four interleaved, already-padded 64-byte message blocks and compare
 * each result against a target digest.
 *
 * X      - 64 uint32_t: message word k of lane L is at X[k*4 + L]; the
 *          caller must have applied MD5 padding and the bit-length words.
 * digest - the 4 target state words (raw little-endian A/B/C/D output of
 *          MD5, same form this function produces).
 *
 * Returns lane+1 (1..4) for the first lane whose digest matches; otherwise
 * lane 0's A word. (Quirk kept from the original interface: callers treat
 * 1..4 as a hit, so a miss whose A word happens to be 1..4 would read as a
 * false positive — TODO confirm callers tolerate this.)
 */
uint32_t md5_hash4(uint32_t *X, uint32_t *digest)
{
	uint32_t 	__attribute__((aligned(128))) a[4],
			__attribute__((aligned(128))) b[4],
			__attribute__((aligned(128))) c[4],
			__attribute__((aligned(128))) d[4];
	uint32_t x, q;

	/* Standard MD5 initial state, replicated into every lane. */
	for (x = 0; x < 4; ++x)
	{
		a[x] = 0x67452301;
		b[x] = 0xefcdab89;
		c[x] = 0x98badcfe;
		d[x] = 0x10325476;
	}

	/* Round 1: F, message words 0..15 in order. */
	MD5_STEP4(F, a, b, c, d,  0, 0xd76aa478, md5_s1_0);
	MD5_STEP4(F, d, a, b, c,  1, 0xe8c7b756, md5_s1_1);
	MD5_STEP4(F, c, d, a, b,  2, 0x242070db, md5_s1_2);
	MD5_STEP4(F, b, c, d, a,  3, 0xc1bdceee, md5_s1_3);
	MD5_STEP4(F, a, b, c, d,  4, 0xf57c0faf, md5_s1_0);
	MD5_STEP4(F, d, a, b, c,  5, 0x4787c62a, md5_s1_1);
	MD5_STEP4(F, c, d, a, b,  6, 0xa8304613, md5_s1_2);
	MD5_STEP4(F, b, c, d, a,  7, 0xfd469501, md5_s1_3);
	MD5_STEP4(F, a, b, c, d,  8, 0x698098d8, md5_s1_0);
	MD5_STEP4(F, d, a, b, c,  9, 0x8b44f7af, md5_s1_1);
	MD5_STEP4(F, c, d, a, b, 10, 0xffff5bb1, md5_s1_2);
	MD5_STEP4(F, b, c, d, a, 11, 0x895cd7be, md5_s1_3);
	MD5_STEP4(F, a, b, c, d, 12, 0x6b901122, md5_s1_0);
	MD5_STEP4(F, d, a, b, c, 13, 0xfd987193, md5_s1_1);
	MD5_STEP4(F, c, d, a, b, 14, 0xa679438e, md5_s1_2);
	MD5_STEP4(F, b, c, d, a, 15, 0x49b40821, md5_s1_3);

	/* Round 2: G, word order 1,6,11,0,5,10,15,4,9,14,3,8,13,2,7,12. */
	MD5_STEP4(G, a, b, c, d,  1, 0xf61e2562, md5_s2_0);
	MD5_STEP4(G, d, a, b, c,  6, 0xc040b340, md5_s2_1);
	MD5_STEP4(G, c, d, a, b, 11, 0x265e5a51, md5_s2_2);
	MD5_STEP4(G, b, c, d, a,  0, 0xe9b6c7aa, md5_s2_3);
	MD5_STEP4(G, a, b, c, d,  5, 0xd62f105d, md5_s2_0);
	MD5_STEP4(G, d, a, b, c, 10, 0x02441453, md5_s2_1);
	MD5_STEP4(G, c, d, a, b, 15, 0xd8a1e681, md5_s2_2);
	MD5_STEP4(G, b, c, d, a,  4, 0xe7d3fbc8, md5_s2_3);
	MD5_STEP4(G, a, b, c, d,  9, 0x21e1cde6, md5_s2_0);
	MD5_STEP4(G, d, a, b, c, 14, 0xc33707d6, md5_s2_1);
	MD5_STEP4(G, c, d, a, b,  3, 0xf4d50d87, md5_s2_2);
	MD5_STEP4(G, b, c, d, a,  8, 0x455a14ed, md5_s2_3);
	MD5_STEP4(G, a, b, c, d, 13, 0xa9e3e905, md5_s2_0);
	MD5_STEP4(G, d, a, b, c,  2, 0xfcefa3f8, md5_s2_1);
	MD5_STEP4(G, c, d, a, b,  7, 0x676f02d9, md5_s2_2);
	MD5_STEP4(G, b, c, d, a, 12, 0x8d2a4c8a, md5_s2_3);

	/* Round 3: H, word order 5,8,11,14,1,4,7,10,13,0,3,6,9,12,15,2. */
	MD5_STEP4(H, a, b, c, d,  5, 0xfffa3942, md5_s3_0);
	MD5_STEP4(H, d, a, b, c,  8, 0x8771f681, md5_s3_1);
	MD5_STEP4(H, c, d, a, b, 11, 0x6d9d6122, md5_s3_2);
	MD5_STEP4(H, b, c, d, a, 14, 0xfde5380c, md5_s3_3);
	MD5_STEP4(H, a, b, c, d,  1, 0xa4beea44, md5_s3_0);
	MD5_STEP4(H, d, a, b, c,  4, 0x4bdecfa9, md5_s3_1);
	MD5_STEP4(H, c, d, a, b,  7, 0xf6bb4b60, md5_s3_2);
	MD5_STEP4(H, b, c, d, a, 10, 0xbebfbc70, md5_s3_3);
	MD5_STEP4(H, a, b, c, d, 13, 0x289b7ec6, md5_s3_0);
	MD5_STEP4(H, d, a, b, c,  0, 0xeaa127fa, md5_s3_1);
	MD5_STEP4(H, c, d, a, b,  3, 0xd4ef3085, md5_s3_2);
	MD5_STEP4(H, b, c, d, a,  6, 0x04881d05, md5_s3_3);
	MD5_STEP4(H, a, b, c, d,  9, 0xd9d4d039, md5_s3_0);
	MD5_STEP4(H, d, a, b, c, 12, 0xe6db99e5, md5_s3_1);
	MD5_STEP4(H, c, d, a, b, 15, 0x1fa27cf8, md5_s3_2);
	MD5_STEP4(H, b, c, d, a,  2, 0xc4ac5665, md5_s3_3);

	/* Round 4: I, word order 0,7,14,5,12,3,10,1,8,15,6,13,4,11,2,9. */
	MD5_STEP4(I, a, b, c, d,  0, 0xf4292244, md5_s4_0);
	MD5_STEP4(I, d, a, b, c,  7, 0x432aff97, md5_s4_1);
	MD5_STEP4(I, c, d, a, b, 14, 0xab9423a7, md5_s4_2);
	MD5_STEP4(I, b, c, d, a,  5, 0xfc93a039, md5_s4_3);
	MD5_STEP4(I, a, b, c, d, 12, 0x655b59c3, md5_s4_0);
	MD5_STEP4(I, d, a, b, c,  3, 0x8f0ccc92, md5_s4_1);
	MD5_STEP4(I, c, d, a, b, 10, 0xffeff47d, md5_s4_2);
	MD5_STEP4(I, b, c, d, a,  1, 0x85845dd1, md5_s4_3);
	MD5_STEP4(I, a, b, c, d,  8, 0x6fa87e4f, md5_s4_0);
	MD5_STEP4(I, d, a, b, c, 15, 0xfe2ce6e0, md5_s4_1);
	MD5_STEP4(I, c, d, a, b,  6, 0xa3014314, md5_s4_2);
	MD5_STEP4(I, b, c, d, a, 13, 0x4e0811a1, md5_s4_3);
	MD5_STEP4(I, a, b, c, d,  4, 0xf7537e82, md5_s4_0);
	MD5_STEP4(I, d, a, b, c, 11, 0xbd3af235, md5_s4_1);
	MD5_STEP4(I, c, d, a, b,  2, 0x2ad7d2bb, md5_s4_2);
	MD5_STEP4(I, b, c, d, a,  9, 0xeb86d391, md5_s4_3);

	/* Final add of the initial state (RFC 1321, step 4). */
	for (x = 0; x < 4; ++x)
	{
		a[x] += 0x67452301;
		b[x] += 0xefcdab89;
		c[x] += 0x98badcfe;
		d[x] += 0x10325476;
	}

	/* Report the first matching lane as lane+1 (1..4). */
	for (x = 0; x < 4; ++x)
	{
		if ((digest[0] == a[x]) && (digest[1] == b[x]) && (digest[2] == c[x]) && (digest[3] == d[x]))
		{
			return x + 1;
		}
	}
	return a[0];
}

Posted: Fri Jul 06, 2007 4:13 pm
by os64dev
great work with the SSE version. Do you see any posibilities to pump it up by a factor 4, 5 ? What about SSE to CPU stalls, etc. SSE is fun but a whole new range of problems show their face. nonetheless congrats

edit: how fast did my version run on your system?

Posted: Fri Jul 06, 2007 4:23 pm
by Kevin McGuire
How fast did my version run on your system?
Well. Like you said a lot more problems show their face. One being you need to generate four sequences at a time and intertwine them together in the buffer that the md5_hash4 function reads.

Such that md5_hash4_buffer[x*4+y], where x is the byte position for a individual message and y is the index for each message. Where y can be zero through four (or zero through four of the sequences you generate).

Neither of our implementations are currently setup to generate four sequences, although it would not take long. The only slow down is your implementation is different and it would take long for me to insert it into yours.

At the moment I just insert the md5_hash4 function where the md5_hash function was, and increase the buffer to two-hundred-fifty-six bytes with random garbage in it (garbage from what ever was on the stack when the function created the frame). Then I run it and multiply the loops by four. The whole purpose is just to gather some timing information about the md5_hash4 function.

Well, my existing code is making a little extra garbage as it writes across the intertwined buffers for each message.

If you meant how fast was brute-mt.cc then the answer was 2.9 - 3.0 million per second on my computer. Mine reported 3.0 - 3.1 million which is why I was thinking mine was a little faster. When, Candy comes back online I will login to his dual core box for testing and see how fast they run.

Posted: Fri Jul 06, 2007 6:14 pm
by frank
Kevin McGuire wrote:Does this one do any better? I am too afraid to report my findings since the last time I pursued what I thought was faster was not.

Compile
gcc md5.c -o md5 -O3
Options
md5 [hash] [minimum-length] [maximum-length] [thread-count]

Try it with the hash:
d6a6bc0db10694a2d90e3a69648f3a03 = hacker (longer run time)

Also multiple threads on a UNI can increase the cracking time by starting at different offsets in the message space.
1 Thread:

Code: Select all

$ time ./md5 d6a6bc0db10694a2d90e3a69648f3a03 6 6 1
initializing..
digest_target is 0dbca6d6a29406b1693a0ed9033a8f64
cm:abcdefghijklmnopqrstuvwxyz
running
time: 64.460000
h/s:3164.573612
(1) I found a collision!
message:hacker
g_thread_active_cnt:0

real    1m5.457s
user    1m4.366s
sys     0m0.124s
2 Threads: (Dual core)

Code: Select all

$ time ./md5 d6a6bc0db10694a2d90e3a69648f3a03 6 6 2
initializing..
digest_target is 0dbca6d6a29406b1693a0ed9033a8f64
cm:abcdefghijcm:kalbmcndoepfqgrhsitjukvlwmxnyozp
running
qrstuvwxyz
running
time: 33.588000
h/s:1474.649488
(2) I found a collision!
message:hacker
g_thread_active_cnt:1

real    0m17.840s
user    0m33.587s
sys     0m0.062s
Heres where it gets interesting. 3 threads

Code: Select all

$ time ./md5 d6a6bc0db10694a2d90e3a69648f3a03 6 6 3
initializing..
digest_target is 0dbca6d6a29406b1693a0ed9033a8f64
cm:abcdefghijklmnopqrstuvwxyz
running
cm:cm:aabcdefghijklmnopqrstuvwxyz
running
bcdefghijklmnopqrstuvwxyz
running
time: 18.548000
h/s:748.673658
(3) I found a collision!
message:hacker
g_thread_active_cnt:2

real    0m9.844s
user    0m18.610s
sys     0m0.092s
I think that everyone is trying to optimize their code for 'hacker' and it will produce even worse results when someone tries to crack something else.

Posted: Fri Jul 06, 2007 6:20 pm
by GLneo
good point, maybe a pass like zzzzz would fix that? because i think the threads might be starting at: hacaaa, then getting it instantly

Posted: Fri Jul 06, 2007 7:13 pm
by Kevin McGuire
I think that everyone is trying to optimize their code for 'hacker' and it will produce even worse results when someone tries to crack something else.
(Frank, when you read this: I am just attempting to ensure that one of my biggest pet peeves is understood so that no misinterpretation can be made. Not saying what you said is quote-unquote wrong, just a little open to interpretation by 'some' people.)
No, I am a very professional person except for the occasional emotionally delusional righteousness outbursts. I in no way have or ever will make any optimization decisions based on the hash I am finding a collision for. The result is purely coincidental. The same applies in any work of mine. I truly believe in intellectual honesty, and have a hate towards the inverse.

When optimizing that this morning I was looking at the hashes per second, and noticed that mine also completed very quickly. I was hoping that osdev64 had made a major flaw in his multi-thread implementation. The timing results from your post earlier, Frank, looked exactly like the problem I was having early on with my implementation. I should have taken a inspection of his code as a remedy for that situation, which is further more an example of my high quality of honesty. When I do something I like to do it - not someone else.

There is no easy way to optimize the code just for hacker with out leaving a visible mark in the implementation. I used the most natural method for splitting the message space for each thread. The exact same sequence_next function is used across all of our implementations, what I forgot to think about was the splitting of the message being differently done.

I think that everyone is trying to optimize their code for 'hacker' and it will produce even worse results when someone tries to crack something else.
Would be better said as:
I think that Kevin is optimizing his code with out using a wide variety of message hashes to circumvent the possibility of reporting a misleading result.
good point, maybe a pass like zzzzz would fix that? because i think the threads might be starting at: hacaaa, then getting it instantly
Thats all relative to the order of testing for coverage of a key space, which could simply be changed with the order of the characters in the character map. Not really a very interesting mechanism for what we are trying to do. This is exactly what osdev64 mentioned to me earlier.

If you take a look at my code you should be able to determine that it is readable and well laid out which would be the opposite of a disorganized quick and dirty lets get it working attempt. It was just purely coincidental. No saying anyone else's was that way just implying that mine most certainly is not.
I am too afraid to report my findings since the last time I pursued what I thought was faster was not.
It was faster in hashes per second single threaded on my computer, and faster on yours single threaded. You even use the one with the clock() call, while later I changed it to gettimeofday which also produces the same results.

I wish I could say and I feel I should be able to say it was faster with two threads, but doing the hashes per second multiplied by two does not produce but exactly the same hashes per second as the single threaded version which is strange since it is obviously running on the two cores. The user time is double the real time meaning it got twice the normal processor time. It just does not add up sometimes..

And do I sound like a big whinny baby?

Posted: Fri Jul 06, 2007 7:24 pm
by Kevin McGuire
Also whatever brute-mt.cc is doing is strange. osdev64 most likely does not even realize that the results are strange if they are indeed strange and I am not just having some sort of memory lapse.

2 Threads

Code: Select all

$ time ./brute d6a6bc0db10694a2d90e3a69648f3a03 6 2
threadList[t].sequence[0]: 0
threadList[t].sequence[0]: 13
Collision Found!
hash[d6a6bc0db10694a2d90e3a69648f3a03] = 'hacker'
time: 38.87s
- avg. hash/s: 4493377.81 h/s
#done.

real    0m38.977s
user    1m14.256s
sys     0m0.093s
1 Thread

Code: Select all

$ time ./brute d6a6bc0db10694a2d90e3a69648f3a03 6 1
threadList[t].sequence[0]: 0
Collision Found!
hash[d6a6bc0db10694a2d90e3a69648f3a03] = 'hacker'
time: 26.98s
- avg. hash/s: 3084769.79 h/s
#done.

real    0m27.098s
user    0m25.864s
sys     0m0.139s
The one thread shows a couple hundred thousand hashes slower than mine, but when using two threads it only shows a 145 percent speed improvement. The brute-mt.cc should gain at least a 150 percent improvement if each thread is on a different core, or it should show 50% of the one-thread hashes-per-second value if both are run on the same core.

I can not understand why that is showing. If you look at mine you can tell from Frank's results that all the threads are running on the same core since they are very close mathematically when dividing by 1/2 and 1/3.

[edit]
After reviewing his code I can not find how he is generating the message sequence in a different order than mine, and taking longer to find the message. We both split it right down the middle with two threads.

It might be that the gettimeofday function will report incorrect results, since it does not account for time when another process is running on the processor.

Posted: Fri Jul 06, 2007 8:25 pm
by Kevin McGuire
Such that md5_hash4_buffer[x*4+y], where x is the byte position for a individual message and y is the index for each message. Where y can be zero through four (or zero through four of the sequences you generate).
I have to make a correction here for the SSE instructions. It would actually be: md5buffer[(bytepos/4)*16+messageIndex*4+(bytepos%4)]

Posted: Fri Jul 06, 2007 9:46 pm
by Kevin McGuire
I might be able to find some more things to squeeze a little out of it. The ceiling will be 3.9 million in my core loop. Adding back in my code drops to 3.3 million. I feel I will be able to only get it between these two numbers, unless the core computational loop and design has a radical change.

The generic function (non SSE2) gets 3.1 million, not sure if SSE2 will become worth using.

Posted: Fri Jul 06, 2007 10:04 pm
by frank
@everyone
The comment I made about the optimizing for a certain hash was not meant as Kevin took it. It was merely a suggestion to use more hashes when testing to eliminate the chance of optimizing towards 1 particular hash.

@Kevin
I can't test this latest version. I apparently lack asm/processor.h.

Posted: Fri Jul 06, 2007 11:11 pm
by Kevin McGuire
I can't test this latest version. I apparently lack asm/processor.h.
Oops. I should have mentioned that in my post above. That one is not functional, as in it will never find the correct hash, because of debugging values left in. It would work if someone replaced the values where I have ones. The reason I posted it in that state was just in case someone wanted to take a look at it, such as os64dev or GLneo - but I think they are satisfied with their own results. I abandoned using the SSE since the performance increase was not what I wanted. :(

It is most likely good that processor.h was missing, otherwise you would have just experienced a dud for a program, but I do appreciate you testing it and the previous one on your dual core! :D

Posted: Fri Jul 06, 2007 11:56 pm
by os64dev
frank wrote:@everyone
The comment I made about the optimizing for a certain hash was not meant as Kevin took it. It was merely a suggestion to use more hashes when testing to eliminate the chance of optimizing towards 1 particular hash.
I am never looking at the total application time only the hashes per second. 'hacker' i use because it used as a reference method on this forum. if i use a password with ends on an a or a password that ends on a z it means a factor 26 in the total application time. Yet the hashes per second remain the same. therefore the hashes are the only interesting data.

@kevin
I do understand the problem with the scalability but I haven't looked into it yet. Probably I'll do a code rewrite and make it more readable.

Posted: Sat Jul 07, 2007 3:18 pm
by Candy
os64dev wrote:Personally i am very iterested in the results of the multithreaded version on Candy's machine.
I'll get the box running tomorrow morning first thing, as soon as I get home again. About 11:00 GMT I think...

Posted: Sun Jul 08, 2007 7:16 am
by os64dev
ok, I rewrote part of the code to increase cache usage, removed some bugs, and optimised the code for a key length < 31, because it is very unlikely someone has a password that long and no one will wait for the brute forcer to finish that long a sequence. I dunno if the cache improvement will work on all systems, but my laptop gave a rather pleasant performance boost of roughly 35%. Also the scalability issues seem to be fixed.

the old version gave.
$ time ./brute-old.exe d6a6bc0db10694a2d90e3a69648f3a03 6 2
threadList[t].sequence[0]: 0
threadList[t].sequence[0]: 13
Collision Found!
hash[d6a6bc0db10694a2d90e3a69648f3a03] = 'hacker'
time: 34.97s
- avg. hash/s: 4733603.01 h/s
#done.

real 0m35.453s
user 1m9.968s
sys 0m0.046s
new version:
$ time ./brute.exe d6a6bc0db10694a2d90e3a69648f3a03 6 2
threadList[t].sequence[0]: 0
threadList[t].sequence[0]: 13
Collision Found!
hash[d6a6bc0db10694a2d90e3a69648f3a03] = 'hacker'
time: 24.09s
- avg. hash/s: 6892816.59 h/s
#done.

real 0m24.625s
user 0m48.468s
sys 0m0.108s

Posted: Sun Jul 08, 2007 7:26 am
by Candy
Message from syslogd@blackbox at Sun Jul 8 15:44:50 2007 ...
blackbox kernel: CPU1: Temperature above threshold

Message from syslogd@blackbox at Sun Jul 8 15:44:50 2007 ...
blackbox kernel: CPU1: Running in modulated clock mode
Collision Found!
hash[d6a6bc0db10694a2d90e3a69648f3a03] = 'hacker'
time: 19.07s
- avg. hash/s: 8668813.26 h/s
#done.

real 0m19.074s
user 0m37.760s
sys 0m0.193s
I think I have to check up on the cooling before I continue testing. Still, this is the highest result I've seen by far.