Page 3 of 13

Posted: Mon Jul 02, 2007 12:25 pm
by Ninjarider
can you post the code. im on a work computer and were not allowed to download that kind of stuff.

any error messages you get to would help.

Posted: Mon Jul 02, 2007 6:28 pm
by GLneo
well the code is:

Code: Select all

#include <stdlib.h>
#include <stdio.h>
#include <stdint.h>
#include <string.h>
#include <math.h>


#define ROTATE_LEFT(x, n) (((x) << (n)) | ((x) >> (32-(n))))
#define F(x,y,z) (((x) & (y)) | ((~(x)) & (z)))
#define G(x,y,z) (((x) & (z)) | ((y) & (~(z))))
#define H(x,y,z) ((x) ^ (y) ^ (z))
#define I(x,y,z) ((y) ^ ((x) | (~(z))))

void md5_genT(uint32_t *T)
{
   uint_fast32_t i;
   for(i = 0; i < 64; ++i)
   {
      T[i] = 4294967296.0 * fabs(sin((float)(i+1)));
      //printf("%08x\n", T[i]);
      //sleep(1);
   }
}
void md5_hash(uint8_t *message, uint_fast32_t mlength, uint32_t *T)
{
   uint8_t *nm, nmlength;
   uint32_t __s1[] = {7, 12, 17, 22};
   uint32_t __s2[] = {5, 9, 14, 20};
   uint32_t __s3[] = {4, 11, 16, 23};
   uint32_t __s4[] = {6, 10, 15, 21};
   uint32_t __k1[] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
   uint32_t __k2[] = {1, 6, 11, 0, 5, 10, 15, 4, 9, 14, 3, 8, 13, 2, 7, 12};
   uint32_t __k3[] = {5, 8, 11, 14, 1, 4, 7, 10, 13, 0, 3, 6, 9, 12, 15, 2};
   uint32_t __k4[] = {0, 7, 14, 5, 12, 3, 10, 1, 8, 15, 6, 13, 4, 11, 2, 9};
   uint32_t X[16];
   uint32_t i, j, ii, k;
   uint32_t AA, BB, CC, DD;
   uint32_t A, B, C, D;
   // do padding so message is a multiple of 512 bits (64 bytes)
   if((mlength%64) == 0)
   {
      nmlength = mlength + 64;
   }else{
      nmlength = mlength + (64 - (mlength%64));
   }
   nm = (uint8_t*)malloc(nmlength);
   memset(nm, 0, nmlength);
   memcpy(nm, message, mlength);
   printf("nmlength:%u mlength:%u\n", nmlength, mlength);
   nm[mlength] = 0x80;
   ((uint32_t*)&nm[nmlength - 8])[0] = mlength << 3;
   ((uint32_t*)&nm[nmlength - 8])[1] = 0;
   
   // hash padded message (nmessage)
   AA = 0x67452301; BB = 0xefcdab89; CC = 0x98badcfe; DD = 0x10325476;
   for(i = 0; i < (nmlength/64); ++i)
   {
      // create table
      for(j = 0; j < 16; ++j)
      {
         X[j] =     nm[i*64+j*4+0] |
               (nm[i*64+j*4+1] << 8) |
               (nm[i*64+j*4+2] << 16) |
               (nm[i*64+j*4+3] << 24);
      }
      A = AA;
      B = BB;
      C = CC;
      D = DD;
      // round one
      ii = 0;
      for(k = 0; k < 4; ++k)
      {
         A = B + ROTATE_LEFT((A + F(B, C, D) + X[__k1[k*4+0]] + T[ii+0]), __s1[0]);
         D = A + ROTATE_LEFT((D + F(A, B, C) + X[__k1[k*4+1]] + T[ii+1]), __s1[1]);
         C = D + ROTATE_LEFT((C + F(D, A, B) + X[__k1[k*4+2]] + T[ii+2]), __s1[2]);
         B = C + ROTATE_LEFT((B + F(C, D, A) + X[__k1[k*4+3]] + T[ii+3]), __s1[3]);
         printf("X[..]: %u\n", __k1[k*4+0]);
         ii += 4;
      }
      // round two
      for(k = 0; k < 4; ++k)
      {
         A = B + ROTATE_LEFT((A + G(B, C, D) + X[__k2[k*4+0]] + T[ii+0]), __s2[0]);
         D = A + ROTATE_LEFT((D + G(A, B, C) + X[__k2[k*4+1]] + T[ii+1]), __s2[1]);
         C = D + ROTATE_LEFT((C + G(D, A, B) + X[__k2[k*4+2]] + T[ii+2]), __s2[2]);
         B = C + ROTATE_LEFT((B + G(C, D, A) + X[__k2[k*4+3]] + T[ii+3]), __s2[3]);
         ii += 4;
      }
      // round three
      for(k = 0; k < 4; ++k)
      {
         A = B + ROTATE_LEFT((A + H(B, C, D) + X[__k3[k*4+0]] + T[ii+0]), __s3[0]);
         D = A + ROTATE_LEFT((D + H(A, B, C) + X[__k3[k*4+1]] + T[ii+1]), __s3[1]);
         C = D + ROTATE_LEFT((C + H(D, A, B) + X[__k3[k*4+2]] + T[ii+2]), __s3[2]);
         B = C + ROTATE_LEFT((B + H(C, D, A) + X[__k3[k*4+3]] + T[ii+3]), __s3[3]);
         ii += 4;
      }
      // round four
      for(k = 0; k < 4; ++k)
      {
         A = B + ROTATE_LEFT((A + I(B, C, D) + X[__k4[k*4+0]] + T[ii+0]), __s4[0]);
         D = A + ROTATE_LEFT((D + I(A, B, C) + X[__k4[k*4+1]] + T[ii+1]), __s4[2]);
         C = D + ROTATE_LEFT((C + I(D, A, B) + X[__k4[k*4+2]] + T[ii+2]), __s4[3]);
         B = C + ROTATE_LEFT((B + I(C, D, A) + X[__k4[k*4+3]] + T[ii+3]), __s4[4]);
         ii += 4;
      }
      printf("----\n");
      printf("ii:%u\n", ii);
      AA = AA + A;
      BB = BB + B;
      CC = CC + C;
      DD = DD + D;
   }
   printf("digest: ");
   printf("%02x%02x%02x%02x", AA>>24, AA>>16&0xFF, AA>>8&0xFF, AA&0xFF);
   printf("%02x%02x%02x%02x", BB>>24, BB>>16&0xFF, BB>>8&0xFF, BB&0xFF);
   printf("%02x%02x%02x%02x", CC>>24, CC>>16&0xFF, CC>>8&0xFF, CC&0xFF);
   printf("%02x%02x%02x%02x", DD>>24, DD>>16&0xFF, DD>>8&0xFF, DD&0xFF);
   printf("\n");
   return;
}

int main(int argc, char *argv[])
{
   uint8_t *message = (uint8_t*)"abc";
   uint32_t T[64];
   md5_genT(&T[0]);
   md5_hash(message, strlen(message), &T[0]);
   return 1;
} 
it should make 900150983cd24fb0d6963f7d28e17f72, but it doesn't, i still dont understand a all md5 hashing, so i am having a hard time debugging this.

thx

Posted: Mon Jul 02, 2007 7:22 pm
by Kevin McGuire
I see it. Look at round four where it uses a index for __s4.

Here is a version that produces the hash as you have above.

Code: Select all

#include <stdlib.h>
#include <stdio.h>
#include <stdint.h>
#include <string.h>
#include <math.h>


#define ROTATE_LEFT(x, n) (((x) << (n)) | ((x) >> (32-(n))))
#define F(x,y,z) (((x) & (y)) | ((~(x)) & (z)))
#define G(x,y,z) (((x) & (z)) | ((y) & (~(z))))
#define H(x,y,z) ((x) ^ (y) ^ (z))
#define I(x,y,z) ((y) ^ ((x) | (~(z))))

void md5_genT(uint32_t *T)
{
   uint_fast32_t i;
   for(i = 0; i < 64; ++i)
   {
      T[i] = 4294967296.0 * fabs(sin((float)(i+1)));
      //printf("%08x\n", T[i]);
      //sleep(1);
   }
}
void md5_hash(uint8_t *message, uint_fast32_t mlength, uint32_t *T)
{
   uint8_t *nm, nmlength;
   uint32_t __s1[] = {7, 12, 17, 22};
   uint32_t __s2[] = {5, 9, 14, 20};
   uint32_t __s3[] = {4, 11, 16, 23};
   uint32_t __s4[] = {6, 10, 15, 21};
   uint32_t __k1[] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
   uint32_t __k2[] = {1, 6, 11, 0, 5, 10, 15, 4, 9, 14, 3, 8, 13, 2, 7, 12};
   uint32_t __k3[] = {5, 8, 11, 14, 1, 4, 7, 10, 13, 0, 3, 6, 9, 12, 15, 2};
   uint32_t __k4[] = {0, 7, 14, 5, 12, 3, 10, 1, 8, 15, 6, 13, 4, 11, 2, 9};
   uint32_t X[16];
   uint32_t i, j, ii, k;
   uint32_t AA, BB, CC, DD;
   uint32_t A, B, C, D;
   // do padding so message is a multiple of 512 bits (64 bytes)
   if((mlength%64) == 0)
   {
      nmlength = mlength + 64;
   }else{
      nmlength = mlength + (64 - (mlength%64));
   }
   nm = (uint8_t*)malloc(nmlength);
   memset(nm, 0, nmlength);
   memcpy(nm, message, mlength);
   printf("nmlength:%u mlength:%u\n", nmlength, mlength);
   nm[mlength] = 0x80;
   ((uint32_t*)&nm[nmlength - 8])[0] = mlength * 8;
   ((uint32_t*)&nm[nmlength - 8])[1] = 0;
   
   printf("nm[0]:%x\n", nmlength);
   // hash padded message (nmessage)
   AA = 0x67452301; BB = 0xefcdab89; CC = 0x98badcfe; DD = 0x10325476;
   for(i = 0; i < (nmlength/64); ++i)
   {
	printf("i:%x\n", i);
      // create table
      for(j = 0; j < 64; j += 4)
      {
         X[j/4] = nm[i*64+j+0] |
               (nm[i*64+j+1] << 8) |
               (nm[i*64+j+2] << 16) |
               (nm[i*64+j+3] << 24);
      }
      A = AA;
      B = BB;
      C = CC;
      D = DD;
      // round one
      ii = 0;
      for(k = 0; k < 4; ++k)
      {
	   printf("T[ii+0]:%x\n", T[ii+0]);
         A = B + ROTATE_LEFT((A + F(B, C, D) + X[__k1[k*4+0]] + T[ii+0]), __s1[0]);
         D = A + ROTATE_LEFT((D + F(A, B, C) + X[__k1[k*4+1]] + T[ii+1]), __s1[1]);
         C = D + ROTATE_LEFT((C + F(D, A, B) + X[__k1[k*4+2]] + T[ii+2]), __s1[2]);
         B = C + ROTATE_LEFT((B + F(C, D, A) + X[__k1[k*4+3]] + T[ii+3]), __s1[3]);
         ii += 4;
      }
      // round two
      for(k = 0; k < 4; ++k)
      {
         A = B + ROTATE_LEFT((A + G(B, C, D) + X[__k2[k*4+0]] + T[ii+0]), __s2[0]);
         D = A + ROTATE_LEFT((D + G(A, B, C) + X[__k2[k*4+1]] + T[ii+1]), __s2[1]);
         C = D + ROTATE_LEFT((C + G(D, A, B) + X[__k2[k*4+2]] + T[ii+2]), __s2[2]);
         B = C + ROTATE_LEFT((B + G(C, D, A) + X[__k2[k*4+3]] + T[ii+3]), __s2[3]);
         ii += 4;
      }
      // round three
      for(k = 0; k < 4; ++k)
      {
         A = B + ROTATE_LEFT((A + H(B, C, D) + X[__k3[k*4+0]] + T[ii+0]), __s3[0]);
         D = A + ROTATE_LEFT((D + H(A, B, C) + X[__k3[k*4+1]] + T[ii+1]), __s3[1]);
         C = D + ROTATE_LEFT((C + H(D, A, B) + X[__k3[k*4+2]] + T[ii+2]), __s3[2]);
         B = C + ROTATE_LEFT((B + H(C, D, A) + X[__k3[k*4+3]] + T[ii+3]), __s3[3]);
         ii += 4;
      }
      // round four
      for(k = 0; k < 4; ++k)
      {
		printf("__k4[k*4+0]:%u\n", __k4[k*4+0]);
         A = B + ROTATE_LEFT((A + I(B, C, D) + X[__k4[k*4+0]] + T[ii+0]), __s4[0]);
         D = A + ROTATE_LEFT((D + I(A, B, C) + X[__k4[k*4+1]] + T[ii+1]), __s4[1]);
         C = D + ROTATE_LEFT((C + I(D, A, B) + X[__k4[k*4+2]] + T[ii+2]), __s4[2]);
         B = C + ROTATE_LEFT((B + I(C, D, A) + X[__k4[k*4+3]] + T[ii+3]), __s4[3]);
         ii += 4;
      }
      printf("----\n");
      printf("ii:%u\n", ii);
      AA = AA + A;
      BB = BB + B;
      CC = CC + C;
      DD = DD + D;
   }
   printf("digest: ");
   printf("%02x%02x%02x%02x,", AA&0xFF, AA>>8&0xFF, AA>>16&0xFF, AA>>24&0xFF);
   printf("%02x%02x%02x%02x,", BB&0xFF, BB>>8&0xFF, BB>>16&0xFF, BB>>24&0xFF);
   printf("%02x%02x%02x%02x,", CC&0xFF, CC>>8&0xFF, CC>>16&0xFF, CC>>24&0xFF);
   printf("%02x%02x%02x%02x,", DD&0xFF, DD>>8&0xFF, DD>>16&0xFF, DD>>24&0xFF);
   printf("\n");
   return;
}

int main(int argc, char *argv[])
{
   uint8_t *message = (uint8_t*)"abc";
   uint32_t T[64];
   md5_genT(&T[0]);
   md5_hash(message, strlen(message), &T[0]);
   return 1;
} 

Posted: Mon Jul 02, 2007 7:47 pm
by Ninjarider
question not sure about something. in a for loop is it not suppose to be
for(i=0; i<62; i++)
{
}

you gave what it is suppose to print.
what is is printing

Posted: Mon Jul 02, 2007 8:11 pm
by Kevin McGuire
It is printing the least significant byte first, then the next - for A, B, C, and D.

Posted: Mon Jul 02, 2007 8:20 pm
by Ninjarider
so it is printing a, b, c, d when it needs to be d, c, b, a

Posted: Mon Jul 02, 2007 8:26 pm
by Kevin McGuire
I am lost.

Posted: Mon Jul 02, 2007 8:41 pm
by Ninjarider
It is printing the least significant byte first, then the next - for A, B, C, and D
is it printing the least significant (bit) or (byte).

ie. if a = 0x0fh, b = 0x0ch, c = 0xc0h, d= 0x00h

is it printing 0x00c00c0fh (d, c, b, a)

or is it printing 0xf0f33f00h (not (a, b, c, d))

Posted: Tue Jul 03, 2007 4:11 am
by Kevin McGuire
Yes.

a = 0x12345678;
printf("%x\n", a);
Will print 12345678.
printf("%x%x%x%x", a&0xff, a>>8&0xff, a>>16&0xff, a>>24&0xff);
Will print 78563412.


The least significant byte first.

Posted: Tue Jul 03, 2007 12:01 pm
by GLneo
well, it works!, but it is slow, about ~600,000 h/s ( other libs are ~1,200,000 h/s), but this is ok because it has the possibility to be faster than any general porpoise md5 lib. now what it needs is optimizations, first the T values could be pre-computed:

Code: Select all

#define T_MASK ((md5_word_t)~0)
#define T1 /* 0xd76aa478 */ (T_MASK ^ 0x28955b87)
#define T2 /* 0xe8c7b756 */ (T_MASK ^ 0x173848a9)
#define T3    0x242070db
#define T4 /* 0xc1bdceee */ (T_MASK ^ 0x3e423111)
#define T5 /* 0xf57c0faf */ (T_MASK ^ 0x0a83f050)
#define T6    0x4787c62a
#define T7 /* 0xa8304613 */ (T_MASK ^ 0x57cfb9ec)
#define T8 /* 0xfd469501 */ (T_MASK ^ 0x02b96afe)
#define T9    0x698098d8
#define T10 /* 0x8b44f7af */ (T_MASK ^ 0x74bb0850)
#define T11 /* 0xffff5bb1 */ (T_MASK ^ 0x0000a44e)
#define T12 /* 0x895cd7be */ (T_MASK ^ 0x76a32841)
#define T13    0x6b901122
#define T14 /* 0xfd987193 */ (T_MASK ^ 0x02678e6c)
#define T15 /* 0xa679438e */ (T_MASK ^ 0x5986bc71)
#define T16    0x49b40821
#define T17 /* 0xf61e2562 */ (T_MASK ^ 0x09e1da9d)
#define T18 /* 0xc040b340 */ (T_MASK ^ 0x3fbf4cbf)
#define T19    0x265e5a51
#define T20 /* 0xe9b6c7aa */ (T_MASK ^ 0x16493855)
#define T21 /* 0xd62f105d */ (T_MASK ^ 0x29d0efa2)
#define T22    0x02441453
#define T23 /* 0xd8a1e681 */ (T_MASK ^ 0x275e197e)
#define T24 /* 0xe7d3fbc8 */ (T_MASK ^ 0x182c0437)
#define T25    0x21e1cde6
#define T26 /* 0xc33707d6 */ (T_MASK ^ 0x3cc8f829)
#define T27 /* 0xf4d50d87 */ (T_MASK ^ 0x0b2af278)
#define T28    0x455a14ed
#define T29 /* 0xa9e3e905 */ (T_MASK ^ 0x561c16fa)
#define T30 /* 0xfcefa3f8 */ (T_MASK ^ 0x03105c07)
#define T31    0x676f02d9
#define T32 /* 0x8d2a4c8a */ (T_MASK ^ 0x72d5b375)
#define T33 /* 0xfffa3942 */ (T_MASK ^ 0x0005c6bd)
#define T34 /* 0x8771f681 */ (T_MASK ^ 0x788e097e)
#define T35    0x6d9d6122
#define T36 /* 0xfde5380c */ (T_MASK ^ 0x021ac7f3)
#define T37 /* 0xa4beea44 */ (T_MASK ^ 0x5b4115bb)
#define T38    0x4bdecfa9
#define T39 /* 0xf6bb4b60 */ (T_MASK ^ 0x0944b49f)
#define T40 /* 0xbebfbc70 */ (T_MASK ^ 0x4140438f)
#define T41    0x289b7ec6
#define T42 /* 0xeaa127fa */ (T_MASK ^ 0x155ed805)
#define T43 /* 0xd4ef3085 */ (T_MASK ^ 0x2b10cf7a)
#define T44    0x04881d05
#define T45 /* 0xd9d4d039 */ (T_MASK ^ 0x262b2fc6)
#define T46 /* 0xe6db99e5 */ (T_MASK ^ 0x1924661a)
#define T47    0x1fa27cf8
#define T48 /* 0xc4ac5665 */ (T_MASK ^ 0x3b53a99a)
#define T49 /* 0xf4292244 */ (T_MASK ^ 0x0bd6ddbb)
#define T50    0x432aff97
#define T51 /* 0xab9423a7 */ (T_MASK ^ 0x546bdc58)
#define T52 /* 0xfc93a039 */ (T_MASK ^ 0x036c5fc6)
#define T53    0x655b59c3
#define T54 /* 0x8f0ccc92 */ (T_MASK ^ 0x70f3336d)
#define T55 /* 0xffeff47d */ (T_MASK ^ 0x00100b82)
#define T56 /* 0x85845dd1 */ (T_MASK ^ 0x7a7ba22e)
#define T57    0x6fa87e4f
#define T58 /* 0xfe2ce6e0 */ (T_MASK ^ 0x01d3191f)
#define T59 /* 0xa3014314 */ (T_MASK ^ 0x5cfebceb)
#define T60    0x4e0811a1
#define T61 /* 0xf7537e82 */ (T_MASK ^ 0x08ac817d)
#define T62 /* 0xbd3af235 */ (T_MASK ^ 0x42c50dca)
#define T63    0x2ad7d2bb
#define T64 /* 0xeb86d391 */ (T_MASK ^ 0x14792c6e)
also the loops could be unrolled, hmmm... thats all i can think of. :roll:

Posted: Tue Jul 03, 2007 3:53 pm
by Kevin McGuire

Code: Select all

#define ROTATE_LEFT(x, n) (((x) << (n)) | ((x) >> (32-(n))))
#define F(x,y,z) (((x) & (y)) | ((~(x)) & (z)))
#define G(x,y,z) (((x) & (z)) | ((y) & (~(z))))
#define H(x,y,z) ((x) ^ (y) ^ (z))
#define I(x,y,z) ((y) ^ ((x) | (~(z))))
#define md5_s1_0 7
#define md5_s1_1 12
#define md5_s1_2 17
#define md5_s1_3 22
#define md5_s2_0 5
#define md5_s2_1 9 
#define md5_s2_2 14
#define md5_s2_3 20
#define md5_s3_0 4
#define md5_s3_1 11
#define md5_s3_2 16
#define md5_s3_3 23
#define md5_s4_0 6
#define md5_s4_1 10
#define md5_s4_2 15
#define md5_s4_3 21
void md5_hash(uint8_t *message, uint_fast32_t mlength)
{
	uint32_t 	*nm32;
	uint8_t	*nm8;
	uint32_t 	nmlength;
	uint32_t 	i, j;
	uint32_t 	AA, BB, CC, DD;
	uint32_t 	A, B, C, D;
	uint32_t 	X[16];
	// do padding so message is a multiple of 512 bits (64 bytes)
	if((mlength%64) == 0)
	{
		nmlength = mlength + 64;
	}else{
		nmlength = mlength + (64 - (mlength%64));
	}
	nm8 = (uint8_t*)malloc(nmlength);
	nm32 = (uint32_t*)nm8;
	// could gain speed doing this someother way.. (memset and memcpy)
	memset(nm8, 0, nmlength);
	memcpy(nm8, message, mlength);
	// add a bit one after message, pad remaining with zeros.
	nm8[mlength] = 0x80;
	// append length in bits to message (low word first)
	nm32[(nmlength/4) - 2] = mlength * 8;
	nm32[(nmlength/4) - 1] = 0;
	// hash padded message (nmessage)
	AA = 0x67452301; BB = 0xefcdab89; CC = 0x98badcfe; DD = 0x10325476;
	for(i = 0; i < (nmlength/64); ++i)
	{
		// move block from arbitrary memory location to stack. (could gain speed changing this)
		for(j = 0; j < 16; ++j)
		{
			X[j] = nm32[i*64+j];
		}
		A = AA;
		B = BB;
		C = CC;
		D = DD;
		/// round one (unrolled)
		A = B + ROTATE_LEFT((A + F(B, C, D) + X[0] + 0xd76aa478), md5_s1_0);
		D = A + ROTATE_LEFT((D + F(A, B, C) + X[1] + 0xe8c7b756), md5_s1_1);
		C = D + ROTATE_LEFT((C + F(D, A, B) + X[2] + 0x242070db), md5_s1_2);
		B = C + ROTATE_LEFT((B + F(C, D, A) + X[3] + 0xc1bdceee), md5_s1_3);
		A = B + ROTATE_LEFT((A + F(B, C, D) + X[4] + 0xf57c0faf), md5_s1_0);
		D = A + ROTATE_LEFT((D + F(A, B, C) + X[5] + 0x4787c62a), md5_s1_1);
		C = D + ROTATE_LEFT((C + F(D, A, B) + X[6] + 0xa8304613), md5_s1_2);
		B = C + ROTATE_LEFT((B + F(C, D, A) + X[7] + 0xfd469501), md5_s1_3);
		A = B + ROTATE_LEFT((A + F(B, C, D) + X[8] + 0x698098d8), md5_s1_0);
		D = A + ROTATE_LEFT((D + F(A, B, C) + X[9] + 0x8b44f7af), md5_s1_1);
		C = D + ROTATE_LEFT((C + F(D, A, B) + X[10] + 0xffff5bb1), md5_s1_2);
		B = C + ROTATE_LEFT((B + F(C, D, A) + X[11] + 0x895cd7be), md5_s1_3);
		A = B + ROTATE_LEFT((A + F(B, C, D) + X[12] + 0x6b901122), md5_s1_0);
		D = A + ROTATE_LEFT((D + F(A, B, C) + X[13] + 0xfd987193), md5_s1_1);
		C = D + ROTATE_LEFT((C + F(D, A, B) + X[14] + 0xa679438e), md5_s1_2);
		B = C + ROTATE_LEFT((B + F(C, D, A) + X[15] + 0x49b40821), md5_s1_3);
		/// round two (unrolled)
		A = B + ROTATE_LEFT((A + G(B, C, D) + X[1] + 0xf61e2562), md5_s2_0);
		D = A + ROTATE_LEFT((D + G(A, B, C) + X[6] + 0xc040b340), md5_s2_1);
		C = D + ROTATE_LEFT((C + G(D, A, B) + X[11] + 0x265e5a51), md5_s2_2);
		B = C + ROTATE_LEFT((B + G(C, D, A) + X[0] + 0xe9b6c7aa), md5_s2_3);
		A = B + ROTATE_LEFT((A + G(B, C, D) + X[5] + 0xd62f105d), md5_s2_0);
		D = A + ROTATE_LEFT((D + G(A, B, C) + X[10] + 0x02441453), md5_s2_1);
		C = D + ROTATE_LEFT((C + G(D, A, B) + X[15] + 0xd8a1e681), md5_s2_2);
		B = C + ROTATE_LEFT((B + G(C, D, A) + X[4] + 0xe7d3fbc8), md5_s2_3);
		A = B + ROTATE_LEFT((A + G(B, C, D) + X[9] + 0x21e1cde6), md5_s2_0);
		D = A + ROTATE_LEFT((D + G(A, B, C) + X[14] + 0xc33707d6), md5_s2_1);
		C = D + ROTATE_LEFT((C + G(D, A, B) + X[3] + 0xf4d50d87), md5_s2_2);
		B = C + ROTATE_LEFT((B + G(C, D, A) + X[8] + 0x455a14ed), md5_s2_3);
		A = B + ROTATE_LEFT((A + G(B, C, D) + X[13] + 0xa9e3e905), md5_s2_0);
		D = A + ROTATE_LEFT((D + G(A, B, C) + X[2] + 0xfcefa3f8), md5_s2_1);
		C = D + ROTATE_LEFT((C + G(D, A, B) + X[7] + 0x676f02d9), md5_s2_2);
		B = C + ROTATE_LEFT((B + G(C, D, A) + X[12] + 0x8d2a4c8a), md5_s2_3);
		/// round three (unrolled)
		A = B + ROTATE_LEFT((A + H(B, C, D) + X[5] + 0xfffa3942), md5_s3_0);
		D = A + ROTATE_LEFT((D + H(A, B, C) + X[8] + 0x8771f681), md5_s3_1);
		C = D + ROTATE_LEFT((C + H(D, A, B) + X[11] + 0x6d9d6122), md5_s3_2);
		B = C + ROTATE_LEFT((B + H(C, D, A) + X[14] + 0xfde5380c), md5_s3_3);
		A = B + ROTATE_LEFT((A + H(B, C, D) + X[1] + 0xa4beea44), md5_s3_0);
		D = A + ROTATE_LEFT((D + H(A, B, C) + X[4] + 0x4bdecfa9), md5_s3_1);
		C = D + ROTATE_LEFT((C + H(D, A, B) + X[7] + 0xf6bb4b60), md5_s3_2);
		B = C + ROTATE_LEFT((B + H(C, D, A) + X[10] + 0xbebfbc70), md5_s3_3);
		A = B + ROTATE_LEFT((A + H(B, C, D) + X[13] + 0x289b7ec6), md5_s3_0);
		D = A + ROTATE_LEFT((D + H(A, B, C) + X[0] + 0xeaa127fa), md5_s3_1);
		C = D + ROTATE_LEFT((C + H(D, A, B) + X[3] + 0xd4ef3085), md5_s3_2);
		B = C + ROTATE_LEFT((B + H(C, D, A) + X[6] + 0x04881d05), md5_s3_3);
		A = B + ROTATE_LEFT((A + H(B, C, D) + X[9] + 0xd9d4d039), md5_s3_0);
		D = A + ROTATE_LEFT((D + H(A, B, C) + X[12] + 0xe6db99e5), md5_s3_1);
		C = D + ROTATE_LEFT((C + H(D, A, B) + X[15] + 0x1fa27cf8), md5_s3_2);
		B = C + ROTATE_LEFT((B + H(C, D, A) + X[2] + 0xc4ac5665), md5_s3_3);
		/// round four (unrolled)
		A = B + ROTATE_LEFT((A + I(B, C, D) + X[0] + 0xf4292244), md5_s4_0);
		D = A + ROTATE_LEFT((D + I(A, B, C) + X[7] + 0x432aff97), md5_s4_1);
		C = D + ROTATE_LEFT((C + I(D, A, B) + X[14] + 0xab9423a7), md5_s4_2);
		B = C + ROTATE_LEFT((B + I(C, D, A) + X[5] + 0xfc93a039), md5_s4_3);
		A = B + ROTATE_LEFT((A + I(B, C, D) + X[12] + 0x655b59c3), md5_s4_0);
		D = A + ROTATE_LEFT((D + I(A, B, C) + X[3] + 0x8f0ccc92), md5_s4_1);
		C = D + ROTATE_LEFT((C + I(D, A, B) + X[10] + 0xffeff47d), md5_s4_2);
		B = C + ROTATE_LEFT((B + I(C, D, A) + X[1] + 0x85845dd1), md5_s4_3);
		A = B + ROTATE_LEFT((A + I(B, C, D) + X[8] + 0x6fa87e4f), md5_s4_0);
		D = A + ROTATE_LEFT((D + I(A, B, C) + X[15] + 0xfe2ce6e0), md5_s4_1);
		C = D + ROTATE_LEFT((C + I(D, A, B) + X[6] + 0xa3014314), md5_s4_2);
		B = C + ROTATE_LEFT((B + I(C, D, A) + X[13] + 0x4e0811a1), md5_s4_3);
		A = B + ROTATE_LEFT((A + I(B, C, D) + X[4] + 0xf7537e82), md5_s4_0);
		D = A + ROTATE_LEFT((D + I(A, B, C) + X[11] + 0xbd3af235), md5_s4_1);
		C = D + ROTATE_LEFT((C + I(D, A, B) + X[2] + 0x2ad7d2bb), md5_s4_2);
		B = C + ROTATE_LEFT((B + I(C, D, A) + X[9] + 0xeb86d391), md5_s4_3);
		AA = AA + A;
		BB = BB + B;
		CC = CC + C;
		DD = DD + D;
	}
	// release allocated memory.
	free(nm8);

	/// (REMOVE US PRINT FUNCTIONS DURING SPEED TESTING)
	printf("digest: ");
	printf("%02x%02x%02x%02x,", AA&0xFF, AA>>8&0xFF, AA>>16&0xFF, AA>>24&0xFF);
	printf("%02x%02x%02x%02x,", BB&0xFF, BB>>8&0xFF, BB>>16&0xFF, BB>>24&0xFF);
	printf("%02x%02x%02x%02x,", CC&0xFF, CC>>8&0xFF, CC>>16&0xFF, CC>>24&0xFF);
	printf("%02x%02x%02x%02x,", DD&0xFF, DD>>8&0xFF, DD>>16&0xFF, DD>>24&0xFF);
	printf("\n");
	return;
}

int main(int argc, char *argv[])
{
   uint8_t *message = (uint8_t*)"abc";
   md5_hash(message, strlen(message));
   return 1;
} 

Posted: Tue Jul 03, 2007 4:32 pm
by GLneo
hay! look at that were in the millions again ~1,119,000 h/s! heres what it looks like:

Code: Select all

#include "md5lib.h"
#include <stdlib.h>
#include <stdio.h>
#include <stdint.h>
#include <string.h>
#include <math.h>

int md5_hash(uint8_t *message, uint_fast32_t mlength, unsigned char input[16])
{
   uint32_t    *nm32;
   uint8_t   *nm8;
   uint32_t    nmlength;
   uint32_t    i, j;
   uint32_t    AA, BB, CC, DD;
   uint32_t    A, B, C, D;
   uint32_t    X[16];
   // do padding so message is a multiple of 512 bits (64 bytes)
   if((mlength%64) == 0)
   {
      nmlength = mlength + 64;
   }else{
      nmlength = mlength + (64 - (mlength%64));
   }
   nm8 = (uint8_t*)malloc(nmlength);
   nm32 = (uint32_t*)nm8;
   // could gain speed doing this someother way.. (memset and memcpy)
   memset(nm8, 0, nmlength);
   memcpy(nm8, message, mlength);
   // add a bit one after message, pad remaining with zeros.
   nm8[mlength] = 0x80;
   // append length in bits to message (low word first)
   nm32[(nmlength/4) - 2] = mlength * 8;
   nm32[(nmlength/4) - 1] = 0;
   // hash padded message (nmessage)
   AA = 0x67452301; BB = 0xefcdab89; CC = 0x98badcfe; DD = 0x10325476;
   for(i = 0; i < (nmlength/64); ++i)
   {
      // move block from arbitrary memory location to stack. (could gain speed changing this)
      for(j = 0; j < 16; ++j)
      {
         X[j] = nm32[i*64+j];
      }
      A = AA;
      B = BB;
      C = CC;
      D = DD;
      /// round one (unrolled)
      A = B + ROTATE_LEFT((A + F(B, C, D) + X[0] + 0xd76aa478), md5_s1_0);
      D = A + ROTATE_LEFT((D + F(A, B, C) + X[1] + 0xe8c7b756), md5_s1_1);
      C = D + ROTATE_LEFT((C + F(D, A, B) + X[2] + 0x242070db), md5_s1_2);
      B = C + ROTATE_LEFT((B + F(C, D, A) + X[3] + 0xc1bdceee), md5_s1_3);
      A = B + ROTATE_LEFT((A + F(B, C, D) + X[4] + 0xf57c0faf), md5_s1_0);
      D = A + ROTATE_LEFT((D + F(A, B, C) + X[5] + 0x4787c62a), md5_s1_1);
      C = D + ROTATE_LEFT((C + F(D, A, B) + X[6] + 0xa8304613), md5_s1_2);
      B = C + ROTATE_LEFT((B + F(C, D, A) + X[7] + 0xfd469501), md5_s1_3);
      A = B + ROTATE_LEFT((A + F(B, C, D) + X[8] + 0x698098d8), md5_s1_0);
      D = A + ROTATE_LEFT((D + F(A, B, C) + X[9] + 0x8b44f7af), md5_s1_1);
      C = D + ROTATE_LEFT((C + F(D, A, B) + X[10] + 0xffff5bb1), md5_s1_2);
      B = C + ROTATE_LEFT((B + F(C, D, A) + X[11] + 0x895cd7be), md5_s1_3);
      A = B + ROTATE_LEFT((A + F(B, C, D) + X[12] + 0x6b901122), md5_s1_0);
      D = A + ROTATE_LEFT((D + F(A, B, C) + X[13] + 0xfd987193), md5_s1_1);
      C = D + ROTATE_LEFT((C + F(D, A, B) + X[14] + 0xa679438e), md5_s1_2);
      B = C + ROTATE_LEFT((B + F(C, D, A) + X[15] + 0x49b40821), md5_s1_3);
      /// round two (unrolled)
      A = B + ROTATE_LEFT((A + G(B, C, D) + X[1] + 0xf61e2562), md5_s2_0);
      D = A + ROTATE_LEFT((D + G(A, B, C) + X[6] + 0xc040b340), md5_s2_1);
      C = D + ROTATE_LEFT((C + G(D, A, B) + X[11] + 0x265e5a51), md5_s2_2);
      B = C + ROTATE_LEFT((B + G(C, D, A) + X[0] + 0xe9b6c7aa), md5_s2_3);
      A = B + ROTATE_LEFT((A + G(B, C, D) + X[5] + 0xd62f105d), md5_s2_0);
      D = A + ROTATE_LEFT((D + G(A, B, C) + X[10] + 0x02441453), md5_s2_1);
      C = D + ROTATE_LEFT((C + G(D, A, B) + X[15] + 0xd8a1e681), md5_s2_2);
      B = C + ROTATE_LEFT((B + G(C, D, A) + X[4] + 0xe7d3fbc8), md5_s2_3);
      A = B + ROTATE_LEFT((A + G(B, C, D) + X[9] + 0x21e1cde6), md5_s2_0);
      D = A + ROTATE_LEFT((D + G(A, B, C) + X[14] + 0xc33707d6), md5_s2_1);
      C = D + ROTATE_LEFT((C + G(D, A, B) + X[3] + 0xf4d50d87), md5_s2_2);
      B = C + ROTATE_LEFT((B + G(C, D, A) + X[8] + 0x455a14ed), md5_s2_3);
      A = B + ROTATE_LEFT((A + G(B, C, D) + X[13] + 0xa9e3e905), md5_s2_0);
      D = A + ROTATE_LEFT((D + G(A, B, C) + X[2] + 0xfcefa3f8), md5_s2_1);
      C = D + ROTATE_LEFT((C + G(D, A, B) + X[7] + 0x676f02d9), md5_s2_2);
      B = C + ROTATE_LEFT((B + G(C, D, A) + X[12] + 0x8d2a4c8a), md5_s2_3);
      /// round three (unrolled)
      A = B + ROTATE_LEFT((A + H(B, C, D) + X[5] + 0xfffa3942), md5_s3_0);
      D = A + ROTATE_LEFT((D + H(A, B, C) + X[8] + 0x8771f681), md5_s3_1);
      C = D + ROTATE_LEFT((C + H(D, A, B) + X[11] + 0x6d9d6122), md5_s3_2);
      B = C + ROTATE_LEFT((B + H(C, D, A) + X[14] + 0xfde5380c), md5_s3_3);
      A = B + ROTATE_LEFT((A + H(B, C, D) + X[1] + 0xa4beea44), md5_s3_0);
      D = A + ROTATE_LEFT((D + H(A, B, C) + X[4] + 0x4bdecfa9), md5_s3_1);
      C = D + ROTATE_LEFT((C + H(D, A, B) + X[7] + 0xf6bb4b60), md5_s3_2);
      B = C + ROTATE_LEFT((B + H(C, D, A) + X[10] + 0xbebfbc70), md5_s3_3);
      A = B + ROTATE_LEFT((A + H(B, C, D) + X[13] + 0x289b7ec6), md5_s3_0);
      D = A + ROTATE_LEFT((D + H(A, B, C) + X[0] + 0xeaa127fa), md5_s3_1);
      C = D + ROTATE_LEFT((C + H(D, A, B) + X[3] + 0xd4ef3085), md5_s3_2);
      B = C + ROTATE_LEFT((B + H(C, D, A) + X[6] + 0x04881d05), md5_s3_3);
      A = B + ROTATE_LEFT((A + H(B, C, D) + X[9] + 0xd9d4d039), md5_s3_0);
      D = A + ROTATE_LEFT((D + H(A, B, C) + X[12] + 0xe6db99e5), md5_s3_1);
      C = D + ROTATE_LEFT((C + H(D, A, B) + X[15] + 0x1fa27cf8), md5_s3_2);
      B = C + ROTATE_LEFT((B + H(C, D, A) + X[2] + 0xc4ac5665), md5_s3_3);
      /// round four (unrolled)
      A = B + ROTATE_LEFT((A + I(B, C, D) + X[0] + 0xf4292244), md5_s4_0);
      D = A + ROTATE_LEFT((D + I(A, B, C) + X[7] + 0x432aff97), md5_s4_1);
      C = D + ROTATE_LEFT((C + I(D, A, B) + X[14] + 0xab9423a7), md5_s4_2);
      B = C + ROTATE_LEFT((B + I(C, D, A) + X[5] + 0xfc93a039), md5_s4_3);
      A = B + ROTATE_LEFT((A + I(B, C, D) + X[12] + 0x655b59c3), md5_s4_0);
      D = A + ROTATE_LEFT((D + I(A, B, C) + X[3] + 0x8f0ccc92), md5_s4_1);
      C = D + ROTATE_LEFT((C + I(D, A, B) + X[10] + 0xffeff47d), md5_s4_2);
      B = C + ROTATE_LEFT((B + I(C, D, A) + X[1] + 0x85845dd1), md5_s4_3);
      A = B + ROTATE_LEFT((A + I(B, C, D) + X[8] + 0x6fa87e4f), md5_s4_0);
      D = A + ROTATE_LEFT((D + I(A, B, C) + X[15] + 0xfe2ce6e0), md5_s4_1);
      C = D + ROTATE_LEFT((C + I(D, A, B) + X[6] + 0xa3014314), md5_s4_2);
      B = C + ROTATE_LEFT((B + I(C, D, A) + X[13] + 0x4e0811a1), md5_s4_3);
      A = B + ROTATE_LEFT((A + I(B, C, D) + X[4] + 0xf7537e82), md5_s4_0);
      D = A + ROTATE_LEFT((D + I(A, B, C) + X[11] + 0xbd3af235), md5_s4_1);
      C = D + ROTATE_LEFT((C + I(D, A, B) + X[2] + 0x2ad7d2bb), md5_s4_2);
      B = C + ROTATE_LEFT((B + I(C, D, A) + X[9] + 0xeb86d391), md5_s4_3);
      AA = AA + A;
      BB = BB + B;
      CC = CC + C;
      DD = DD + D;
   }
   // release allocated memory.
   free(nm8);

   /// (REMOVE US PRINT FUNCTIONS DURING SPEED TESTING)
   if((*(unsigned long *)(input) == (AA)) &&
   (*(unsigned long *)(input+4) == (BB)) &&
   (*(unsigned long *)(input+8) == (CC)) &&
   (*(unsigned long *)(input+12) == (DD)))
       return 1;
   return 0;
} 

( the defines are in the header, and the function now returns true if "input = md5(message)" ) hmm.. maybe getting rid of the system calls could help, then the biggest one: ASM! thought that will be a pain :P

p.s. i have a stupid question, my CPU is 1.4GHz and if this program calculates 1M h/s, doesn't that mean it sums an md5 in just 1000 instruction?

Posted: Tue Jul 03, 2007 5:43 pm
by Kevin McGuire
Nope, all instructions are not one cycle on a CISC processor.

Did you try compiling it with -O2, -Os, or -O3 and also -march i686?

Posted: Tue Jul 03, 2007 6:21 pm
by Kevin McGuire
Give me a little bit and I will tell you how to split the sequence among processors, and use the POSIX threading library.

I am having a little trouble with my math at the moment.

a^b = c

Where I need to find a. I can find b using c^(1/a). I am sure it is on the internet or in a textbook somewhere but I am trying to exercise my brain on it. I will get it before long.

Posted: Tue Jul 03, 2007 6:36 pm
by GLneo
a = b ^ c