diff --git a/aes.c b/aes.c
index 462cd44..4a8d660 100644
--- a/aes.c
+++ b/aes.c
@@ -1,4 +1,4 @@
-/* AES implementation by Tom St Denis 
+/* AES implementation by Tom St Denis
  *
  * Derived from the Public Domain source code by
  
@@ -48,7 +48,7 @@ const struct _cipher_descriptor aes_desc =
 int rijndael_setup(const unsigned char *key, int keylen, int rounds, symmetric_key *skey)
 {
     int i = 0, j;
-    unsigned long temp, *rk, *rrk;
+    ulong32 temp, *rk, *rrk;
     
     _ARGCHK(key != NULL);
     _ARGCHK(skey != NULL);
@@ -235,7 +235,7 @@ int rijndael_setup(const unsigned char *key, int keylen, int rounds, symmetric_k
 
 void rijndael_ecb_encrypt(const unsigned char *pt, unsigned char *ct, symmetric_key *skey) 
 {
-    unsigned long s0, s1, s2, s3, t0, t1, t2, t3, *rk;
+    ulong32 s0, s1, s2, s3, t0, t1, t2, t3, *rk;
     int Nr, r;
    
     _ARGCHK(pt != NULL);
@@ -261,13 +261,6 @@ void rijndael_ecb_encrypt(const unsigned char *pt, unsigned char *ct, symmetric_
     for (;;) {
 
 /* Both of these blocks are equivalent except the top is more friendlier for x86 processors */
-#if defined(__GNUC__)
-        t0 = rk[4]; t1 = rk[5]; t2 = rk[6]; t3 = rk[7];
-        t1 ^= Te3[byte(s0, 0)]; t2 ^= Te2[byte(s0, 1)]; t3 ^= Te1[byte(s0, 2)]; t0 ^= Te0[byte(s0, 3)];
-        t2 ^= Te3[byte(s1, 0)]; t3 ^= Te2[byte(s1, 1)]; t0 ^= Te1[byte(s1, 2)]; t1 ^= Te0[byte(s1, 3)];
-        t3 ^= Te3[byte(s2, 0)]; t0 ^= Te2[byte(s2, 1)]; t1 ^= Te1[byte(s2, 2)]; t2 ^= Te0[byte(s2, 3)];
-        t0 ^= Te3[byte(s3, 0)]; t1 ^= Te2[byte(s3, 1)]; t2 ^= Te1[byte(s3, 2)]; t3 ^= Te0[byte(s3, 3)];
-#else
         t0 =
             Te0[byte(s0, 3)] ^
             Te1[byte(s1, 2)] ^
@@ -292,21 +285,12 @@ void rijndael_ecb_encrypt(const unsigned char *pt, unsigned char *ct, symmetric_
             Te2[byte(s1, 1)] ^
             Te3[byte(s2, 0)] ^
             rk[7];
-#endif
-       
+
         rk += 8;
         if (--r == 0) {
             break;
         }
-        
-/* this second half optimization actually makes it slower on the Athlon, use with caution. */
-#if 0
-        s1 = rk[1]; s2 = rk[2]; s3 = rk[3]; s0 = rk[0]; 
-        s1 ^= Te3[byte(t0, 0)]; s2 ^= Te2[byte(t0, 1)]; s3 ^= Te1[byte(t0, 2)]; s0 ^= Te0[byte(t0, 3)];
-        s2 ^= Te3[byte(t1, 0)]; s3 ^= Te2[byte(t1, 1)]; s0 ^= Te1[byte(t1, 2)]; s1 ^= Te0[byte(t1, 3)];
-        s3 ^= Te3[byte(t2, 0)]; s0 ^= Te2[byte(t2, 1)]; s1 ^= Te1[byte(t2, 2)]; s2 ^= Te0[byte(t2, 3)];
-        s0 ^= Te3[byte(t3, 0)]; s1 ^= Te2[byte(t3, 1)]; s2 ^= Te1[byte(t3, 2)]; s3 ^= Te0[byte(t3, 3)];
-#else
+
         s0 =
             Te0[byte(t0, 3)] ^
             Te1[byte(t1, 2)] ^
@@ -331,7 +315,6 @@ void rijndael_ecb_encrypt(const unsigned char *pt, unsigned char *ct, symmetric_
             Te2[byte(t1, 1)] ^
             Te3[byte(t2, 0)] ^
             rk[3];
-#endif            
     }
     /*
      * apply last round and
@@ -368,7 +351,7 @@ void rijndael_ecb_encrypt(const unsigned char *pt, unsigned char *ct, symmetric_
 }
 
 void rijndael_ecb_decrypt(const unsigned char *ct, unsigned char *pt, symmetric_key *skey) {
-    unsigned long s0, s1, s2, s3, t0, t1, t2, t3, *rk;
+    ulong32 s0, s1, s2, s3, t0, t1, t2, t3, *rk;
     int Nr, r;
 
     _ARGCHK(pt != NULL);
diff --git a/aes_tab.c b/aes_tab.c
index 78860c3..03a81ef 100644
--- a/aes_tab.c
+++ b/aes_tab.c
@@ -13,7 +13,7 @@ Td3[x] = Si[x].[09, 0d, 0b, 0e];
 Td4[x] = Si[x].[01, 01, 01, 01];
 */
 
-static const unsigned long Te0[256] = {
+static const ulong32 Te0[256] = {
     0xc66363a5UL, 0xf87c7c84UL, 0xee777799UL, 0xf67b7b8dUL,
     0xfff2f20dUL, 0xd66b6bbdUL, 0xde6f6fb1UL, 0x91c5c554UL,
     0x60303050UL, 0x02010103UL, 0xce6767a9UL, 0x562b2b7dUL,
@@ -79,7 +79,7 @@ static const unsigned long Te0[256] = {
     0x824141c3UL, 0x299999b0UL, 0x5a2d2d77UL, 0x1e0f0f11UL,
     0x7bb0b0cbUL, 0xa85454fcUL, 0x6dbbbbd6UL, 0x2c16163aUL,
 };
-static const unsigned long Te1[256] = {
+static const ulong32 Te1[256] = {
     0xa5c66363UL, 0x84f87c7cUL, 0x99ee7777UL, 0x8df67b7bUL,
     0x0dfff2f2UL, 0xbdd66b6bUL, 0xb1de6f6fUL, 0x5491c5c5UL,
     0x50603030UL, 0x03020101UL, 0xa9ce6767UL, 0x7d562b2bUL,
@@ -145,7 +145,7 @@ static const unsigned long Te1[256] = {
     0xc3824141UL, 0xb0299999UL, 0x775a2d2dUL, 0x111e0f0fUL,
     0xcb7bb0b0UL, 0xfca85454UL, 0xd66dbbbbUL, 0x3a2c1616UL,
 };
-static const unsigned long Te2[256] = {
+static const ulong32 Te2[256] = {
     0x63a5c663UL, 0x7c84f87cUL, 0x7799ee77UL, 0x7b8df67bUL,
     0xf20dfff2UL, 0x6bbdd66bUL, 0x6fb1de6fUL, 0xc55491c5UL,
     0x30506030UL, 0x01030201UL, 0x67a9ce67UL, 0x2b7d562bUL,
@@ -211,7 +211,7 @@ static const unsigned long Te2[256] = {
     0x41c38241UL, 0x99b02999UL, 0x2d775a2dUL, 0x0f111e0fUL,
     0xb0cb7bb0UL, 0x54fca854UL, 0xbbd66dbbUL, 0x163a2c16UL,
 };
-static const unsigned long Te3[256] = {
+static const ulong32 Te3[256] = {
 
     0x6363a5c6UL, 0x7c7c84f8UL, 0x777799eeUL, 0x7b7b8df6UL,
     0xf2f20dffUL, 0x6b6bbdd6UL, 0x6f6fb1deUL, 0xc5c55491UL,
@@ -278,7 +278,7 @@ static const unsigned long Te3[256] = {
     0x4141c382UL, 0x9999b029UL, 0x2d2d775aUL, 0x0f0f111eUL,
     0xb0b0cb7bUL, 0x5454fca8UL, 0xbbbbd66dUL, 0x16163a2cUL,
 };
-static const unsigned long Te4[256] = {
+static const ulong32 Te4[256] = {
     0x63636363UL, 0x7c7c7c7cUL, 0x77777777UL, 0x7b7b7b7bUL,
     0xf2f2f2f2UL, 0x6b6b6b6bUL, 0x6f6f6f6fUL, 0xc5c5c5c5UL,
     0x30303030UL, 0x01010101UL, 0x67676767UL, 0x2b2b2b2bUL,
@@ -354,7 +354,7 @@ static const unsigned long Te4[256] = {
 
 #else
 
-static const unsigned long Te4_0[] = {
+static const ulong32 Te4_0[] = {
 0x00000063UL, 0x0000007cUL, 0x00000077UL, 0x0000007bUL, 0x000000f2UL, 0x0000006bUL, 0x0000006fUL, 0x000000c5UL, 
 0x00000030UL, 0x00000001UL, 0x00000067UL, 0x0000002bUL, 0x000000feUL, 0x000000d7UL, 0x000000abUL, 0x00000076UL, 
 0x000000caUL, 0x00000082UL, 0x000000c9UL, 0x0000007dUL, 0x000000faUL, 0x00000059UL, 0x00000047UL, 0x000000f0UL, 
@@ -389,7 +389,7 @@ static const unsigned long Te4_0[] = {
 0x00000041UL, 0x00000099UL, 0x0000002dUL, 0x0000000fUL, 0x000000b0UL, 0x00000054UL, 0x000000bbUL, 0x00000016UL
 };
 
-static const unsigned long Te4_1[] = {
+static const ulong32 Te4_1[] = {
 0x00006300UL, 0x00007c00UL, 0x00007700UL, 0x00007b00UL, 0x0000f200UL, 0x00006b00UL, 0x00006f00UL, 0x0000c500UL, 
 0x00003000UL, 0x00000100UL, 0x00006700UL, 0x00002b00UL, 0x0000fe00UL, 0x0000d700UL, 0x0000ab00UL, 0x00007600UL, 
 0x0000ca00UL, 0x00008200UL, 0x0000c900UL, 0x00007d00UL, 0x0000fa00UL, 0x00005900UL, 0x00004700UL, 0x0000f000UL, 
@@ -424,7 +424,7 @@ static const unsigned long Te4_1[] = {
 0x00004100UL, 0x00009900UL, 0x00002d00UL, 0x00000f00UL, 0x0000b000UL, 0x00005400UL, 0x0000bb00UL, 0x00001600UL
 };
 
-static const unsigned long Te4_2[] = {
+static const ulong32 Te4_2[] = {
 0x00630000UL, 0x007c0000UL, 0x00770000UL, 0x007b0000UL, 0x00f20000UL, 0x006b0000UL, 0x006f0000UL, 0x00c50000UL, 
 0x00300000UL, 0x00010000UL, 0x00670000UL, 0x002b0000UL, 0x00fe0000UL, 0x00d70000UL, 0x00ab0000UL, 0x00760000UL, 
 0x00ca0000UL, 0x00820000UL, 0x00c90000UL, 0x007d0000UL, 0x00fa0000UL, 0x00590000UL, 0x00470000UL, 0x00f00000UL, 
@@ -459,7 +459,7 @@ static const unsigned long Te4_2[] = {
 0x00410000UL, 0x00990000UL, 0x002d0000UL, 0x000f0000UL, 0x00b00000UL, 0x00540000UL, 0x00bb0000UL, 0x00160000UL
 };
 
-static const unsigned long Te4_3[] = {
+static const ulong32 Te4_3[] = {
 0x63000000UL, 0x7c000000UL, 0x77000000UL, 0x7b000000UL, 0xf2000000UL, 0x6b000000UL, 0x6f000000UL, 0xc5000000UL, 
 0x30000000UL, 0x01000000UL, 0x67000000UL, 0x2b000000UL, 0xfe000000UL, 0xd7000000UL, 0xab000000UL, 0x76000000UL, 
 0xca000000UL, 0x82000000UL, 0xc9000000UL, 0x7d000000UL, 0xfa000000UL, 0x59000000UL, 0x47000000UL, 0xf0000000UL, 
@@ -496,7 +496,7 @@ static const unsigned long Te4_3[] = {
 
 #endif
 
-static const unsigned long Td0[256] = {
+static const ulong32 Td0[256] = {
     0x51f4a750UL, 0x7e416553UL, 0x1a17a4c3UL, 0x3a275e96UL,
     0x3bab6bcbUL, 0x1f9d45f1UL, 0xacfa58abUL, 0x4be30393UL,
     0x2030fa55UL, 0xad766df6UL, 0x88cc7691UL, 0xf5024c25UL,
@@ -562,7 +562,7 @@ static const unsigned long Td0[256] = {
     0x39a80171UL, 0x080cb3deUL, 0xd8b4e49cUL, 0x6456c190UL,
     0x7bcb8461UL, 0xd532b670UL, 0x486c5c74UL, 0xd0b85742UL,
 };
-static const unsigned long Td1[256] = {
+static const ulong32 Td1[256] = {
     0x5051f4a7UL, 0x537e4165UL, 0xc31a17a4UL, 0x963a275eUL,
     0xcb3bab6bUL, 0xf11f9d45UL, 0xabacfa58UL, 0x934be303UL,
     0x552030faUL, 0xf6ad766dUL, 0x9188cc76UL, 0x25f5024cUL,
@@ -628,7 +628,7 @@ static const unsigned long Td1[256] = {
     0x7139a801UL, 0xde080cb3UL, 0x9cd8b4e4UL, 0x906456c1UL,
     0x617bcb84UL, 0x70d532b6UL, 0x74486c5cUL, 0x42d0b857UL,
 };
-static const unsigned long Td2[256] = {
+static const ulong32 Td2[256] = {
     0xa75051f4UL, 0x65537e41UL, 0xa4c31a17UL, 0x5e963a27UL,
     0x6bcb3babUL, 0x45f11f9dUL, 0x58abacfaUL, 0x03934be3UL,
     0xfa552030UL, 0x6df6ad76UL, 0x769188ccUL, 0x4c25f502UL,
@@ -695,7 +695,7 @@ static const unsigned long Td2[256] = {
     0x017139a8UL, 0xb3de080cUL, 0xe49cd8b4UL, 0xc1906456UL,
     0x84617bcbUL, 0xb670d532UL, 0x5c74486cUL, 0x5742d0b8UL,
 };
-static const unsigned long Td3[256] = {
+static const ulong32 Td3[256] = {
     0xf4a75051UL, 0x4165537eUL, 0x17a4c31aUL, 0x275e963aUL,
     0xab6bcb3bUL, 0x9d45f11fUL, 0xfa58abacUL, 0xe303934bUL,
     0x30fa5520UL, 0x766df6adUL, 0xcc769188UL, 0x024c25f5UL,
@@ -761,7 +761,7 @@ static const unsigned long Td3[256] = {
     0xa8017139UL, 0x0cb3de08UL, 0xb4e49cd8UL, 0x56c19064UL,
     0xcb84617bUL, 0x32b670d5UL, 0x6c5c7448UL, 0xb85742d0UL,
 };
-static const unsigned long Td4[256] = {
+static const ulong32 Td4[256] = {
     0x52525252UL, 0x09090909UL, 0x6a6a6a6aUL, 0xd5d5d5d5UL,
     0x30303030UL, 0x36363636UL, 0xa5a5a5a5UL, 0x38383838UL,
     0xbfbfbfbfUL, 0x40404040UL, 0xa3a3a3a3UL, 0x9e9e9e9eUL,
@@ -827,14 +827,14 @@ static const unsigned long Td4[256] = {
     0xe1e1e1e1UL, 0x69696969UL, 0x14141414UL, 0x63636363UL,
     0x55555555UL, 0x21212121UL, 0x0c0c0c0cUL, 0x7d7d7d7dUL,
 };
-static const unsigned long rcon[] = {
+static const ulong32 rcon[] = {
     0x01000000UL, 0x02000000UL, 0x04000000UL, 0x08000000UL,
     0x10000000UL, 0x20000000UL, 0x40000000UL, 0x80000000UL,
     0x1B000000UL, 0x36000000UL, /* for 128-bit blocks, Rijndael never uses more than 10 rcon values */
 };
 
 #ifndef SMALL_CODE
-static const unsigned long Tks0[] = {
+static const ulong32 Tks0[] = {
 0x00000000UL, 0x0e090d0bUL, 0x1c121a16UL, 0x121b171dUL, 0x3824342cUL, 0x362d3927UL, 0x24362e3aUL, 0x2a3f2331UL, 
 0x70486858UL, 0x7e416553UL, 0x6c5a724eUL, 0x62537f45UL, 0x486c5c74UL, 0x4665517fUL, 0x547e4662UL, 0x5a774b69UL, 
 0xe090d0b0UL, 0xee99ddbbUL, 0xfc82caa6UL, 0xf28bc7adUL, 0xd8b4e49cUL, 0xd6bde997UL, 0xc4a6fe8aUL, 0xcaaff381UL, 
@@ -869,7 +869,7 @@ static const unsigned long Tks0[] = {
 0xa779b492UL, 0xa970b999UL, 0xbb6bae84UL, 0xb562a38fUL, 0x9f5d80beUL, 0x91548db5UL, 0x834f9aa8UL, 0x8d4697a3UL
 };
 
-static const unsigned long Tks1[] = {
+static const ulong32 Tks1[] = {
 0x00000000UL, 0x0b0e090dUL, 0x161c121aUL, 0x1d121b17UL, 0x2c382434UL, 0x27362d39UL, 0x3a24362eUL, 0x312a3f23UL, 
 0x58704868UL, 0x537e4165UL, 0x4e6c5a72UL, 0x4562537fUL, 0x74486c5cUL, 0x7f466551UL, 0x62547e46UL, 0x695a774bUL, 
 0xb0e090d0UL, 0xbbee99ddUL, 0xa6fc82caUL, 0xadf28bc7UL, 0x9cd8b4e4UL, 0x97d6bde9UL, 0x8ac4a6feUL, 0x81caaff3UL, 
@@ -904,7 +904,7 @@ static const unsigned long Tks1[] = {
 0x92a779b4UL, 0x99a970b9UL, 0x84bb6baeUL, 0x8fb562a3UL, 0xbe9f5d80UL, 0xb591548dUL, 0xa8834f9aUL, 0xa38d4697UL
 };
 
-static const unsigned long Tks2[] = {
+static const ulong32 Tks2[] = {
 0x00000000UL, 0x0d0b0e09UL, 0x1a161c12UL, 0x171d121bUL, 0x342c3824UL, 0x3927362dUL, 0x2e3a2436UL, 0x23312a3fUL, 
 0x68587048UL, 0x65537e41UL, 0x724e6c5aUL, 0x7f456253UL, 0x5c74486cUL, 0x517f4665UL, 0x4662547eUL, 0x4b695a77UL, 
 0xd0b0e090UL, 0xddbbee99UL, 0xcaa6fc82UL, 0xc7adf28bUL, 0xe49cd8b4UL, 0xe997d6bdUL, 0xfe8ac4a6UL, 0xf381caafUL, 
@@ -939,7 +939,7 @@ static const unsigned long Tks2[] = {
 0xb492a779UL, 0xb999a970UL, 0xae84bb6bUL, 0xa38fb562UL, 0x80be9f5dUL, 0x8db59154UL, 0x9aa8834fUL, 0x97a38d46UL
 };
 
-static const unsigned long Tks3[] = {
+static const ulong32 Tks3[] = {
 0x00000000UL, 0x090d0b0eUL, 0x121a161cUL, 0x1b171d12UL, 0x24342c38UL, 0x2d392736UL, 0x362e3a24UL, 0x3f23312aUL, 
 0x48685870UL, 0x4165537eUL, 0x5a724e6cUL, 0x537f4562UL, 0x6c5c7448UL, 0x65517f46UL, 0x7e466254UL, 0x774b695aUL, 
 0x90d0b0e0UL, 0x99ddbbeeUL, 0x82caa6fcUL, 0x8bc7adf2UL, 0xb4e49cd8UL, 0xbde997d6UL, 0xa6fe8ac4UL, 0xaff381caUL, 
diff --git a/bits.c b/bits.c
index 60e3a43..19d2f0f 100644
--- a/bits.c
+++ b/bits.c
@@ -1,4 +1,6 @@
 /* portable way to get secure random bits to feed a PRNG */
+#include <fcntl.h>
+#include <unistd.h>
 #include "mycrypt.h"
 
 #ifdef DEVRANDOM
@@ -9,26 +11,20 @@ static unsigned long rng_nix(unsigned char *buf, unsigned long len,
 #ifdef NO_FILE
     return 0;
 #else
-    FILE *f;
+    int src;
     unsigned long x;
 #ifdef TRY_URANDOM_FIRST
-    f = fopen("/dev/urandom", "rb");
-    if (f == NULL)
+    src = open("/dev/urandom", O_RDONLY);
+    if (src == -1)
 #endif /* TRY_URANDOM_FIRST */
-       f = fopen("/dev/random", "rb");
+       src = open("/dev/random", O_RDONLY);
 
-    if (f == NULL) {
+    if (src == -1) {
        return 0;
     }
     
-    /* disable buffering */
-    if (setvbuf(f, NULL, _IONBF, 0) != 0) {
-       fclose(f);
-       return 0;
-    }   
- 
-    x = (unsigned long)fread(buf, 1, (size_t)len, f);
-    fclose(f);
+    x = (unsigned long)read(src, buf, (size_t)len);
+    close(src);
     return x;
 #endif /* NO_FILE */
 }
diff --git a/blowfish.c b/blowfish.c
index 72b2d8f..aba8a02 100644
--- a/blowfish.c
+++ b/blowfish.c
@@ -14,7 +14,7 @@ const struct _cipher_descriptor blowfish_desc =
     &blowfish_keysize
 };
 
-static const unsigned long ORIG_P[16 + 2] = {
+static const ulong32 ORIG_P[16 + 2] = {
         0x243F6A88UL, 0x85A308D3UL, 0x13198A2EUL, 0x03707344UL,
         0xA4093822UL, 0x299F31D0UL, 0x082EFA98UL, 0xEC4E6C89UL,
         0x452821E6UL, 0x38D01377UL, 0xBE5466CFUL, 0x34E90C6CUL,
@@ -22,7 +22,7 @@ static const unsigned long ORIG_P[16 + 2] = {
         0x9216D5D9UL, 0x8979FB1BUL
 };
 
-static const unsigned long ORIG_S[4][256] = {
+static const ulong32 ORIG_S[4][256] = {
     {   0xD1310BA6UL, 0x98DFB5ACUL, 0x2FFD72DBUL, 0xD01ADFB7UL,
         0xB8E1AFEDUL, 0x6A267E96UL, 0xBA7C9045UL, 0xF12C7F99UL,
         0x24A19947UL, 0xB3916CF7UL, 0x0801F2E2UL, 0x858EFC16UL,
@@ -284,7 +284,7 @@ static const unsigned long ORIG_S[4][256] = {
 int blowfish_setup(const unsigned char *key, int keylen, int num_rounds,
                    symmetric_key *skey)
 {
-   unsigned long x, y, z, A;
+   ulong32 x, y, z, A;
    unsigned char B[8];
 
    _ARGCHK(key != NULL);
@@ -304,7 +304,7 @@ int blowfish_setup(const unsigned char *key, int keylen, int num_rounds,
    for (x = y = 0; x < 18; x++) {
        A = 0;
        for (z = 0; z < 4; z++) {
-           A = (A << 8) | ((unsigned long)key[y++ % keylen]);
+           A = (A << 8) | ((ulong32)key[y++ % keylen]);
        }
        skey->blowfish.K[x] = ORIG_P[x] ^ A;
    }
@@ -362,10 +362,10 @@ static void _blowfish_ecb_encrypt(const unsigned char *pt, unsigned char *ct, sy
 void blowfish_ecb_encrypt(const unsigned char *pt, unsigned char *ct, symmetric_key *key)
 #endif
 {
-   unsigned long L, R;
+   ulong32 L, R;
    int r;
 #if !defined(TWOFISH_SMALL) && !defined(__GNUC__)
-    unsigned long *S1, *S2, *S3, *S4;
+    ulong32 *S1, *S2, *S3, *S4;
 #endif    
 
     _ARGCHK(pt != NULL);
@@ -404,7 +404,7 @@ void blowfish_ecb_encrypt(const unsigned char *pt, unsigned char *ct, symmetric_
 void blowfish_ecb_encrypt(const unsigned char *pt, unsigned char *ct, symmetric_key *key)
 {
     _blowfish_ecb_encrypt(pt, ct, key);
-    burn_stack(sizeof(unsigned long) * 2 + sizeof(int));
+    burn_stack(sizeof(ulong32) * 2 + sizeof(int));
 }
 #endif
 
@@ -414,10 +414,10 @@ static void _blowfish_ecb_decrypt(const unsigned char *ct, unsigned char *pt, sy
 void blowfish_ecb_decrypt(const unsigned char *ct, unsigned char *pt, symmetric_key *key)
 #endif
 {
-   unsigned long L, R;
+   ulong32 L, R;
    int r;
 #if !defined(TWOFISH_SMALL) && !defined(__GNUC__)
-    unsigned long *S1, *S2, *S3, *S4;
+    ulong32 *S1, *S2, *S3, *S4;
 #endif    
 
     _ARGCHK(pt != NULL);
@@ -456,7 +456,7 @@ void blowfish_ecb_decrypt(const unsigned char *ct, unsigned char *pt, symmetric_
 void blowfish_ecb_decrypt(const unsigned char *ct, unsigned char *pt, symmetric_key *key)
 {
     _blowfish_ecb_decrypt(ct, pt, key);
-    burn_stack(sizeof(unsigned long) * 2 + sizeof(int));
+    burn_stack(sizeof(ulong32) * 2 + sizeof(int));
 }
 #endif
 
diff --git a/cast5.c b/cast5.c
index c85cd68..2d79b9d 100644
--- a/cast5.c
+++ b/cast5.c
@@ -14,7 +14,7 @@ const struct _cipher_descriptor cast5_desc = {
    &cast5_keysize
 };
 
-static const unsigned long S1[256] = {
+static const ulong32 S1[256] = {
 0x30fb40d4UL, 0x9fa0ff0bUL, 0x6beccd2fUL, 0x3f258c7aUL, 0x1e213f2fUL, 0x9c004dd3UL, 
 0x6003e540UL, 0xcf9fc949UL, 0xbfd4af27UL, 0x88bbbdb5UL, 0xe2034090UL, 0x98d09675UL, 
 0x6e63a0e0UL, 0x15c361d2UL, 0xc2e7661dUL, 0x22d4ff8eUL, 0x28683b6fUL, 0xc07fd059UL, 
@@ -59,7 +59,7 @@ static const unsigned long S1[256] = {
 0xb141ab08UL, 0x7cca89b9UL, 0x1a69e783UL, 0x02cc4843UL, 0xa2f7c579UL, 0x429ef47dUL, 
 0x427b169cUL, 0x5ac9f049UL, 0xdd8f0f00UL, 0x5c8165bfUL};
 
-static const unsigned long S2[256] = {
+static const ulong32 S2[256] = {
 0x1f201094UL, 0xef0ba75bUL, 0x69e3cf7eUL, 0x393f4380UL, 0xfe61cf7aUL, 0xeec5207aUL, 
 0x55889c94UL, 0x72fc0651UL, 0xada7ef79UL, 0x4e1d7235UL, 0xd55a63ceUL, 0xde0436baUL, 
 0x99c430efUL, 0x5f0c0794UL, 0x18dcdb7dUL, 0xa1d6eff3UL, 0xa0b52f7bUL, 0x59e83605UL, 
@@ -104,7 +104,7 @@ static const unsigned long S2[256] = {
 0x5c038323UL, 0x3e5d3bb9UL, 0x43d79572UL, 0x7e6dd07cUL, 0x06dfdf1eUL, 0x6c6cc4efUL, 
 0x7160a539UL, 0x73bfbe70UL, 0x83877605UL, 0x4523ecf1UL};
 
-static const unsigned long S3[256] = {
+static const ulong32 S3[256] = {
 0x8defc240UL, 0x25fa5d9fUL, 0xeb903dbfUL, 0xe810c907UL, 0x47607fffUL, 0x369fe44bUL, 
 0x8c1fc644UL, 0xaececa90UL, 0xbeb1f9bfUL, 0xeefbcaeaUL, 0xe8cf1950UL, 0x51df07aeUL, 
 0x920e8806UL, 0xf0ad0548UL, 0xe13c8d83UL, 0x927010d5UL, 0x11107d9fUL, 0x07647db9UL, 
@@ -149,7 +149,7 @@ static const unsigned long S3[256] = {
 0x52bce688UL, 0x1b03588aUL, 0xf7baefd5UL, 0x4142ed9cUL, 0xa4315c11UL, 0x83323ec5UL, 
 0xdfef4636UL, 0xa133c501UL, 0xe9d3531cUL, 0xee353783UL};
 
-static const unsigned long S4[256] = {
+static const ulong32 S4[256] = {
 0x9db30420UL, 0x1fb6e9deUL, 0xa7be7befUL, 0xd273a298UL, 0x4a4f7bdbUL, 0x64ad8c57UL, 
 0x85510443UL, 0xfa020ed1UL, 0x7e287affUL, 0xe60fb663UL, 0x095f35a1UL, 0x79ebf120UL, 
 0xfd059d43UL, 0x6497b7b1UL, 0xf3641f63UL, 0x241e4adfUL, 0x28147f5fUL, 0x4fa2b8cdUL, 
@@ -194,7 +194,7 @@ static const unsigned long S4[256] = {
 0xb657c34dUL, 0x4edfd282UL, 0x7ae5290cUL, 0x3cb9536bUL, 0x851e20feUL, 0x9833557eUL, 
 0x13ecf0b0UL, 0xd3ffb372UL, 0x3f85c5c1UL, 0x0aef7ed2UL};
 
-static const unsigned long S5[256] = {
+static const ulong32 S5[256] = {
 0x7ec90c04UL, 0x2c6e74b9UL, 0x9b0e66dfUL, 0xa6337911UL, 0xb86a7fffUL, 0x1dd358f5UL, 
 0x44dd9d44UL, 0x1731167fUL, 0x08fbf1faUL, 0xe7f511ccUL, 0xd2051b00UL, 0x735aba00UL, 
 0x2ab722d8UL, 0x386381cbUL, 0xacf6243aUL, 0x69befd7aUL, 0xe6a2e77fUL, 0xf0c720cdUL, 
@@ -239,7 +239,7 @@ static const unsigned long S5[256] = {
 0x34010718UL, 0xbb30cab8UL, 0xe822fe15UL, 0x88570983UL, 0x750e6249UL, 0xda627e55UL, 
 0x5e76ffa8UL, 0xb1534546UL, 0x6d47de08UL, 0xefe9e7d4UL};
 
-static const unsigned long S6[256] = {
+static const ulong32 S6[256] = {
 0xf6fa8f9dUL, 0x2cac6ce1UL, 0x4ca34867UL, 0xe2337f7cUL, 0x95db08e7UL, 0x016843b4UL, 
 0xeced5cbcUL, 0x325553acUL, 0xbf9f0960UL, 0xdfa1e2edUL, 0x83f0579dUL, 0x63ed86b9UL, 
 0x1ab6a6b8UL, 0xde5ebe39UL, 0xf38ff732UL, 0x8989b138UL, 0x33f14961UL, 0xc01937bdUL, 
@@ -284,7 +284,7 @@ static const unsigned long S6[256] = {
 0xb0e93524UL, 0xbebb8fbdUL, 0xa2d762cfUL, 0x49c92f54UL, 0x38b5f331UL, 0x7128a454UL, 
 0x48392905UL, 0xa65b1db8UL, 0x851c97bdUL, 0xd675cf2fUL};
 
-static const unsigned long S7[256] = {
+static const ulong32 S7[256] = {
 0x85e04019UL, 0x332bf567UL, 0x662dbfffUL, 0xcfc65693UL, 0x2a8d7f6fUL, 0xab9bc912UL, 
 0xde6008a1UL, 0x2028da1fUL, 0x0227bce7UL, 0x4d642916UL, 0x18fac300UL, 0x50f18b82UL, 
 0x2cb2cb11UL, 0xb232e75cUL, 0x4b3695f2UL, 0xb28707deUL, 0xa05fbcf6UL, 0xcd4181e9UL, 
@@ -329,7 +329,7 @@ static const unsigned long S7[256] = {
 0xc3c0bdaeUL, 0x4958c24cUL, 0x518f36b2UL, 0x84b1d370UL, 0x0fedce83UL, 0x878ddadaUL, 
 0xf2a279c7UL, 0x94e01be8UL, 0x90716f4bUL, 0x954b8aa3UL};
 
-static const unsigned long S8[256] = {
+static const ulong32 S8[256] = {
 0xe216300dUL, 0xbbddfffcUL, 0xa7ebdabdUL, 0x35648095UL, 0x7789f8b7UL, 0xe6c1121bUL, 
 0x0e241600UL, 0x052ce8b5UL, 0x11a9cfb0UL, 0xe5952f11UL, 0xece7990aUL, 0x9386d174UL, 
 0x2a42931cUL, 0x76e38111UL, 0xb12def3aUL, 0x37ddddfcUL, 0xde9adeb1UL, 0x0a0cc32cUL, 
@@ -375,11 +375,15 @@ static const unsigned long S8[256] = {
 0x50b2ad80UL, 0xeaee6801UL, 0x8db2a283UL, 0xea8bf59eUL};
 
 /* returns the i'th byte of a variable */
-#define GB(x, i) (((x[(15-i)>>2])>>(unsigned)(8*((15-i)&3)))&255)
+#ifdef _MSC_VER /* MSVC variant: the (unsigned char) cast selects the byte (assumes 8-bit char) */
+   #define GB(x, i) ((unsigned char)((x[(15-i)>>2])>>(unsigned)(8*((15-i)&3))))
+#else   /* all other compilers: shift the wanted byte down, then mask with 255 */
+   #define GB(x, i) (((x[(15-i)>>2])>>(unsigned)(8*((15-i)&3)))&255)
+#endif   /* _MSC_VER */
 
 int cast5_setup(const unsigned char *key, int keylen, int num_rounds, symmetric_key *skey)
 {
-   unsigned long x[4], z[4];
+   ulong32 x[4], z[4];
    unsigned char buf[16];
    int y, i;
 
@@ -464,25 +468,25 @@ int cast5_setup(const unsigned char *key, int keylen, int num_rounds, symmetric_
    #define INLINE 
 #endif   
    
-INLINE static unsigned long FI(unsigned long R, unsigned long Km, unsigned long Kr)
+INLINE static ulong32 FI(ulong32 R, ulong32 Km, ulong32 Kr)
 {
-   unsigned long I;
+   ulong32 I;
    I = (Km + R);
    I = ROL(I, Kr);
    return ((S1[byte(I, 3)] ^ S2[byte(I,2)]) - S3[byte(I,1)]) + S4[byte(I,0)];
 }
    
-INLINE static unsigned long FII(unsigned long R, unsigned long Km, unsigned long Kr)
+INLINE static ulong32 FII(ulong32 R, ulong32 Km, ulong32 Kr)
 {
-   unsigned long I;
+   ulong32 I;
    I = (Km ^ R);
    I = ROL(I, Kr);
    return ((S1[byte(I, 3)] - S2[byte(I,2)]) + S3[byte(I,1)]) ^ S4[byte(I,0)];
 }
 
-INLINE static unsigned long FIII(unsigned long R, unsigned long Km, unsigned long Kr)
+INLINE static ulong32 FIII(ulong32 R, ulong32 Km, ulong32 Kr)
 {
-   unsigned long I;
+   ulong32 I;
    I = (Km - R);
    I = ROL(I, Kr);
    return ((S1[byte(I, 3)] + S2[byte(I,2)]) ^ S3[byte(I,1)]) - S4[byte(I,0)];
@@ -490,7 +494,7 @@ INLINE static unsigned long FIII(unsigned long R, unsigned long Km, unsigned lon
 
 void cast5_ecb_encrypt(const unsigned char *pt, unsigned char *ct, symmetric_key *key)
 {
-   unsigned long R, L;
+   ulong32 R, L;
 
    _ARGCHK(pt != NULL);
    _ARGCHK(ct != NULL);
@@ -523,7 +527,7 @@ void cast5_ecb_encrypt(const unsigned char *pt, unsigned char *ct, symmetric_key
 
 void cast5_ecb_decrypt(const unsigned char *ct, unsigned char *pt, symmetric_key *key)
 {
-   unsigned long R, L;
+   ulong32 R, L;
 
    _ARGCHK(pt != NULL);
    _ARGCHK(ct != NULL);
diff --git a/changes b/changes
index 5acc5f5..5f5e4a2 100644
--- a/changes
+++ b/changes
@@ -1,3 +1,13 @@
+Jul 10th, 2003
+v0.88  -- Sped up CAST5 key schedule for MSVC
+       -- Added the "ulong32" type so that platforms where unsigned long is 64 bits can force the
+          32-bit tables in ciphers like Blowfish and AES to stay 32 bits wide.
+       -- Optimized the SAFER-SK64, SAFER-SK128, SAFER+, RC5 and RC6 key schedules [big time!]
+       -- Optimized SHA-1 and SHA-256 quite a bit too.
+       -- Fixed up the makefile to use -fomit-frame-pointer more liberally
+       -- Added tv_gen program which makes test vectors for ciphers/hashes
+       -- Merged in LibTomMath v0.22
+       
 Jun 19th, 2003
 v0.87  -- Many MSVC optimizations to the code base
        -- Improved the AES and Twofish key schedule [faster, more constant time]
diff --git a/crypt.out b/crypt.out
index 7471011..7a3feac 100644
--- a/crypt.out
+++ b/crypt.out
@@ -1,76 +1,76 @@
-\BOOKMARK [0][-]{chapter.1}{Introduction}{}
-\BOOKMARK [1][-]{section.1.1}{What is the LibTomCrypt?}{chapter.1}
-\BOOKMARK [2][-]{subsection.1.1.1}{What the library IS for?}{section.1.1}
-\BOOKMARK [2][-]{subsection.1.1.2}{What the library IS NOT for?}{section.1.1}
-\BOOKMARK [1][-]{section.1.2}{Why did I write it?}{chapter.1}
-\BOOKMARK [2][-]{subsection.1.2.1}{Modular}{section.1.2}
-\BOOKMARK [1][-]{section.1.3}{License}{chapter.1}
-\BOOKMARK [1][-]{section.1.4}{Patent Disclosure}{chapter.1}
-\BOOKMARK [1][-]{section.1.5}{Building the library}{chapter.1}
-\BOOKMARK [1][-]{section.1.6}{Building against the library}{chapter.1}
-\BOOKMARK [1][-]{section.1.7}{Thanks}{chapter.1}
-\BOOKMARK [0][-]{chapter.2}{The Application Programming Interface \(API\)}{}
-\BOOKMARK [1][-]{section.2.1}{Introduction}{chapter.2}
-\BOOKMARK [1][-]{section.2.2}{Macros}{chapter.2}
-\BOOKMARK [1][-]{section.2.3}{Functions with Variable Length Output}{chapter.2}
-\BOOKMARK [1][-]{section.2.4}{Functions that need a PRNG}{chapter.2}
-\BOOKMARK [1][-]{section.2.5}{Functions that use Arrays of Octets}{chapter.2}
-\BOOKMARK [0][-]{chapter.3}{Symmetric Block Ciphers}{}
-\BOOKMARK [1][-]{section.3.1}{Core Functions}{chapter.3}
-\BOOKMARK [1][-]{section.3.2}{Key Sizes and Number of Rounds}{chapter.3}
-\BOOKMARK [1][-]{section.3.3}{The Cipher Descriptors}{chapter.3}
-\BOOKMARK [2][-]{subsection.3.3.1}{Notes}{section.3.3}
-\BOOKMARK [1][-]{section.3.4}{Symmetric Modes of Operations}{chapter.3}
-\BOOKMARK [2][-]{subsection.3.4.1}{Background}{section.3.4}
-\BOOKMARK [2][-]{subsection.3.4.2}{Choice of Mode}{section.3.4}
-\BOOKMARK [2][-]{subsection.3.4.3}{Implementation}{section.3.4}
-\BOOKMARK [0][-]{chapter.4}{One-Way Cryptographic Hash Functions}{}
-\BOOKMARK [1][-]{section.4.1}{Core Functions}{chapter.4}
-\BOOKMARK [1][-]{section.4.2}{Hash Descriptors}{chapter.4}
-\BOOKMARK [2][-]{subsection.4.2.1}{Notice}{section.4.2}
-\BOOKMARK [1][-]{section.4.3}{Hash based Message Authenication Codes}{chapter.4}
-\BOOKMARK [0][-]{chapter.5}{Pseudo-Random Number Generators}{}
-\BOOKMARK [1][-]{section.5.1}{Core Functions}{chapter.5}
-\BOOKMARK [2][-]{subsection.5.1.1}{Remarks}{section.5.1}
-\BOOKMARK [2][-]{subsection.5.1.2}{Example}{section.5.1}
-\BOOKMARK [1][-]{section.5.2}{PRNG Descriptors}{chapter.5}
-\BOOKMARK [1][-]{section.5.3}{The Secure RNG}{chapter.5}
-\BOOKMARK [2][-]{subsection.5.3.1}{The Secure PRNG Interface}{section.5.3}
-\BOOKMARK [0][-]{chapter.6}{RSA Routines}{}
-\BOOKMARK [1][-]{section.6.1}{Background}{chapter.6}
-\BOOKMARK [1][-]{section.6.2}{Core Functions}{chapter.6}
-\BOOKMARK [1][-]{section.6.3}{Packet Routines}{chapter.6}
-\BOOKMARK [1][-]{section.6.4}{Remarks}{chapter.6}
-\BOOKMARK [0][-]{chapter.7}{Diffie-Hellman Key Exchange}{}
-\BOOKMARK [1][-]{section.7.1}{Background}{chapter.7}
-\BOOKMARK [1][-]{section.7.2}{Core Functions}{chapter.7}
-\BOOKMARK [2][-]{subsection.7.2.1}{Remarks on Usage}{section.7.2}
-\BOOKMARK [2][-]{subsection.7.2.2}{Remarks on The Snippet}{section.7.2}
-\BOOKMARK [1][-]{section.7.3}{Other Diffie-Hellman Functions}{chapter.7}
-\BOOKMARK [1][-]{section.7.4}{DH Packet}{chapter.7}
-\BOOKMARK [0][-]{chapter.8}{Elliptic Curve Cryptography}{}
-\BOOKMARK [1][-]{section.8.1}{Background}{chapter.8}
-\BOOKMARK [1][-]{section.8.2}{Core Functions}{chapter.8}
-\BOOKMARK [1][-]{section.8.3}{ECC Packet}{chapter.8}
-\BOOKMARK [1][-]{section.8.4}{ECC Keysizes}{chapter.8}
-\BOOKMARK [0][-]{chapter.9}{Public Keyrings}{}
-\BOOKMARK [1][-]{section.9.1}{Introduction}{chapter.9}
-\BOOKMARK [1][-]{section.9.2}{The Keyring API}{chapter.9}
-\BOOKMARK [0][-]{chapter.10}{GF\(2w\) Math Routines}{}
-\BOOKMARK [0][-]{chapter.11}{Miscellaneous}{}
-\BOOKMARK [1][-]{section.11.1}{Base64 Encoding and Decoding}{chapter.11}
-\BOOKMARK [1][-]{section.11.2}{The Multiple Precision Integer Library \(MPI\)}{chapter.11}
-\BOOKMARK [2][-]{subsection.11.2.1}{Binary Forms of ``mp\137int'' Variables}{section.11.2}
-\BOOKMARK [2][-]{subsection.11.2.2}{Primality Testing}{section.11.2}
-\BOOKMARK [0][-]{chapter.12}{Programming Guidelines}{}
-\BOOKMARK [1][-]{section.12.1}{Secure Pseudo Random Number Generators}{chapter.12}
-\BOOKMARK [1][-]{section.12.2}{Preventing Trivial Errors}{chapter.12}
-\BOOKMARK [1][-]{section.12.3}{Registering Your Algorithms}{chapter.12}
-\BOOKMARK [1][-]{section.12.4}{Key Sizes}{chapter.12}
-\BOOKMARK [2][-]{subsection.12.4.1}{Symmetric Ciphers}{section.12.4}
-\BOOKMARK [2][-]{subsection.12.4.2}{Assymetric Ciphers}{section.12.4}
-\BOOKMARK [1][-]{section.12.5}{Thread Safety}{chapter.12}
-\BOOKMARK [0][-]{chapter.13}{Configuring the Library}{}
-\BOOKMARK [1][-]{section.13.1}{Introduction}{chapter.13}
-\BOOKMARK [1][-]{section.13.2}{mycrypt\137cfg.h}{chapter.13}
-\BOOKMARK [1][-]{section.13.3}{The Configure Script}{chapter.13}
+\BOOKMARK [0][-]{chapter.1}{Introduction}{}
+\BOOKMARK [1][-]{section.1.1}{What is the LibTomCrypt?}{chapter.1}
+\BOOKMARK [2][-]{subsection.1.1.1}{What the library IS for?}{section.1.1}
+\BOOKMARK [2][-]{subsection.1.1.2}{What the library IS NOT for?}{section.1.1}
+\BOOKMARK [1][-]{section.1.2}{Why did I write it?}{chapter.1}
+\BOOKMARK [2][-]{subsection.1.2.1}{Modular}{section.1.2}
+\BOOKMARK [1][-]{section.1.3}{License}{chapter.1}
+\BOOKMARK [1][-]{section.1.4}{Patent Disclosure}{chapter.1}
+\BOOKMARK [1][-]{section.1.5}{Building the library}{chapter.1}
+\BOOKMARK [1][-]{section.1.6}{Building against the library}{chapter.1}
+\BOOKMARK [1][-]{section.1.7}{Thanks}{chapter.1}
+\BOOKMARK [0][-]{chapter.2}{The Application Programming Interface \(API\)}{}
+\BOOKMARK [1][-]{section.2.1}{Introduction}{chapter.2}
+\BOOKMARK [1][-]{section.2.2}{Macros}{chapter.2}
+\BOOKMARK [1][-]{section.2.3}{Functions with Variable Length Output}{chapter.2}
+\BOOKMARK [1][-]{section.2.4}{Functions that need a PRNG}{chapter.2}
+\BOOKMARK [1][-]{section.2.5}{Functions that use Arrays of Octets}{chapter.2}
+\BOOKMARK [0][-]{chapter.3}{Symmetric Block Ciphers}{}
+\BOOKMARK [1][-]{section.3.1}{Core Functions}{chapter.3}
+\BOOKMARK [1][-]{section.3.2}{Key Sizes and Number of Rounds}{chapter.3}
+\BOOKMARK [1][-]{section.3.3}{The Cipher Descriptors}{chapter.3}
+\BOOKMARK [2][-]{subsection.3.3.1}{Notes}{section.3.3}
+\BOOKMARK [1][-]{section.3.4}{Symmetric Modes of Operations}{chapter.3}
+\BOOKMARK [2][-]{subsection.3.4.1}{Background}{section.3.4}
+\BOOKMARK [2][-]{subsection.3.4.2}{Choice of Mode}{section.3.4}
+\BOOKMARK [2][-]{subsection.3.4.3}{Implementation}{section.3.4}
+\BOOKMARK [0][-]{chapter.4}{One-Way Cryptographic Hash Functions}{}
+\BOOKMARK [1][-]{section.4.1}{Core Functions}{chapter.4}
+\BOOKMARK [1][-]{section.4.2}{Hash Descriptors}{chapter.4}
+\BOOKMARK [2][-]{subsection.4.2.1}{Notice}{section.4.2}
+\BOOKMARK [1][-]{section.4.3}{Hash based Message Authenication Codes}{chapter.4}
+\BOOKMARK [0][-]{chapter.5}{Pseudo-Random Number Generators}{}
+\BOOKMARK [1][-]{section.5.1}{Core Functions}{chapter.5}
+\BOOKMARK [2][-]{subsection.5.1.1}{Remarks}{section.5.1}
+\BOOKMARK [2][-]{subsection.5.1.2}{Example}{section.5.1}
+\BOOKMARK [1][-]{section.5.2}{PRNG Descriptors}{chapter.5}
+\BOOKMARK [1][-]{section.5.3}{The Secure RNG}{chapter.5}
+\BOOKMARK [2][-]{subsection.5.3.1}{The Secure PRNG Interface}{section.5.3}
+\BOOKMARK [0][-]{chapter.6}{RSA Routines}{}
+\BOOKMARK [1][-]{section.6.1}{Background}{chapter.6}
+\BOOKMARK [1][-]{section.6.2}{Core Functions}{chapter.6}
+\BOOKMARK [1][-]{section.6.3}{Packet Routines}{chapter.6}
+\BOOKMARK [1][-]{section.6.4}{Remarks}{chapter.6}
+\BOOKMARK [0][-]{chapter.7}{Diffie-Hellman Key Exchange}{}
+\BOOKMARK [1][-]{section.7.1}{Background}{chapter.7}
+\BOOKMARK [1][-]{section.7.2}{Core Functions}{chapter.7}
+\BOOKMARK [2][-]{subsection.7.2.1}{Remarks on Usage}{section.7.2}
+\BOOKMARK [2][-]{subsection.7.2.2}{Remarks on The Snippet}{section.7.2}
+\BOOKMARK [1][-]{section.7.3}{Other Diffie-Hellman Functions}{chapter.7}
+\BOOKMARK [1][-]{section.7.4}{DH Packet}{chapter.7}
+\BOOKMARK [0][-]{chapter.8}{Elliptic Curve Cryptography}{}
+\BOOKMARK [1][-]{section.8.1}{Background}{chapter.8}
+\BOOKMARK [1][-]{section.8.2}{Core Functions}{chapter.8}
+\BOOKMARK [1][-]{section.8.3}{ECC Packet}{chapter.8}
+\BOOKMARK [1][-]{section.8.4}{ECC Keysizes}{chapter.8}
+\BOOKMARK [0][-]{chapter.9}{Public Keyrings}{}
+\BOOKMARK [1][-]{section.9.1}{Introduction}{chapter.9}
+\BOOKMARK [1][-]{section.9.2}{The Keyring API}{chapter.9}
+\BOOKMARK [0][-]{chapter.10}{GF\(2w\) Math Routines}{}
+\BOOKMARK [0][-]{chapter.11}{Miscellaneous}{}
+\BOOKMARK [1][-]{section.11.1}{Base64 Encoding and Decoding}{chapter.11}
+\BOOKMARK [1][-]{section.11.2}{The Multiple Precision Integer Library \(MPI\)}{chapter.11}
+\BOOKMARK [2][-]{subsection.11.2.1}{Binary Forms of ``mp\137int'' Variables}{section.11.2}
+\BOOKMARK [2][-]{subsection.11.2.2}{Primality Testing}{section.11.2}
+\BOOKMARK [0][-]{chapter.12}{Programming Guidelines}{}
+\BOOKMARK [1][-]{section.12.1}{Secure Pseudo Random Number Generators}{chapter.12}
+\BOOKMARK [1][-]{section.12.2}{Preventing Trivial Errors}{chapter.12}
+\BOOKMARK [1][-]{section.12.3}{Registering Your Algorithms}{chapter.12}
+\BOOKMARK [1][-]{section.12.4}{Key Sizes}{chapter.12}
+\BOOKMARK [2][-]{subsection.12.4.1}{Symmetric Ciphers}{section.12.4}
+\BOOKMARK [2][-]{subsection.12.4.2}{Assymetric Ciphers}{section.12.4}
+\BOOKMARK [1][-]{section.12.5}{Thread Safety}{chapter.12}
+\BOOKMARK [0][-]{chapter.13}{Configuring the Library}{}
+\BOOKMARK [1][-]{section.13.1}{Introduction}{chapter.13}
+\BOOKMARK [1][-]{section.13.2}{mycrypt\137cfg.h}{chapter.13}
+\BOOKMARK [1][-]{section.13.3}{The Configure Script}{chapter.13}
diff --git a/crypt.pdf b/crypt.pdf
index f5133ba..d60aca0 100644
Binary files a/crypt.pdf and b/crypt.pdf differ
diff --git a/crypt.tex b/crypt.tex
index d040e73..0cfeff3 100644
--- a/crypt.tex
+++ b/crypt.tex
@@ -47,7 +47,7 @@
 \def\gap{\vspace{0.5ex}}
 \makeindex
 \begin{document}
-\title{A Tiny Crypto Library, \\ LibTomCrypt \\ Version 0.87}
+\title{A Tiny Crypto Library, \\ LibTomCrypt \\ Version 0.88}
 \author{Tom St Denis \\
 Algonquin College \\
 \\
diff --git a/demos/test.c b/demos/test.c
index 420c8fe..cc6ff98 100644
--- a/demos/test.c
+++ b/demos/test.c
@@ -1261,7 +1261,7 @@ gf_tests (void)
 void
 test_prime (void)
 {
-  unsigned char buf[1024];
+  char buf[1024];
   mp_int  a;
   int     x;
 
diff --git a/demos/tv_gen.c b/demos/tv_gen.c
new file mode 100644
index 0000000..5230559
--- /dev/null
+++ b/demos/tv_gen.c
@@ -0,0 +1,167 @@
+#include <mycrypt.h>
+/* Register each cipher and hash descriptor whose feature macro is defined at build time. */
+void reg_algs(void)
+{
+#ifdef RIJNDAEL
+  register_cipher (&aes_desc);
+#endif
+#ifdef BLOWFISH
+  register_cipher (&blowfish_desc);
+#endif
+#ifdef XTEA
+  register_cipher (&xtea_desc);
+#endif
+#ifdef RC5
+  register_cipher (&rc5_desc);
+#endif
+#ifdef RC6
+  register_cipher (&rc6_desc);
+#endif
+#ifdef SAFERP
+  register_cipher (&saferp_desc);
+#endif
+#ifdef TWOFISH
+  register_cipher (&twofish_desc);
+#endif
+#ifdef SAFER
+  register_cipher (&safer_k64_desc);
+  register_cipher (&safer_sk64_desc);
+  register_cipher (&safer_k128_desc);
+  register_cipher (&safer_sk128_desc);
+#endif
+#ifdef RC2
+  register_cipher (&rc2_desc);
+#endif
+#ifdef DES
+  register_cipher (&des_desc);
+  register_cipher (&des3_desc);
+#endif
+#ifdef CAST5
+  register_cipher (&cast5_desc);
+#endif
+#ifdef NOEKEON
+  register_cipher (&noekeon_desc);
+#endif
+  /* hash descriptors follow the cipher descriptors */
+#ifdef TIGER
+  register_hash (&tiger_desc);
+#endif
+#ifdef MD2
+  register_hash (&md2_desc);
+#endif
+#ifdef MD4
+  register_hash (&md4_desc);
+#endif
+#ifdef MD5
+  register_hash (&md5_desc);
+#endif
+#ifdef SHA1
+  register_hash (&sha1_desc);
+#endif
+#ifdef SHA256
+  register_hash (&sha256_desc);
+#endif
+#ifdef SHA384
+  register_hash (&sha384_desc);
+#endif
+#ifdef SHA512
+  register_hash (&sha512_desc);
+#endif
+}
+/* Write hash_tv.txt: for every registered hash, the digest of the bytes 00 01 .. (nn-1) for nn = 0 .. 2*blocksize. */
+void hash_gen(void)
+{
+   unsigned char md[MAXBLOCKSIZE], buf[MAXBLOCKSIZE*2+2];
+   unsigned long outlen, x, y, z;
+   FILE *out;
+
+   out = fopen("hash_tv.txt", "w");
+   if (out == NULL) { perror("hash_tv.txt"); return; }   /* never hand a NULL stream to fprintf */
+   fprintf(out, "Hash Test Vectors:\n\nThese are the hashes of nn bytes '00 01 02 03 .. (nn-1)'\n\n");
+   for (x = 0; hash_descriptor[x].name != NULL; x++) {
+      fprintf(out, "Hash: %s\n", hash_descriptor[x].name);
+      /* y is the message length in bytes, from the empty message up to two full blocks */
+      for (y = 0; y <= (hash_descriptor[x].blocksize * 2); y++) {
+         for (z = 0; z < y; z++) {
+            buf[z] = (unsigned char)z;
+         }
+         outlen = sizeof(md);   /* capacity of md going in; presumably the digest length coming out - confirm */
+         hash_memory(x, buf, y, md, &outlen);
+         fprintf(out, "%3lu: ", y);
+         for (z = 0; z < outlen; z++) {
+            fprintf(out, "%02X", md[z]);
+         }
+         fprintf(out, "\n");
+      }
+      fprintf(out, "\n");
+   }
+   if (fclose(out) != 0) { perror("hash_tv.txt"); }   /* fclose flushes; report a failed write */
+}
+/* Write cipher_tv.txt: for every registered cipher and up to three key sizes, 25 chained ECB encryptions of a counting block. */
+void cipher_gen(void)
+{
+   unsigned char key[MAXBLOCKSIZE], pt[MAXBLOCKSIZE];
+   unsigned long x, y, z, w;
+   int kl, lastkl;
+   FILE *out;
+   symmetric_key skey;
+
+   out = fopen("cipher_tv.txt", "w");
+   if (out == NULL) { perror("cipher_tv.txt"); return; }   /* never hand a NULL stream to fprintf */
+   fprintf(out, "Cipher Test Vectors\n\nThese are test encryptions with key of nn bytes '00 01 02 03 .. (nn-1)' and original PT of the same style.\n\n");
+   for (x = 0; cipher_descriptor[x].name != NULL; x++) {
+      fprintf(out, "Cipher: %s\n", cipher_descriptor[x].name);
+
+      /* three passes: smallest, medium and largest key; stop once a size repeats */
+      lastkl = 10000;
+      for (y = 0; y < 3; y++) {
+         switch (y) {
+            case 0: kl = cipher_descriptor[x].min_key_length; break;
+            case 1: kl = (cipher_descriptor[x].min_key_length + cipher_descriptor[x].max_key_length)/2; break;
+            default: kl = cipher_descriptor[x].max_key_length; break;   /* y == 2; default keeps kl always assigned */
+         }
+         cipher_descriptor[x].keysize(&kl);   /* may adjust kl in place; presumably to a size the cipher accepts - confirm */
+         if (kl == lastkl) { break; }
+         lastkl = kl;
+         fprintf(out, "Key Size: %d bytes\n", kl);
+
+         for (z = 0; (int)z < kl; z++) {
+            key[z] = (unsigned char)z;
+         }
+         cipher_descriptor[x].setup(key, kl, 0, &skey);
+         /* pt is encrypted in place, so each output line is the encryption of the previous one */
+         for (z = 0; (int)z < cipher_descriptor[x].block_length; z++) {
+            pt[z] = (unsigned char)z;
+         }
+         for (w = 0; w < 25; w++) {
+            cipher_descriptor[x].ecb_encrypt(pt, pt, &skey);
+            fprintf(out, "%2lu: ", w);
+            for (z = 0; (int)z < cipher_descriptor[x].block_length; z++) {
+               fprintf(out, "%02X", pt[z]);
+            }
+            fprintf(out, "\n");
+         }
+         fprintf(out, "\n");
+      }
+      fprintf(out, "\n");
+   }
+   if (fclose(out) != 0) { perror("cipher_tv.txt"); }   /* fclose flushes; report a failed write */
+}
+   
+/* Register the available algorithms, then generate the hash and cipher test-vector files. */
+int main(void)
+{
+   reg_algs();
+   hash_gen();
+   cipher_gen();
+   
+   return 0;
+}
+
+
+         
+      
+      
+      
+    
+   
diff --git a/demos/x86_prof.c b/demos/x86_prof.c
index 87d6caf..72a2eb6 100644
--- a/demos/x86_prof.c
+++ b/demos/x86_prof.c
@@ -1,291 +1,283 @@
-#include <mycrypt.h>
-
-#define KTIMES  25
-#define TIMES   10000
-
-/* RDTSC from Scott Duplichan */
-static ulong64 rdtsc (void)
-   {
-   #if defined __GNUC__
-      #ifdef i386
-         ulong64 a;
-         asm volatile("rdtsc ":"=A" (a));
-         return a;
-      #else /* gcc-IA64 version */
-         unsigned long result;
-         __asm__ __volatile__("mov %0=ar.itc" : "=r"(result) :: "memory");
-         while (__builtin_expect ((int) result == -1, 0))
-         __asm__ __volatile__("mov %0=ar.itc" : "=r"(result) :: "memory");
-         return result;
-      #endif
-
-   // Microsoft and Intel Windows compilers
-   #elif defined _M_IX86
-     __asm rdtsc
-   #elif defined _M_AMD64
-     return __rdtsc ();
-   #elif defined _M_IA64
-     #if defined __INTEL_COMPILER
-       #include <ia64intrin.h>
-     #endif
-      return __getReg (3116);
-   #else
-     #error need rdtsc function for this build
-   #endif
-   }
-
-ulong64 timer, skew = 0;
-
-void t_start(void)
-{
-   timer = rdtsc();
-}
-
-ulong64 t_read(void)
-{
-   return rdtsc() - timer;
-}
-
-void init_timer(void)
-{
-   ulong64 c1, c2, t1, t2, t3;
-   unsigned long y1;
-   
-   c1 = c2 = (ulong64)-1;
-   for (y1 = 0; y1 < TIMES*100; y1++) {
-      t_start();
-      t1 = t_read();
-      t3 = t_read();
-      t2 = t_read() - t1;
-      
-      c1 = (c1 > t1) ? t1 : c1;
-      c2 = (c2 > t2) ? t2 : c2;
-   }
-   skew = c2 - c1;
-   printf("Clock Skew: %lu\n", (unsigned long)skew);
-}  
-
-void reg_algs(void)
-{
-#ifdef RIJNDAEL
-  register_cipher (&aes_desc);
-#endif
-#ifdef BLOWFISH
-  register_cipher (&blowfish_desc);
-#endif
-#ifdef XTEA
-  register_cipher (&xtea_desc);
-#endif
-#ifdef RC5
-  register_cipher (&rc5_desc);
-#endif
-#ifdef RC6
-  register_cipher (&rc6_desc);
-#endif
-#ifdef SAFERP
-  register_cipher (&saferp_desc);
-#endif
-#ifdef TWOFISH
-  register_cipher (&twofish_desc);
-#endif
-#ifdef SAFER
-  register_cipher (&safer_k64_desc);
-  register_cipher (&safer_sk64_desc);
-  register_cipher (&safer_k128_desc);
-  register_cipher (&safer_sk128_desc);
-#endif
-#ifdef RC2
-  register_cipher (&rc2_desc);
-#endif
-#ifdef DES
-  register_cipher (&des_desc);
-  register_cipher (&des3_desc);
-#endif
-#ifdef CAST5
-  register_cipher (&cast5_desc);
-#endif
-#ifdef NOEKEON
-  register_cipher (&noekeon_desc);
-#endif
-
-#ifdef TIGER
-  register_hash (&tiger_desc);
-#endif
-#ifdef MD2
-  register_hash (&md2_desc);
-#endif
-#ifdef MD4
-  register_hash (&md4_desc);
-#endif
-#ifdef MD5
-  register_hash (&md5_desc);
-#endif
-#ifdef SHA1
-  register_hash (&sha1_desc);
-#endif
-#ifdef SHA256
-  register_hash (&sha256_desc);
-#endif
-#ifdef SHA384
-  register_hash (&sha384_desc);
-#endif
-#ifdef SHA512
-  register_hash (&sha512_desc);
-#endif
-
-}
-
-int time_keysched(void)
-{
-  unsigned long x, i, y1;
-  ulong64 t1, c1;
-  symmetric_key skey;
-  int kl;
-  int    (*func) (const unsigned char *, int , int , symmetric_key *);
-  unsigned char key[256][MAXBLOCKSIZE];
-
-
-  printf ("\n\nKey Schedule Time Trials for the Symmetric Ciphers:\n(Times are cycles per key)\n");
-  for (x = 0; cipher_descriptor[x].name != NULL; x++) {
-#define DO1(k)   func(k, kl, 0, &skey);
-
-    func = cipher_descriptor[x].setup;
-    kl   = cipher_descriptor[x].min_key_length;
-    c1 = (ulong64)-1;
-    for (y1 = 0; y1 < KTIMES; y1++) {
-       for (i = 0; i < 256; i++) {
-          rng_get_bytes(key[i], kl, NULL);    
-       }
-    
-       t_start();
-       for (i = 0; i < 256; i++) {
-          DO1(key[i]);
-       }
-       t1 = t_read() >> 8;
-       if (t1 < c1) { if (y1 > 0) --y1; }
-       c1 = (t1 > c1) ? c1 : t1;
-    }
-    t1 = c1 - skew;
-    printf
-      ("%-20s: Schedule at %6lu\n", cipher_descriptor[x].name, (unsigned long)t1);
-
-#undef DO1
-   }
-   
-   return 0;
-}
-
-int time_cipher(void)
-{
-  unsigned long x, y1;
-  ulong64  t1, t2, c1, c2, a1, a2;
-  symmetric_key skey;
-  void    (*func) (const unsigned char *, unsigned char *, symmetric_key *);
-  unsigned char key[MAXBLOCKSIZE], pt[MAXBLOCKSIZE];
-
-
-  printf ("\n\nECB Time Trials for the Symmetric Ciphers:\n");
-  for (x = 0; cipher_descriptor[x].name != NULL; x++) {
-    cipher_descriptor[x].setup (key, cipher_descriptor[x].min_key_length, 0,
-                &skey);
-
-#define DO1   func(pt,pt,&skey);
-#define DO2   DO1 DO1
-
-    func = cipher_descriptor[x].ecb_encrypt;
-    c1 = c2 = (ulong64)-1;
-    for (y1 = 0; y1 < TIMES; y1++) {
-        t_start();
-        DO1;
-        t1 = t_read();
-        DO2;
-        t2 = t_read();
-        t2 -= t1;
-        
-        c1 = (t1 > c1 ? c1 : t1);
-        c2 = (t2 > c2 ? c2 : t2);
-    }
-    a1 = c2 - c1 - skew;
-        
-        
-    func = cipher_descriptor[x].ecb_decrypt;
-    c1 = c2 = (ulong64)-1;
-    for (y1 = 0; y1 < TIMES; y1++) {
-        t_start();
-        DO1;
-        t1 = t_read();
-        DO2;
-        t2 = t_read();
-        t2 -= t1;
-        
-        c1 = (t1 > c1 ? c1 : t1);
-        c2 = (t2 > c2 ? c2 : t2);
-    }
-    a2 = c2 - c1 - skew;
-    
-    printf
-      ("%-20s: Encrypt at %7.3f, Decrypt at %7.3f\n", cipher_descriptor[x].name, a1/(double)cipher_descriptor[x].block_length, a2/(double)cipher_descriptor[x].block_length);
-
-#undef DO2
-#undef DO1
-   }
-   
-   return 0;
-}
-
-int time_hash(void)
-{
-  unsigned long x, y1, len;
-  ulong64 t1, t2, c1, c2;
-  hash_state md;
-  void    (*func)(hash_state *, const unsigned char *, unsigned long);
-  unsigned char pt[MAXBLOCKSIZE];
-
- 
-  printf ("\n\nHASH Time Trials for:\n");
-  for (x = 0; hash_descriptor[x].name != NULL; x++) {
-    hash_descriptor[x].init(&md);
-
-#define DO1   func(&md,pt,len);
-#define DO2   DO1 DO1
-
-    func = hash_descriptor[x].process;
-    len  = hash_descriptor[x].blocksize;
-    
-    c1 = c2 = (ulong64)-1;
-    for (y1 = 0; y1 < TIMES; y1++) {
-       t_start();
-       DO1;
-       t1 = t_read();
-       DO2;
-       t2 = t_read() - t1;
-       c1 = (t1 > c1) ? c1 : t1;
-       c2 = (t2 > c2) ? c2 : t2;
-    }
-    t1 = c2 - c1 - skew;   
-    t1 = ((t1 * CONST64(1000))) / ((ulong64)hash_descriptor[x].blocksize);
-    
-    printf
-      ("%-20s: Process at %9.3f\n", hash_descriptor[x].name, t1 / 1000.0);
-
-#undef DO2
-#undef DO1
-   }
-   
-   return 0;
-}
-
-int main(void)
-{
-  reg_algs();
-
-  printf("Timings for ciphers and hashes.  Times are listed as cycles per byte processed.\n\n");
-  
-//  init_timer();
-  time_keysched();
-  time_cipher();
-  time_hash();
-  
-  return EXIT_SUCCESS;
-}  
-
+#include <mycrypt.h>
+
+#define KTIMES  25
+#define TIMES   100000
+
+/* RDTSC from Scott Duplichan: returns a raw CPU counter value; one variant per compiler/target is picked by the preprocessor, other builds hit the #error. */
+static ulong64 rdtsc (void)
+   {
+   #if defined __GNUC__
+      #ifdef i386
+         ulong64 a;
+         asm volatile("rdtsc ":"=A" (a));
+         return a;
+      #else /* gcc-IA64 version */
+         unsigned long result;
+         __asm__ __volatile__("mov %0=ar.itc" : "=r"(result) :: "memory");
+         while (__builtin_expect ((int) result == -1, 0))
+         __asm__ __volatile__("mov %0=ar.itc" : "=r"(result) :: "memory");
+         return result;
+      #endif
+
+   // Microsoft and Intel Windows compilers
+   #elif defined _M_IX86
+     __asm rdtsc
+   #elif defined _M_AMD64
+     return __rdtsc ();
+   #elif defined _M_IA64
+     #if defined __INTEL_COMPILER
+       #include <ia64intrin.h>
+     #endif
+      return __getReg (3116);
+   #else
+     #error need rdtsc function for this build
+   #endif
+   }
+/* timer: counter value captured by t_start(); skew: overhead subtracted from measurements (stays 0 unless init_timer() runs). */
+ulong64 timer, skew = 0;
+/* Record the current counter value as the start of a measurement. */
+void t_start(void)
+{
+   timer = rdtsc();
+}
+/* Return the counter ticks elapsed since the most recent t_start(). */
+ulong64 t_read(void)
+{
+   return rdtsc() - timer;
+}
+/* Calibrate the global skew: (fastest span from first to third t_read) minus (fastest single t_read), then print it. */
+void init_timer(void)
+{
+   ulong64 c1, c2, t1, t2;
+   unsigned long y1;
+
+   c1 = c2 = (ulong64)-1;   /* start both minima at the largest representable value */
+   for (y1 = 0; y1 < TIMES*100; y1++) {
+      t_start();
+      t1 = t_read();
+      (void)t_read();       /* value unused; the call itself is part of the span measured by t2 */
+      t2 = t_read() - t1;
+      /* keep the smallest values seen so far */
+      c1 = (c1 > t1) ? t1 : c1;
+      c2 = (c2 > t2) ? t2 : c2;
+   }
+   skew = c2 - c1;
+   printf("Clock Skew: %lu\n", (unsigned long)skew);
+}
+/* Register each cipher and hash descriptor whose feature macro is defined at build time. */
+void reg_algs(void)
+{
+#ifdef RIJNDAEL
+  register_cipher (&aes_desc);
+#endif
+#ifdef BLOWFISH
+  register_cipher (&blowfish_desc);
+#endif
+#ifdef XTEA
+  register_cipher (&xtea_desc);
+#endif
+#ifdef RC5
+  register_cipher (&rc5_desc);
+#endif
+#ifdef RC6
+  register_cipher (&rc6_desc);
+#endif
+#ifdef SAFERP
+  register_cipher (&saferp_desc);
+#endif
+#ifdef TWOFISH
+  register_cipher (&twofish_desc);
+#endif
+#ifdef SAFER
+  register_cipher (&safer_k64_desc);
+  register_cipher (&safer_sk64_desc);
+  register_cipher (&safer_k128_desc);
+  register_cipher (&safer_sk128_desc);
+#endif
+#ifdef RC2
+  register_cipher (&rc2_desc);
+#endif
+#ifdef DES
+  register_cipher (&des_desc);
+  register_cipher (&des3_desc);
+#endif
+#ifdef CAST5
+  register_cipher (&cast5_desc);
+#endif
+#ifdef NOEKEON
+  register_cipher (&noekeon_desc);
+#endif
+  /* hash descriptors follow the cipher descriptors */
+#ifdef TIGER
+  register_hash (&tiger_desc);
+#endif
+#ifdef MD2
+  register_hash (&md2_desc);
+#endif
+#ifdef MD4
+  register_hash (&md4_desc);
+#endif
+#ifdef MD5
+  register_hash (&md5_desc);
+#endif
+#ifdef SHA1
+  register_hash (&sha1_desc);
+#endif
+#ifdef SHA256
+  register_hash (&sha256_desc);
+#endif
+#ifdef SHA384
+  register_hash (&sha384_desc);
+#endif
+#ifdef SHA512
+  register_hash (&sha512_desc);
+#endif
+
+}
+/* Time every registered cipher's key setup with its minimum key length; prints the fastest of KTIMES runs minus skew. */
+int time_keysched(void)
+{
+  unsigned char key[MAXBLOCKSIZE];
+  symmetric_key skey;
+  int (*setup) (const unsigned char *, int , int , symmetric_key *);
+  unsigned long idx, trial;
+  ulong64 elapsed, best;
+  int keylen;
+
+  printf ("\n\nKey Schedule Time Trials for the Symmetric Ciphers:\n(Times are cycles per key)\n");
+  for (idx = 0; cipher_descriptor[idx].name != NULL; idx++) {
+    setup  = cipher_descriptor[idx].setup;
+    keylen = cipher_descriptor[idx].min_key_length;
+
+    /* begin at the largest value so the first measurement always becomes the minimum */
+    best = (ulong64)-1;
+    trial = 0;
+    while (trial < KTIMES) {
+       /* new key bytes for each run; only the setup call sits between t_start and t_read */
+       rng_get_bytes(key, keylen, NULL);
+       t_start();
+       setup(key, keylen, 0, &skey);
+       elapsed = t_read();
+       if (elapsed < best) { best = elapsed; }
+       trial++;
+    }
+    printf("%-20s: Schedule at %6lu\n", cipher_descriptor[idx].name, (unsigned long)(best - skew));
+  }
+
+  return 0;
+}
+/* Time ECB encrypt and decrypt for every registered cipher; prints the per-call minimum divided by the block length. */
+int time_cipher(void)
+{
+  unsigned long x, y1;
+  ulong64  t1, t2, c1, c2, a1, a2;
+  symmetric_key skey;
+  void    (*func) (const unsigned char *, unsigned char *, symmetric_key *);
+  unsigned char key[MAXBLOCKSIZE] = { 0 }, pt[MAXBLOCKSIZE] = { 0 };
+  /* key and pt are zero-filled so setup and the cipher never read indeterminate bytes */
+
+  printf ("\n\nECB Time Trials for the Symmetric Ciphers:\n");
+  for (x = 0; cipher_descriptor[x].name != NULL; x++) {
+    cipher_descriptor[x].setup (key, cipher_descriptor[x].min_key_length, 0,
+                &skey);
+
+#define DO1   func(pt,pt,&skey);
+#define DO2   DO1 DO1
+    /* t1 covers one call, t2 the next two calls; c1/c2 hold their minima over TIMES runs */
+    func = cipher_descriptor[x].ecb_encrypt;
+    c1 = c2 = (ulong64)-1;
+    for (y1 = 0; y1 < TIMES; y1++) {
+        t_start();
+        DO1;
+        t1 = t_read();
+        DO2;
+        t2 = t_read();
+        t2 -= t1;
+
+        c1 = (t1 > c1 ? c1 : t1);
+        c2 = (t2 > c2 ? c2 : t2);
+    }
+    a1 = c2 - c1 - skew;
+    /* (two calls) - (one call) leaves the cost of a single call */
+
+    func = cipher_descriptor[x].ecb_decrypt;
+    c1 = c2 = (ulong64)-1;
+    for (y1 = 0; y1 < TIMES; y1++) {
+        t_start();
+        DO1;
+        t1 = t_read();
+        DO2;
+        t2 = t_read();
+        t2 -= t1;
+
+        c1 = (t1 > c1 ? c1 : t1);
+        c2 = (t2 > c2 ? c2 : t2);
+    }
+    a2 = c2 - c1 - skew;
+
+    printf
+      ("%-20s: Encrypt at %7.3f, Decrypt at %7.3f\n", cipher_descriptor[x].name, a1/(double)cipher_descriptor[x].block_length, a2/(double)cipher_descriptor[x].block_length);
+
+#undef DO2
+#undef DO1
+   }
+
+   return 0;
+}
+/* Time the process() call of every registered hash on one block; prints the per-call minimum divided by the block size. */
+int time_hash(void)
+{
+  unsigned long x, y1, len;
+  ulong64 t1, t2, c1, c2;
+  hash_state md;
+  void    (*func)(hash_state *, const unsigned char *, unsigned long);
+  unsigned char pt[MAXBLOCKSIZE] = { 0 };
+  /* pt is zero-filled so the hash never reads indeterminate bytes */
+
+  printf ("\n\nHASH Time Trials for:\n");
+  for (x = 0; hash_descriptor[x].name != NULL; x++) {
+    hash_descriptor[x].init(&md);
+
+#define DO1   func(&md,pt,len);
+#define DO2   DO1 DO1
+
+    func = hash_descriptor[x].process;
+    len  = hash_descriptor[x].blocksize;
+    /* t1 covers one call, t2 the next two calls; c1/c2 hold their minima over TIMES runs */
+    c1 = c2 = (ulong64)-1;
+    for (y1 = 0; y1 < TIMES; y1++) {
+       t_start();
+       DO1;
+       t1 = t_read();
+       DO2;
+       t2 = t_read() - t1;
+       c1 = (t1 > c1) ? c1 : t1;
+       c2 = (t2 > c2) ? c2 : t2;
+    }
+    t1 = c2 - c1 - skew;
+    t1 = ((t1 * CONST64(1000))) / ((ulong64)hash_descriptor[x].blocksize);
+    /* t1 is scaled by 1000 before the integer divide so three decimals survive the print below */
+    printf
+      ("%-20s: Process at %9.3f\n", hash_descriptor[x].name, t1 / 1000.0);
+
+#undef DO2
+#undef DO1
+   }
+
+   return 0;
+}
+/* Register the available algorithms, then run the cipher, key-schedule and hash timing trials in that order. */
+int main(void)
+{
+  reg_algs();
+
+  printf("Timings for ciphers and hashes.  Times are listed as cycles per byte processed.\n\n");
+  /* init_timer() is disabled below, so skew keeps its initial value 0 and nothing is subtracted from the timings */
+//  init_timer();
+  time_cipher();
+  time_keysched();
+  time_hash();
+  
+  return EXIT_SUCCESS;
+}  
+
diff --git a/des.c b/des.c
index eb8aa61..6c5ac35 100644
--- a/des.c
+++ b/des.c
@@ -30,12 +30,12 @@ const struct _cipher_descriptor des3_desc =
     &des3_keysize
 };
 
-static const unsigned long bytebit[8] =
+static const ulong32 bytebit[8] =
 {
     0200, 0100, 040, 020, 010, 04, 02, 01 
 };
 
-static const unsigned long bigbyte[24] =
+static const ulong32 bigbyte[24] =
 {
     0x800000UL,  0x400000UL,  0x200000UL,  0x100000UL,
     0x80000UL,   0x40000UL,   0x20000UL,   0x10000UL,
@@ -69,7 +69,7 @@ static const unsigned char pc2[48] = {
 };
 
 
-static const unsigned long SP1[64] =
+static const ulong32 SP1[64] =
 {
     0x01010400UL, 0x00000000UL, 0x00010000UL, 0x01010404UL,
     0x01010004UL, 0x00010404UL, 0x00000004UL, 0x00010000UL,
@@ -89,7 +89,7 @@ static const unsigned long SP1[64] =
     0x00010004UL, 0x00010400UL, 0x00000000UL, 0x01010004UL
 };
 
-static const unsigned long SP2[64] =
+static const ulong32 SP2[64] =
 {
     0x80108020UL, 0x80008000UL, 0x00008000UL, 0x00108020UL,
     0x00100000UL, 0x00000020UL, 0x80100020UL, 0x80008020UL,
@@ -109,7 +109,7 @@ static const unsigned long SP2[64] =
     0x80000000UL, 0x80100020UL, 0x80108020UL, 0x00108000UL
 };
 
-static const unsigned long SP3[64] =
+static const ulong32 SP3[64] =
 {
     0x00000208UL, 0x08020200UL, 0x00000000UL, 0x08020008UL,
     0x08000200UL, 0x00000000UL, 0x00020208UL, 0x08000200UL,
@@ -129,7 +129,7 @@ static const unsigned long SP3[64] =
     0x00020208UL, 0x00000008UL, 0x08020008UL, 0x00020200UL
 };
 
-static const unsigned long SP4[64] =
+static const ulong32 SP4[64] =
 {
     0x00802001UL, 0x00002081UL, 0x00002081UL, 0x00000080UL,
     0x00802080UL, 0x00800081UL, 0x00800001UL, 0x00002001UL,
@@ -149,7 +149,7 @@ static const unsigned long SP4[64] =
     0x00000080UL, 0x00800000UL, 0x00002000UL, 0x00802080UL
 };
 
-static const unsigned long SP5[64] =
+static const ulong32 SP5[64] =
 {
     0x00000100UL, 0x02080100UL, 0x02080000UL, 0x42000100UL,
     0x00080000UL, 0x00000100UL, 0x40000000UL, 0x02080000UL,
@@ -169,7 +169,7 @@ static const unsigned long SP5[64] =
     0x00000000UL, 0x40080000UL, 0x02080100UL, 0x40000100UL
 };
 
-static const unsigned long SP6[64] =
+static const ulong32 SP6[64] =
 {
     0x20000010UL, 0x20400000UL, 0x00004000UL, 0x20404010UL,
     0x20400000UL, 0x00000010UL, 0x20404010UL, 0x00400000UL,
@@ -189,7 +189,7 @@ static const unsigned long SP6[64] =
     0x20404000UL, 0x20000000UL, 0x00400010UL, 0x20004010UL
 };
 
-static const unsigned long SP7[64] =
+static const ulong32 SP7[64] =
 {
     0x00200000UL, 0x04200002UL, 0x04000802UL, 0x00000000UL,
     0x00000800UL, 0x04000802UL, 0x00200802UL, 0x04200800UL,
@@ -209,7 +209,7 @@ static const unsigned long SP7[64] =
     0x04000002UL, 0x04000800UL, 0x00000800UL, 0x00200002UL
 };
 
-static const unsigned long SP8[64] =
+static const ulong32 SP8[64] =
 {
     0x10001040UL, 0x00001000UL, 0x00040000UL, 0x10041040UL,
     0x10000000UL, 0x10001040UL, 0x00000040UL, 0x10000000UL,
@@ -229,20 +229,1070 @@ static const unsigned long SP8[64] =
     0x00001040UL, 0x00040040UL, 0x10000000UL, 0x10041000UL
 };
 
+#ifndef SMALL_CODE
 
-static void cookey(const unsigned long *raw1, unsigned long *keyout);
+static const ulong64 des_ip[8][256] = {
+
+{ CONST64(0x0000000000000000), CONST64(0x0000001000000000), CONST64(0x0000000000000010), CONST64(0x0000001000000010), 
+  CONST64(0x0000100000000000), CONST64(0x0000101000000000), CONST64(0x0000100000000010), CONST64(0x0000101000000010), 
+  CONST64(0x0000000000001000), CONST64(0x0000001000001000), CONST64(0x0000000000001010), CONST64(0x0000001000001010), 
+  CONST64(0x0000100000001000), CONST64(0x0000101000001000), CONST64(0x0000100000001010), CONST64(0x0000101000001010), 
+  CONST64(0x0010000000000000), CONST64(0x0010001000000000), CONST64(0x0010000000000010), CONST64(0x0010001000000010), 
+  CONST64(0x0010100000000000), CONST64(0x0010101000000000), CONST64(0x0010100000000010), CONST64(0x0010101000000010), 
+  CONST64(0x0010000000001000), CONST64(0x0010001000001000), CONST64(0x0010000000001010), CONST64(0x0010001000001010), 
+  CONST64(0x0010100000001000), CONST64(0x0010101000001000), CONST64(0x0010100000001010), CONST64(0x0010101000001010), 
+  CONST64(0x0000000000100000), CONST64(0x0000001000100000), CONST64(0x0000000000100010), CONST64(0x0000001000100010), 
+  CONST64(0x0000100000100000), CONST64(0x0000101000100000), CONST64(0x0000100000100010), CONST64(0x0000101000100010), 
+  CONST64(0x0000000000101000), CONST64(0x0000001000101000), CONST64(0x0000000000101010), CONST64(0x0000001000101010), 
+  CONST64(0x0000100000101000), CONST64(0x0000101000101000), CONST64(0x0000100000101010), CONST64(0x0000101000101010), 
+  CONST64(0x0010000000100000), CONST64(0x0010001000100000), CONST64(0x0010000000100010), CONST64(0x0010001000100010), 
+  CONST64(0x0010100000100000), CONST64(0x0010101000100000), CONST64(0x0010100000100010), CONST64(0x0010101000100010), 
+  CONST64(0x0010000000101000), CONST64(0x0010001000101000), CONST64(0x0010000000101010), CONST64(0x0010001000101010), 
+  CONST64(0x0010100000101000), CONST64(0x0010101000101000), CONST64(0x0010100000101010), CONST64(0x0010101000101010), 
+  CONST64(0x1000000000000000), CONST64(0x1000001000000000), CONST64(0x1000000000000010), CONST64(0x1000001000000010), 
+  CONST64(0x1000100000000000), CONST64(0x1000101000000000), CONST64(0x1000100000000010), CONST64(0x1000101000000010), 
+  CONST64(0x1000000000001000), CONST64(0x1000001000001000), CONST64(0x1000000000001010), CONST64(0x1000001000001010), 
+  CONST64(0x1000100000001000), CONST64(0x1000101000001000), CONST64(0x1000100000001010), CONST64(0x1000101000001010), 
+  CONST64(0x1010000000000000), CONST64(0x1010001000000000), CONST64(0x1010000000000010), CONST64(0x1010001000000010), 
+  CONST64(0x1010100000000000), CONST64(0x1010101000000000), CONST64(0x1010100000000010), CONST64(0x1010101000000010), 
+  CONST64(0x1010000000001000), CONST64(0x1010001000001000), CONST64(0x1010000000001010), CONST64(0x1010001000001010), 
+  CONST64(0x1010100000001000), CONST64(0x1010101000001000), CONST64(0x1010100000001010), CONST64(0x1010101000001010), 
+  CONST64(0x1000000000100000), CONST64(0x1000001000100000), CONST64(0x1000000000100010), CONST64(0x1000001000100010), 
+  CONST64(0x1000100000100000), CONST64(0x1000101000100000), CONST64(0x1000100000100010), CONST64(0x1000101000100010), 
+  CONST64(0x1000000000101000), CONST64(0x1000001000101000), CONST64(0x1000000000101010), CONST64(0x1000001000101010), 
+  CONST64(0x1000100000101000), CONST64(0x1000101000101000), CONST64(0x1000100000101010), CONST64(0x1000101000101010), 
+  CONST64(0x1010000000100000), CONST64(0x1010001000100000), CONST64(0x1010000000100010), CONST64(0x1010001000100010), 
+  CONST64(0x1010100000100000), CONST64(0x1010101000100000), CONST64(0x1010100000100010), CONST64(0x1010101000100010), 
+  CONST64(0x1010000000101000), CONST64(0x1010001000101000), CONST64(0x1010000000101010), CONST64(0x1010001000101010), 
+  CONST64(0x1010100000101000), CONST64(0x1010101000101000), CONST64(0x1010100000101010), CONST64(0x1010101000101010), 
+  CONST64(0x0000000010000000), CONST64(0x0000001010000000), CONST64(0x0000000010000010), CONST64(0x0000001010000010), 
+  CONST64(0x0000100010000000), CONST64(0x0000101010000000), CONST64(0x0000100010000010), CONST64(0x0000101010000010), 
+  CONST64(0x0000000010001000), CONST64(0x0000001010001000), CONST64(0x0000000010001010), CONST64(0x0000001010001010), 
+  CONST64(0x0000100010001000), CONST64(0x0000101010001000), CONST64(0x0000100010001010), CONST64(0x0000101010001010), 
+  CONST64(0x0010000010000000), CONST64(0x0010001010000000), CONST64(0x0010000010000010), CONST64(0x0010001010000010), 
+  CONST64(0x0010100010000000), CONST64(0x0010101010000000), CONST64(0x0010100010000010), CONST64(0x0010101010000010), 
+  CONST64(0x0010000010001000), CONST64(0x0010001010001000), CONST64(0x0010000010001010), CONST64(0x0010001010001010), 
+  CONST64(0x0010100010001000), CONST64(0x0010101010001000), CONST64(0x0010100010001010), CONST64(0x0010101010001010), 
+  CONST64(0x0000000010100000), CONST64(0x0000001010100000), CONST64(0x0000000010100010), CONST64(0x0000001010100010), 
+  CONST64(0x0000100010100000), CONST64(0x0000101010100000), CONST64(0x0000100010100010), CONST64(0x0000101010100010), 
+  CONST64(0x0000000010101000), CONST64(0x0000001010101000), CONST64(0x0000000010101010), CONST64(0x0000001010101010), 
+  CONST64(0x0000100010101000), CONST64(0x0000101010101000), CONST64(0x0000100010101010), CONST64(0x0000101010101010), 
+  CONST64(0x0010000010100000), CONST64(0x0010001010100000), CONST64(0x0010000010100010), CONST64(0x0010001010100010), 
+  CONST64(0x0010100010100000), CONST64(0x0010101010100000), CONST64(0x0010100010100010), CONST64(0x0010101010100010), 
+  CONST64(0x0010000010101000), CONST64(0x0010001010101000), CONST64(0x0010000010101010), CONST64(0x0010001010101010), 
+  CONST64(0x0010100010101000), CONST64(0x0010101010101000), CONST64(0x0010100010101010), CONST64(0x0010101010101010), 
+  CONST64(0x1000000010000000), CONST64(0x1000001010000000), CONST64(0x1000000010000010), CONST64(0x1000001010000010), 
+  CONST64(0x1000100010000000), CONST64(0x1000101010000000), CONST64(0x1000100010000010), CONST64(0x1000101010000010), 
+  CONST64(0x1000000010001000), CONST64(0x1000001010001000), CONST64(0x1000000010001010), CONST64(0x1000001010001010), 
+  CONST64(0x1000100010001000), CONST64(0x1000101010001000), CONST64(0x1000100010001010), CONST64(0x1000101010001010), 
+  CONST64(0x1010000010000000), CONST64(0x1010001010000000), CONST64(0x1010000010000010), CONST64(0x1010001010000010), 
+  CONST64(0x1010100010000000), CONST64(0x1010101010000000), CONST64(0x1010100010000010), CONST64(0x1010101010000010), 
+  CONST64(0x1010000010001000), CONST64(0x1010001010001000), CONST64(0x1010000010001010), CONST64(0x1010001010001010), 
+  CONST64(0x1010100010001000), CONST64(0x1010101010001000), CONST64(0x1010100010001010), CONST64(0x1010101010001010), 
+  CONST64(0x1000000010100000), CONST64(0x1000001010100000), CONST64(0x1000000010100010), CONST64(0x1000001010100010), 
+  CONST64(0x1000100010100000), CONST64(0x1000101010100000), CONST64(0x1000100010100010), CONST64(0x1000101010100010), 
+  CONST64(0x1000000010101000), CONST64(0x1000001010101000), CONST64(0x1000000010101010), CONST64(0x1000001010101010), 
+  CONST64(0x1000100010101000), CONST64(0x1000101010101000), CONST64(0x1000100010101010), CONST64(0x1000101010101010), 
+  CONST64(0x1010000010100000), CONST64(0x1010001010100000), CONST64(0x1010000010100010), CONST64(0x1010001010100010), 
+  CONST64(0x1010100010100000), CONST64(0x1010101010100000), CONST64(0x1010100010100010), CONST64(0x1010101010100010), 
+  CONST64(0x1010000010101000), CONST64(0x1010001010101000), CONST64(0x1010000010101010), CONST64(0x1010001010101010), 
+  CONST64(0x1010100010101000), CONST64(0x1010101010101000), CONST64(0x1010100010101010), CONST64(0x1010101010101010)
+  }, 
+{ CONST64(0x0000000000000000), CONST64(0x0000000800000000), CONST64(0x0000000000000008), CONST64(0x0000000800000008), 
+  CONST64(0x0000080000000000), CONST64(0x0000080800000000), CONST64(0x0000080000000008), CONST64(0x0000080800000008), 
+  CONST64(0x0000000000000800), CONST64(0x0000000800000800), CONST64(0x0000000000000808), CONST64(0x0000000800000808), 
+  CONST64(0x0000080000000800), CONST64(0x0000080800000800), CONST64(0x0000080000000808), CONST64(0x0000080800000808), 
+  CONST64(0x0008000000000000), CONST64(0x0008000800000000), CONST64(0x0008000000000008), CONST64(0x0008000800000008), 
+  CONST64(0x0008080000000000), CONST64(0x0008080800000000), CONST64(0x0008080000000008), CONST64(0x0008080800000008), 
+  CONST64(0x0008000000000800), CONST64(0x0008000800000800), CONST64(0x0008000000000808), CONST64(0x0008000800000808), 
+  CONST64(0x0008080000000800), CONST64(0x0008080800000800), CONST64(0x0008080000000808), CONST64(0x0008080800000808), 
+  CONST64(0x0000000000080000), CONST64(0x0000000800080000), CONST64(0x0000000000080008), CONST64(0x0000000800080008), 
+  CONST64(0x0000080000080000), CONST64(0x0000080800080000), CONST64(0x0000080000080008), CONST64(0x0000080800080008), 
+  CONST64(0x0000000000080800), CONST64(0x0000000800080800), CONST64(0x0000000000080808), CONST64(0x0000000800080808), 
+  CONST64(0x0000080000080800), CONST64(0x0000080800080800), CONST64(0x0000080000080808), CONST64(0x0000080800080808), 
+  CONST64(0x0008000000080000), CONST64(0x0008000800080000), CONST64(0x0008000000080008), CONST64(0x0008000800080008), 
+  CONST64(0x0008080000080000), CONST64(0x0008080800080000), CONST64(0x0008080000080008), CONST64(0x0008080800080008), 
+  CONST64(0x0008000000080800), CONST64(0x0008000800080800), CONST64(0x0008000000080808), CONST64(0x0008000800080808), 
+  CONST64(0x0008080000080800), CONST64(0x0008080800080800), CONST64(0x0008080000080808), CONST64(0x0008080800080808), 
+  CONST64(0x0800000000000000), CONST64(0x0800000800000000), CONST64(0x0800000000000008), CONST64(0x0800000800000008), 
+  CONST64(0x0800080000000000), CONST64(0x0800080800000000), CONST64(0x0800080000000008), CONST64(0x0800080800000008), 
+  CONST64(0x0800000000000800), CONST64(0x0800000800000800), CONST64(0x0800000000000808), CONST64(0x0800000800000808), 
+  CONST64(0x0800080000000800), CONST64(0x0800080800000800), CONST64(0x0800080000000808), CONST64(0x0800080800000808), 
+  CONST64(0x0808000000000000), CONST64(0x0808000800000000), CONST64(0x0808000000000008), CONST64(0x0808000800000008), 
+  CONST64(0x0808080000000000), CONST64(0x0808080800000000), CONST64(0x0808080000000008), CONST64(0x0808080800000008), 
+  CONST64(0x0808000000000800), CONST64(0x0808000800000800), CONST64(0x0808000000000808), CONST64(0x0808000800000808), 
+  CONST64(0x0808080000000800), CONST64(0x0808080800000800), CONST64(0x0808080000000808), CONST64(0x0808080800000808), 
+  CONST64(0x0800000000080000), CONST64(0x0800000800080000), CONST64(0x0800000000080008), CONST64(0x0800000800080008), 
+  CONST64(0x0800080000080000), CONST64(0x0800080800080000), CONST64(0x0800080000080008), CONST64(0x0800080800080008), 
+  CONST64(0x0800000000080800), CONST64(0x0800000800080800), CONST64(0x0800000000080808), CONST64(0x0800000800080808), 
+  CONST64(0x0800080000080800), CONST64(0x0800080800080800), CONST64(0x0800080000080808), CONST64(0x0800080800080808), 
+  CONST64(0x0808000000080000), CONST64(0x0808000800080000), CONST64(0x0808000000080008), CONST64(0x0808000800080008), 
+  CONST64(0x0808080000080000), CONST64(0x0808080800080000), CONST64(0x0808080000080008), CONST64(0x0808080800080008), 
+  CONST64(0x0808000000080800), CONST64(0x0808000800080800), CONST64(0x0808000000080808), CONST64(0x0808000800080808), 
+  CONST64(0x0808080000080800), CONST64(0x0808080800080800), CONST64(0x0808080000080808), CONST64(0x0808080800080808), 
+  CONST64(0x0000000008000000), CONST64(0x0000000808000000), CONST64(0x0000000008000008), CONST64(0x0000000808000008), 
+  CONST64(0x0000080008000000), CONST64(0x0000080808000000), CONST64(0x0000080008000008), CONST64(0x0000080808000008), 
+  CONST64(0x0000000008000800), CONST64(0x0000000808000800), CONST64(0x0000000008000808), CONST64(0x0000000808000808), 
+  CONST64(0x0000080008000800), CONST64(0x0000080808000800), CONST64(0x0000080008000808), CONST64(0x0000080808000808), 
+  CONST64(0x0008000008000000), CONST64(0x0008000808000000), CONST64(0x0008000008000008), CONST64(0x0008000808000008), 
+  CONST64(0x0008080008000000), CONST64(0x0008080808000000), CONST64(0x0008080008000008), CONST64(0x0008080808000008), 
+  CONST64(0x0008000008000800), CONST64(0x0008000808000800), CONST64(0x0008000008000808), CONST64(0x0008000808000808), 
+  CONST64(0x0008080008000800), CONST64(0x0008080808000800), CONST64(0x0008080008000808), CONST64(0x0008080808000808), 
+  CONST64(0x0000000008080000), CONST64(0x0000000808080000), CONST64(0x0000000008080008), CONST64(0x0000000808080008), 
+  CONST64(0x0000080008080000), CONST64(0x0000080808080000), CONST64(0x0000080008080008), CONST64(0x0000080808080008), 
+  CONST64(0x0000000008080800), CONST64(0x0000000808080800), CONST64(0x0000000008080808), CONST64(0x0000000808080808), 
+  CONST64(0x0000080008080800), CONST64(0x0000080808080800), CONST64(0x0000080008080808), CONST64(0x0000080808080808), 
+  CONST64(0x0008000008080000), CONST64(0x0008000808080000), CONST64(0x0008000008080008), CONST64(0x0008000808080008), 
+  CONST64(0x0008080008080000), CONST64(0x0008080808080000), CONST64(0x0008080008080008), CONST64(0x0008080808080008), 
+  CONST64(0x0008000008080800), CONST64(0x0008000808080800), CONST64(0x0008000008080808), CONST64(0x0008000808080808), 
+  CONST64(0x0008080008080800), CONST64(0x0008080808080800), CONST64(0x0008080008080808), CONST64(0x0008080808080808), 
+  CONST64(0x0800000008000000), CONST64(0x0800000808000000), CONST64(0x0800000008000008), CONST64(0x0800000808000008), 
+  CONST64(0x0800080008000000), CONST64(0x0800080808000000), CONST64(0x0800080008000008), CONST64(0x0800080808000008), 
+  CONST64(0x0800000008000800), CONST64(0x0800000808000800), CONST64(0x0800000008000808), CONST64(0x0800000808000808), 
+  CONST64(0x0800080008000800), CONST64(0x0800080808000800), CONST64(0x0800080008000808), CONST64(0x0800080808000808), 
+  CONST64(0x0808000008000000), CONST64(0x0808000808000000), CONST64(0x0808000008000008), CONST64(0x0808000808000008), 
+  CONST64(0x0808080008000000), CONST64(0x0808080808000000), CONST64(0x0808080008000008), CONST64(0x0808080808000008), 
+  CONST64(0x0808000008000800), CONST64(0x0808000808000800), CONST64(0x0808000008000808), CONST64(0x0808000808000808), 
+  CONST64(0x0808080008000800), CONST64(0x0808080808000800), CONST64(0x0808080008000808), CONST64(0x0808080808000808), 
+  CONST64(0x0800000008080000), CONST64(0x0800000808080000), CONST64(0x0800000008080008), CONST64(0x0800000808080008), 
+  CONST64(0x0800080008080000), CONST64(0x0800080808080000), CONST64(0x0800080008080008), CONST64(0x0800080808080008), 
+  CONST64(0x0800000008080800), CONST64(0x0800000808080800), CONST64(0x0800000008080808), CONST64(0x0800000808080808), 
+  CONST64(0x0800080008080800), CONST64(0x0800080808080800), CONST64(0x0800080008080808), CONST64(0x0800080808080808), 
+  CONST64(0x0808000008080000), CONST64(0x0808000808080000), CONST64(0x0808000008080008), CONST64(0x0808000808080008), 
+  CONST64(0x0808080008080000), CONST64(0x0808080808080000), CONST64(0x0808080008080008), CONST64(0x0808080808080008), 
+  CONST64(0x0808000008080800), CONST64(0x0808000808080800), CONST64(0x0808000008080808), CONST64(0x0808000808080808), 
+  CONST64(0x0808080008080800), CONST64(0x0808080808080800), CONST64(0x0808080008080808), CONST64(0x0808080808080808)
+  }, 
+{ CONST64(0x0000000000000000), CONST64(0x0000000400000000), CONST64(0x0000000000000004), CONST64(0x0000000400000004), 
+  CONST64(0x0000040000000000), CONST64(0x0000040400000000), CONST64(0x0000040000000004), CONST64(0x0000040400000004), 
+  CONST64(0x0000000000000400), CONST64(0x0000000400000400), CONST64(0x0000000000000404), CONST64(0x0000000400000404), 
+  CONST64(0x0000040000000400), CONST64(0x0000040400000400), CONST64(0x0000040000000404), CONST64(0x0000040400000404), 
+  CONST64(0x0004000000000000), CONST64(0x0004000400000000), CONST64(0x0004000000000004), CONST64(0x0004000400000004), 
+  CONST64(0x0004040000000000), CONST64(0x0004040400000000), CONST64(0x0004040000000004), CONST64(0x0004040400000004), 
+  CONST64(0x0004000000000400), CONST64(0x0004000400000400), CONST64(0x0004000000000404), CONST64(0x0004000400000404), 
+  CONST64(0x0004040000000400), CONST64(0x0004040400000400), CONST64(0x0004040000000404), CONST64(0x0004040400000404), 
+  CONST64(0x0000000000040000), CONST64(0x0000000400040000), CONST64(0x0000000000040004), CONST64(0x0000000400040004), 
+  CONST64(0x0000040000040000), CONST64(0x0000040400040000), CONST64(0x0000040000040004), CONST64(0x0000040400040004), 
+  CONST64(0x0000000000040400), CONST64(0x0000000400040400), CONST64(0x0000000000040404), CONST64(0x0000000400040404), 
+  CONST64(0x0000040000040400), CONST64(0x0000040400040400), CONST64(0x0000040000040404), CONST64(0x0000040400040404), 
+  CONST64(0x0004000000040000), CONST64(0x0004000400040000), CONST64(0x0004000000040004), CONST64(0x0004000400040004), 
+  CONST64(0x0004040000040000), CONST64(0x0004040400040000), CONST64(0x0004040000040004), CONST64(0x0004040400040004), 
+  CONST64(0x0004000000040400), CONST64(0x0004000400040400), CONST64(0x0004000000040404), CONST64(0x0004000400040404), 
+  CONST64(0x0004040000040400), CONST64(0x0004040400040400), CONST64(0x0004040000040404), CONST64(0x0004040400040404), 
+  CONST64(0x0400000000000000), CONST64(0x0400000400000000), CONST64(0x0400000000000004), CONST64(0x0400000400000004), 
+  CONST64(0x0400040000000000), CONST64(0x0400040400000000), CONST64(0x0400040000000004), CONST64(0x0400040400000004), 
+  CONST64(0x0400000000000400), CONST64(0x0400000400000400), CONST64(0x0400000000000404), CONST64(0x0400000400000404), 
+  CONST64(0x0400040000000400), CONST64(0x0400040400000400), CONST64(0x0400040000000404), CONST64(0x0400040400000404), 
+  CONST64(0x0404000000000000), CONST64(0x0404000400000000), CONST64(0x0404000000000004), CONST64(0x0404000400000004), 
+  CONST64(0x0404040000000000), CONST64(0x0404040400000000), CONST64(0x0404040000000004), CONST64(0x0404040400000004), 
+  CONST64(0x0404000000000400), CONST64(0x0404000400000400), CONST64(0x0404000000000404), CONST64(0x0404000400000404), 
+  CONST64(0x0404040000000400), CONST64(0x0404040400000400), CONST64(0x0404040000000404), CONST64(0x0404040400000404), 
+  CONST64(0x0400000000040000), CONST64(0x0400000400040000), CONST64(0x0400000000040004), CONST64(0x0400000400040004), 
+  CONST64(0x0400040000040000), CONST64(0x0400040400040000), CONST64(0x0400040000040004), CONST64(0x0400040400040004), 
+  CONST64(0x0400000000040400), CONST64(0x0400000400040400), CONST64(0x0400000000040404), CONST64(0x0400000400040404), 
+  CONST64(0x0400040000040400), CONST64(0x0400040400040400), CONST64(0x0400040000040404), CONST64(0x0400040400040404), 
+  CONST64(0x0404000000040000), CONST64(0x0404000400040000), CONST64(0x0404000000040004), CONST64(0x0404000400040004), 
+  CONST64(0x0404040000040000), CONST64(0x0404040400040000), CONST64(0x0404040000040004), CONST64(0x0404040400040004), 
+  CONST64(0x0404000000040400), CONST64(0x0404000400040400), CONST64(0x0404000000040404), CONST64(0x0404000400040404), 
+  CONST64(0x0404040000040400), CONST64(0x0404040400040400), CONST64(0x0404040000040404), CONST64(0x0404040400040404), 
+  CONST64(0x0000000004000000), CONST64(0x0000000404000000), CONST64(0x0000000004000004), CONST64(0x0000000404000004), 
+  CONST64(0x0000040004000000), CONST64(0x0000040404000000), CONST64(0x0000040004000004), CONST64(0x0000040404000004), 
+  CONST64(0x0000000004000400), CONST64(0x0000000404000400), CONST64(0x0000000004000404), CONST64(0x0000000404000404), 
+  CONST64(0x0000040004000400), CONST64(0x0000040404000400), CONST64(0x0000040004000404), CONST64(0x0000040404000404), 
+  CONST64(0x0004000004000000), CONST64(0x0004000404000000), CONST64(0x0004000004000004), CONST64(0x0004000404000004), 
+  CONST64(0x0004040004000000), CONST64(0x0004040404000000), CONST64(0x0004040004000004), CONST64(0x0004040404000004), 
+  CONST64(0x0004000004000400), CONST64(0x0004000404000400), CONST64(0x0004000004000404), CONST64(0x0004000404000404), 
+  CONST64(0x0004040004000400), CONST64(0x0004040404000400), CONST64(0x0004040004000404), CONST64(0x0004040404000404), 
+  CONST64(0x0000000004040000), CONST64(0x0000000404040000), CONST64(0x0000000004040004), CONST64(0x0000000404040004), 
+  CONST64(0x0000040004040000), CONST64(0x0000040404040000), CONST64(0x0000040004040004), CONST64(0x0000040404040004), 
+  CONST64(0x0000000004040400), CONST64(0x0000000404040400), CONST64(0x0000000004040404), CONST64(0x0000000404040404), 
+  CONST64(0x0000040004040400), CONST64(0x0000040404040400), CONST64(0x0000040004040404), CONST64(0x0000040404040404), 
+  CONST64(0x0004000004040000), CONST64(0x0004000404040000), CONST64(0x0004000004040004), CONST64(0x0004000404040004), 
+  CONST64(0x0004040004040000), CONST64(0x0004040404040000), CONST64(0x0004040004040004), CONST64(0x0004040404040004), 
+  CONST64(0x0004000004040400), CONST64(0x0004000404040400), CONST64(0x0004000004040404), CONST64(0x0004000404040404), 
+  CONST64(0x0004040004040400), CONST64(0x0004040404040400), CONST64(0x0004040004040404), CONST64(0x0004040404040404), 
+  CONST64(0x0400000004000000), CONST64(0x0400000404000000), CONST64(0x0400000004000004), CONST64(0x0400000404000004), 
+  CONST64(0x0400040004000000), CONST64(0x0400040404000000), CONST64(0x0400040004000004), CONST64(0x0400040404000004), 
+  CONST64(0x0400000004000400), CONST64(0x0400000404000400), CONST64(0x0400000004000404), CONST64(0x0400000404000404), 
+  CONST64(0x0400040004000400), CONST64(0x0400040404000400), CONST64(0x0400040004000404), CONST64(0x0400040404000404), 
+  CONST64(0x0404000004000000), CONST64(0x0404000404000000), CONST64(0x0404000004000004), CONST64(0x0404000404000004), 
+  CONST64(0x0404040004000000), CONST64(0x0404040404000000), CONST64(0x0404040004000004), CONST64(0x0404040404000004), 
+  CONST64(0x0404000004000400), CONST64(0x0404000404000400), CONST64(0x0404000004000404), CONST64(0x0404000404000404), 
+  CONST64(0x0404040004000400), CONST64(0x0404040404000400), CONST64(0x0404040004000404), CONST64(0x0404040404000404), 
+  CONST64(0x0400000004040000), CONST64(0x0400000404040000), CONST64(0x0400000004040004), CONST64(0x0400000404040004), 
+  CONST64(0x0400040004040000), CONST64(0x0400040404040000), CONST64(0x0400040004040004), CONST64(0x0400040404040004), 
+  CONST64(0x0400000004040400), CONST64(0x0400000404040400), CONST64(0x0400000004040404), CONST64(0x0400000404040404), 
+  CONST64(0x0400040004040400), CONST64(0x0400040404040400), CONST64(0x0400040004040404), CONST64(0x0400040404040404), 
+  CONST64(0x0404000004040000), CONST64(0x0404000404040000), CONST64(0x0404000004040004), CONST64(0x0404000404040004), 
+  CONST64(0x0404040004040000), CONST64(0x0404040404040000), CONST64(0x0404040004040004), CONST64(0x0404040404040004), 
+  CONST64(0x0404000004040400), CONST64(0x0404000404040400), CONST64(0x0404000004040404), CONST64(0x0404000404040404), 
+  CONST64(0x0404040004040400), CONST64(0x0404040404040400), CONST64(0x0404040004040404), CONST64(0x0404040404040404)
+  }, 
+{ CONST64(0x0000000000000000), CONST64(0x0000000200000000), CONST64(0x0000000000000002), CONST64(0x0000000200000002), 
+  CONST64(0x0000020000000000), CONST64(0x0000020200000000), CONST64(0x0000020000000002), CONST64(0x0000020200000002), 
+  CONST64(0x0000000000000200), CONST64(0x0000000200000200), CONST64(0x0000000000000202), CONST64(0x0000000200000202), 
+  CONST64(0x0000020000000200), CONST64(0x0000020200000200), CONST64(0x0000020000000202), CONST64(0x0000020200000202), 
+  CONST64(0x0002000000000000), CONST64(0x0002000200000000), CONST64(0x0002000000000002), CONST64(0x0002000200000002), 
+  CONST64(0x0002020000000000), CONST64(0x0002020200000000), CONST64(0x0002020000000002), CONST64(0x0002020200000002), 
+  CONST64(0x0002000000000200), CONST64(0x0002000200000200), CONST64(0x0002000000000202), CONST64(0x0002000200000202), 
+  CONST64(0x0002020000000200), CONST64(0x0002020200000200), CONST64(0x0002020000000202), CONST64(0x0002020200000202), 
+  CONST64(0x0000000000020000), CONST64(0x0000000200020000), CONST64(0x0000000000020002), CONST64(0x0000000200020002), 
+  CONST64(0x0000020000020000), CONST64(0x0000020200020000), CONST64(0x0000020000020002), CONST64(0x0000020200020002), 
+  CONST64(0x0000000000020200), CONST64(0x0000000200020200), CONST64(0x0000000000020202), CONST64(0x0000000200020202), 
+  CONST64(0x0000020000020200), CONST64(0x0000020200020200), CONST64(0x0000020000020202), CONST64(0x0000020200020202), 
+  CONST64(0x0002000000020000), CONST64(0x0002000200020000), CONST64(0x0002000000020002), CONST64(0x0002000200020002), 
+  CONST64(0x0002020000020000), CONST64(0x0002020200020000), CONST64(0x0002020000020002), CONST64(0x0002020200020002), 
+  CONST64(0x0002000000020200), CONST64(0x0002000200020200), CONST64(0x0002000000020202), CONST64(0x0002000200020202), 
+  CONST64(0x0002020000020200), CONST64(0x0002020200020200), CONST64(0x0002020000020202), CONST64(0x0002020200020202), 
+  CONST64(0x0200000000000000), CONST64(0x0200000200000000), CONST64(0x0200000000000002), CONST64(0x0200000200000002), 
+  CONST64(0x0200020000000000), CONST64(0x0200020200000000), CONST64(0x0200020000000002), CONST64(0x0200020200000002), 
+  CONST64(0x0200000000000200), CONST64(0x0200000200000200), CONST64(0x0200000000000202), CONST64(0x0200000200000202), 
+  CONST64(0x0200020000000200), CONST64(0x0200020200000200), CONST64(0x0200020000000202), CONST64(0x0200020200000202), 
+  CONST64(0x0202000000000000), CONST64(0x0202000200000000), CONST64(0x0202000000000002), CONST64(0x0202000200000002), 
+  CONST64(0x0202020000000000), CONST64(0x0202020200000000), CONST64(0x0202020000000002), CONST64(0x0202020200000002), 
+  CONST64(0x0202000000000200), CONST64(0x0202000200000200), CONST64(0x0202000000000202), CONST64(0x0202000200000202), 
+  CONST64(0x0202020000000200), CONST64(0x0202020200000200), CONST64(0x0202020000000202), CONST64(0x0202020200000202), 
+  CONST64(0x0200000000020000), CONST64(0x0200000200020000), CONST64(0x0200000000020002), CONST64(0x0200000200020002), 
+  CONST64(0x0200020000020000), CONST64(0x0200020200020000), CONST64(0x0200020000020002), CONST64(0x0200020200020002), 
+  CONST64(0x0200000000020200), CONST64(0x0200000200020200), CONST64(0x0200000000020202), CONST64(0x0200000200020202), 
+  CONST64(0x0200020000020200), CONST64(0x0200020200020200), CONST64(0x0200020000020202), CONST64(0x0200020200020202), 
+  CONST64(0x0202000000020000), CONST64(0x0202000200020000), CONST64(0x0202000000020002), CONST64(0x0202000200020002), 
+  CONST64(0x0202020000020000), CONST64(0x0202020200020000), CONST64(0x0202020000020002), CONST64(0x0202020200020002), 
+  CONST64(0x0202000000020200), CONST64(0x0202000200020200), CONST64(0x0202000000020202), CONST64(0x0202000200020202), 
+  CONST64(0x0202020000020200), CONST64(0x0202020200020200), CONST64(0x0202020000020202), CONST64(0x0202020200020202), 
+  CONST64(0x0000000002000000), CONST64(0x0000000202000000), CONST64(0x0000000002000002), CONST64(0x0000000202000002), 
+  CONST64(0x0000020002000000), CONST64(0x0000020202000000), CONST64(0x0000020002000002), CONST64(0x0000020202000002), 
+  CONST64(0x0000000002000200), CONST64(0x0000000202000200), CONST64(0x0000000002000202), CONST64(0x0000000202000202), 
+  CONST64(0x0000020002000200), CONST64(0x0000020202000200), CONST64(0x0000020002000202), CONST64(0x0000020202000202), 
+  CONST64(0x0002000002000000), CONST64(0x0002000202000000), CONST64(0x0002000002000002), CONST64(0x0002000202000002), 
+  CONST64(0x0002020002000000), CONST64(0x0002020202000000), CONST64(0x0002020002000002), CONST64(0x0002020202000002), 
+  CONST64(0x0002000002000200), CONST64(0x0002000202000200), CONST64(0x0002000002000202), CONST64(0x0002000202000202), 
+  CONST64(0x0002020002000200), CONST64(0x0002020202000200), CONST64(0x0002020002000202), CONST64(0x0002020202000202), 
+  CONST64(0x0000000002020000), CONST64(0x0000000202020000), CONST64(0x0000000002020002), CONST64(0x0000000202020002), 
+  CONST64(0x0000020002020000), CONST64(0x0000020202020000), CONST64(0x0000020002020002), CONST64(0x0000020202020002), 
+  CONST64(0x0000000002020200), CONST64(0x0000000202020200), CONST64(0x0000000002020202), CONST64(0x0000000202020202), 
+  CONST64(0x0000020002020200), CONST64(0x0000020202020200), CONST64(0x0000020002020202), CONST64(0x0000020202020202), 
+  CONST64(0x0002000002020000), CONST64(0x0002000202020000), CONST64(0x0002000002020002), CONST64(0x0002000202020002), 
+  CONST64(0x0002020002020000), CONST64(0x0002020202020000), CONST64(0x0002020002020002), CONST64(0x0002020202020002), 
+  CONST64(0x0002000002020200), CONST64(0x0002000202020200), CONST64(0x0002000002020202), CONST64(0x0002000202020202), 
+  CONST64(0x0002020002020200), CONST64(0x0002020202020200), CONST64(0x0002020002020202), CONST64(0x0002020202020202), 
+  CONST64(0x0200000002000000), CONST64(0x0200000202000000), CONST64(0x0200000002000002), CONST64(0x0200000202000002), 
+  CONST64(0x0200020002000000), CONST64(0x0200020202000000), CONST64(0x0200020002000002), CONST64(0x0200020202000002), 
+  CONST64(0x0200000002000200), CONST64(0x0200000202000200), CONST64(0x0200000002000202), CONST64(0x0200000202000202), 
+  CONST64(0x0200020002000200), CONST64(0x0200020202000200), CONST64(0x0200020002000202), CONST64(0x0200020202000202), 
+  CONST64(0x0202000002000000), CONST64(0x0202000202000000), CONST64(0x0202000002000002), CONST64(0x0202000202000002), 
+  CONST64(0x0202020002000000), CONST64(0x0202020202000000), CONST64(0x0202020002000002), CONST64(0x0202020202000002), 
+  CONST64(0x0202000002000200), CONST64(0x0202000202000200), CONST64(0x0202000002000202), CONST64(0x0202000202000202), 
+  CONST64(0x0202020002000200), CONST64(0x0202020202000200), CONST64(0x0202020002000202), CONST64(0x0202020202000202), 
+  CONST64(0x0200000002020000), CONST64(0x0200000202020000), CONST64(0x0200000002020002), CONST64(0x0200000202020002), 
+  CONST64(0x0200020002020000), CONST64(0x0200020202020000), CONST64(0x0200020002020002), CONST64(0x0200020202020002), 
+  CONST64(0x0200000002020200), CONST64(0x0200000202020200), CONST64(0x0200000002020202), CONST64(0x0200000202020202), 
+  CONST64(0x0200020002020200), CONST64(0x0200020202020200), CONST64(0x0200020002020202), CONST64(0x0200020202020202), 
+  CONST64(0x0202000002020000), CONST64(0x0202000202020000), CONST64(0x0202000002020002), CONST64(0x0202000202020002), 
+  CONST64(0x0202020002020000), CONST64(0x0202020202020000), CONST64(0x0202020002020002), CONST64(0x0202020202020002), 
+  CONST64(0x0202000002020200), CONST64(0x0202000202020200), CONST64(0x0202000002020202), CONST64(0x0202000202020202), 
+  CONST64(0x0202020002020200), CONST64(0x0202020202020200), CONST64(0x0202020002020202), CONST64(0x0202020202020202)
+  }, 
+{ CONST64(0x0000000000000000), CONST64(0x0000010000000000), CONST64(0x0000000000000100), CONST64(0x0000010000000100), 
+  CONST64(0x0001000000000000), CONST64(0x0001010000000000), CONST64(0x0001000000000100), CONST64(0x0001010000000100), 
+  CONST64(0x0000000000010000), CONST64(0x0000010000010000), CONST64(0x0000000000010100), CONST64(0x0000010000010100), 
+  CONST64(0x0001000000010000), CONST64(0x0001010000010000), CONST64(0x0001000000010100), CONST64(0x0001010000010100), 
+  CONST64(0x0100000000000000), CONST64(0x0100010000000000), CONST64(0x0100000000000100), CONST64(0x0100010000000100), 
+  CONST64(0x0101000000000000), CONST64(0x0101010000000000), CONST64(0x0101000000000100), CONST64(0x0101010000000100), 
+  CONST64(0x0100000000010000), CONST64(0x0100010000010000), CONST64(0x0100000000010100), CONST64(0x0100010000010100), 
+  CONST64(0x0101000000010000), CONST64(0x0101010000010000), CONST64(0x0101000000010100), CONST64(0x0101010000010100), 
+  CONST64(0x0000000001000000), CONST64(0x0000010001000000), CONST64(0x0000000001000100), CONST64(0x0000010001000100), 
+  CONST64(0x0001000001000000), CONST64(0x0001010001000000), CONST64(0x0001000001000100), CONST64(0x0001010001000100), 
+  CONST64(0x0000000001010000), CONST64(0x0000010001010000), CONST64(0x0000000001010100), CONST64(0x0000010001010100), 
+  CONST64(0x0001000001010000), CONST64(0x0001010001010000), CONST64(0x0001000001010100), CONST64(0x0001010001010100), 
+  CONST64(0x0100000001000000), CONST64(0x0100010001000000), CONST64(0x0100000001000100), CONST64(0x0100010001000100), 
+  CONST64(0x0101000001000000), CONST64(0x0101010001000000), CONST64(0x0101000001000100), CONST64(0x0101010001000100), 
+  CONST64(0x0100000001010000), CONST64(0x0100010001010000), CONST64(0x0100000001010100), CONST64(0x0100010001010100), 
+  CONST64(0x0101000001010000), CONST64(0x0101010001010000), CONST64(0x0101000001010100), CONST64(0x0101010001010100), 
+  CONST64(0x0000000100000000), CONST64(0x0000010100000000), CONST64(0x0000000100000100), CONST64(0x0000010100000100), 
+  CONST64(0x0001000100000000), CONST64(0x0001010100000000), CONST64(0x0001000100000100), CONST64(0x0001010100000100), 
+  CONST64(0x0000000100010000), CONST64(0x0000010100010000), CONST64(0x0000000100010100), CONST64(0x0000010100010100), 
+  CONST64(0x0001000100010000), CONST64(0x0001010100010000), CONST64(0x0001000100010100), CONST64(0x0001010100010100), 
+  CONST64(0x0100000100000000), CONST64(0x0100010100000000), CONST64(0x0100000100000100), CONST64(0x0100010100000100), 
+  CONST64(0x0101000100000000), CONST64(0x0101010100000000), CONST64(0x0101000100000100), CONST64(0x0101010100000100), 
+  CONST64(0x0100000100010000), CONST64(0x0100010100010000), CONST64(0x0100000100010100), CONST64(0x0100010100010100), 
+  CONST64(0x0101000100010000), CONST64(0x0101010100010000), CONST64(0x0101000100010100), CONST64(0x0101010100010100), 
+  CONST64(0x0000000101000000), CONST64(0x0000010101000000), CONST64(0x0000000101000100), CONST64(0x0000010101000100), 
+  CONST64(0x0001000101000000), CONST64(0x0001010101000000), CONST64(0x0001000101000100), CONST64(0x0001010101000100), 
+  CONST64(0x0000000101010000), CONST64(0x0000010101010000), CONST64(0x0000000101010100), CONST64(0x0000010101010100), 
+  CONST64(0x0001000101010000), CONST64(0x0001010101010000), CONST64(0x0001000101010100), CONST64(0x0001010101010100), 
+  CONST64(0x0100000101000000), CONST64(0x0100010101000000), CONST64(0x0100000101000100), CONST64(0x0100010101000100), 
+  CONST64(0x0101000101000000), CONST64(0x0101010101000000), CONST64(0x0101000101000100), CONST64(0x0101010101000100), 
+  CONST64(0x0100000101010000), CONST64(0x0100010101010000), CONST64(0x0100000101010100), CONST64(0x0100010101010100), 
+  CONST64(0x0101000101010000), CONST64(0x0101010101010000), CONST64(0x0101000101010100), CONST64(0x0101010101010100), 
+  CONST64(0x0000000000000001), CONST64(0x0000010000000001), CONST64(0x0000000000000101), CONST64(0x0000010000000101), 
+  CONST64(0x0001000000000001), CONST64(0x0001010000000001), CONST64(0x0001000000000101), CONST64(0x0001010000000101), 
+  CONST64(0x0000000000010001), CONST64(0x0000010000010001), CONST64(0x0000000000010101), CONST64(0x0000010000010101), 
+  CONST64(0x0001000000010001), CONST64(0x0001010000010001), CONST64(0x0001000000010101), CONST64(0x0001010000010101), 
+  CONST64(0x0100000000000001), CONST64(0x0100010000000001), CONST64(0x0100000000000101), CONST64(0x0100010000000101), 
+  CONST64(0x0101000000000001), CONST64(0x0101010000000001), CONST64(0x0101000000000101), CONST64(0x0101010000000101), 
+  CONST64(0x0100000000010001), CONST64(0x0100010000010001), CONST64(0x0100000000010101), CONST64(0x0100010000010101), 
+  CONST64(0x0101000000010001), CONST64(0x0101010000010001), CONST64(0x0101000000010101), CONST64(0x0101010000010101), 
+  CONST64(0x0000000001000001), CONST64(0x0000010001000001), CONST64(0x0000000001000101), CONST64(0x0000010001000101), 
+  CONST64(0x0001000001000001), CONST64(0x0001010001000001), CONST64(0x0001000001000101), CONST64(0x0001010001000101), 
+  CONST64(0x0000000001010001), CONST64(0x0000010001010001), CONST64(0x0000000001010101), CONST64(0x0000010001010101), 
+  CONST64(0x0001000001010001), CONST64(0x0001010001010001), CONST64(0x0001000001010101), CONST64(0x0001010001010101), 
+  CONST64(0x0100000001000001), CONST64(0x0100010001000001), CONST64(0x0100000001000101), CONST64(0x0100010001000101), 
+  CONST64(0x0101000001000001), CONST64(0x0101010001000001), CONST64(0x0101000001000101), CONST64(0x0101010001000101), 
+  CONST64(0x0100000001010001), CONST64(0x0100010001010001), CONST64(0x0100000001010101), CONST64(0x0100010001010101), 
+  CONST64(0x0101000001010001), CONST64(0x0101010001010001), CONST64(0x0101000001010101), CONST64(0x0101010001010101), 
+  CONST64(0x0000000100000001), CONST64(0x0000010100000001), CONST64(0x0000000100000101), CONST64(0x0000010100000101), 
+  CONST64(0x0001000100000001), CONST64(0x0001010100000001), CONST64(0x0001000100000101), CONST64(0x0001010100000101), 
+  CONST64(0x0000000100010001), CONST64(0x0000010100010001), CONST64(0x0000000100010101), CONST64(0x0000010100010101), 
+  CONST64(0x0001000100010001), CONST64(0x0001010100010001), CONST64(0x0001000100010101), CONST64(0x0001010100010101), 
+  CONST64(0x0100000100000001), CONST64(0x0100010100000001), CONST64(0x0100000100000101), CONST64(0x0100010100000101), 
+  CONST64(0x0101000100000001), CONST64(0x0101010100000001), CONST64(0x0101000100000101), CONST64(0x0101010100000101), 
+  CONST64(0x0100000100010001), CONST64(0x0100010100010001), CONST64(0x0100000100010101), CONST64(0x0100010100010101), 
+  CONST64(0x0101000100010001), CONST64(0x0101010100010001), CONST64(0x0101000100010101), CONST64(0x0101010100010101), 
+  CONST64(0x0000000101000001), CONST64(0x0000010101000001), CONST64(0x0000000101000101), CONST64(0x0000010101000101), 
+  CONST64(0x0001000101000001), CONST64(0x0001010101000001), CONST64(0x0001000101000101), CONST64(0x0001010101000101), 
+  CONST64(0x0000000101010001), CONST64(0x0000010101010001), CONST64(0x0000000101010101), CONST64(0x0000010101010101), 
+  CONST64(0x0001000101010001), CONST64(0x0001010101010001), CONST64(0x0001000101010101), CONST64(0x0001010101010101), 
+  CONST64(0x0100000101000001), CONST64(0x0100010101000001), CONST64(0x0100000101000101), CONST64(0x0100010101000101), 
+  CONST64(0x0101000101000001), CONST64(0x0101010101000001), CONST64(0x0101000101000101), CONST64(0x0101010101000101), 
+  CONST64(0x0100000101010001), CONST64(0x0100010101010001), CONST64(0x0100000101010101), CONST64(0x0100010101010101), 
+  CONST64(0x0101000101010001), CONST64(0x0101010101010001), CONST64(0x0101000101010101), CONST64(0x0101010101010101)
+  }, 
+{ CONST64(0x0000000000000000), CONST64(0x0000008000000000), CONST64(0x0000000000000080), CONST64(0x0000008000000080), 
+  CONST64(0x0000800000000000), CONST64(0x0000808000000000), CONST64(0x0000800000000080), CONST64(0x0000808000000080), 
+  CONST64(0x0000000000008000), CONST64(0x0000008000008000), CONST64(0x0000000000008080), CONST64(0x0000008000008080), 
+  CONST64(0x0000800000008000), CONST64(0x0000808000008000), CONST64(0x0000800000008080), CONST64(0x0000808000008080), 
+  CONST64(0x0080000000000000), CONST64(0x0080008000000000), CONST64(0x0080000000000080), CONST64(0x0080008000000080), 
+  CONST64(0x0080800000000000), CONST64(0x0080808000000000), CONST64(0x0080800000000080), CONST64(0x0080808000000080), 
+  CONST64(0x0080000000008000), CONST64(0x0080008000008000), CONST64(0x0080000000008080), CONST64(0x0080008000008080), 
+  CONST64(0x0080800000008000), CONST64(0x0080808000008000), CONST64(0x0080800000008080), CONST64(0x0080808000008080), 
+  CONST64(0x0000000000800000), CONST64(0x0000008000800000), CONST64(0x0000000000800080), CONST64(0x0000008000800080), 
+  CONST64(0x0000800000800000), CONST64(0x0000808000800000), CONST64(0x0000800000800080), CONST64(0x0000808000800080), 
+  CONST64(0x0000000000808000), CONST64(0x0000008000808000), CONST64(0x0000000000808080), CONST64(0x0000008000808080), 
+  CONST64(0x0000800000808000), CONST64(0x0000808000808000), CONST64(0x0000800000808080), CONST64(0x0000808000808080), 
+  CONST64(0x0080000000800000), CONST64(0x0080008000800000), CONST64(0x0080000000800080), CONST64(0x0080008000800080), 
+  CONST64(0x0080800000800000), CONST64(0x0080808000800000), CONST64(0x0080800000800080), CONST64(0x0080808000800080), 
+  CONST64(0x0080000000808000), CONST64(0x0080008000808000), CONST64(0x0080000000808080), CONST64(0x0080008000808080), 
+  CONST64(0x0080800000808000), CONST64(0x0080808000808000), CONST64(0x0080800000808080), CONST64(0x0080808000808080), 
+  CONST64(0x8000000000000000), CONST64(0x8000008000000000), CONST64(0x8000000000000080), CONST64(0x8000008000000080), 
+  CONST64(0x8000800000000000), CONST64(0x8000808000000000), CONST64(0x8000800000000080), CONST64(0x8000808000000080), 
+  CONST64(0x8000000000008000), CONST64(0x8000008000008000), CONST64(0x8000000000008080), CONST64(0x8000008000008080), 
+  CONST64(0x8000800000008000), CONST64(0x8000808000008000), CONST64(0x8000800000008080), CONST64(0x8000808000008080), 
+  CONST64(0x8080000000000000), CONST64(0x8080008000000000), CONST64(0x8080000000000080), CONST64(0x8080008000000080), 
+  CONST64(0x8080800000000000), CONST64(0x8080808000000000), CONST64(0x8080800000000080), CONST64(0x8080808000000080), 
+  CONST64(0x8080000000008000), CONST64(0x8080008000008000), CONST64(0x8080000000008080), CONST64(0x8080008000008080), 
+  CONST64(0x8080800000008000), CONST64(0x8080808000008000), CONST64(0x8080800000008080), CONST64(0x8080808000008080), 
+  CONST64(0x8000000000800000), CONST64(0x8000008000800000), CONST64(0x8000000000800080), CONST64(0x8000008000800080), 
+  CONST64(0x8000800000800000), CONST64(0x8000808000800000), CONST64(0x8000800000800080), CONST64(0x8000808000800080), 
+  CONST64(0x8000000000808000), CONST64(0x8000008000808000), CONST64(0x8000000000808080), CONST64(0x8000008000808080), 
+  CONST64(0x8000800000808000), CONST64(0x8000808000808000), CONST64(0x8000800000808080), CONST64(0x8000808000808080), 
+  CONST64(0x8080000000800000), CONST64(0x8080008000800000), CONST64(0x8080000000800080), CONST64(0x8080008000800080), 
+  CONST64(0x8080800000800000), CONST64(0x8080808000800000), CONST64(0x8080800000800080), CONST64(0x8080808000800080), 
+  CONST64(0x8080000000808000), CONST64(0x8080008000808000), CONST64(0x8080000000808080), CONST64(0x8080008000808080), 
+  CONST64(0x8080800000808000), CONST64(0x8080808000808000), CONST64(0x8080800000808080), CONST64(0x8080808000808080), 
+  CONST64(0x0000000080000000), CONST64(0x0000008080000000), CONST64(0x0000000080000080), CONST64(0x0000008080000080), 
+  CONST64(0x0000800080000000), CONST64(0x0000808080000000), CONST64(0x0000800080000080), CONST64(0x0000808080000080), 
+  CONST64(0x0000000080008000), CONST64(0x0000008080008000), CONST64(0x0000000080008080), CONST64(0x0000008080008080), 
+  CONST64(0x0000800080008000), CONST64(0x0000808080008000), CONST64(0x0000800080008080), CONST64(0x0000808080008080), 
+  CONST64(0x0080000080000000), CONST64(0x0080008080000000), CONST64(0x0080000080000080), CONST64(0x0080008080000080), 
+  CONST64(0x0080800080000000), CONST64(0x0080808080000000), CONST64(0x0080800080000080), CONST64(0x0080808080000080), 
+  CONST64(0x0080000080008000), CONST64(0x0080008080008000), CONST64(0x0080000080008080), CONST64(0x0080008080008080), 
+  CONST64(0x0080800080008000), CONST64(0x0080808080008000), CONST64(0x0080800080008080), CONST64(0x0080808080008080), 
+  CONST64(0x0000000080800000), CONST64(0x0000008080800000), CONST64(0x0000000080800080), CONST64(0x0000008080800080), 
+  CONST64(0x0000800080800000), CONST64(0x0000808080800000), CONST64(0x0000800080800080), CONST64(0x0000808080800080), 
+  CONST64(0x0000000080808000), CONST64(0x0000008080808000), CONST64(0x0000000080808080), CONST64(0x0000008080808080), 
+  CONST64(0x0000800080808000), CONST64(0x0000808080808000), CONST64(0x0000800080808080), CONST64(0x0000808080808080), 
+  CONST64(0x0080000080800000), CONST64(0x0080008080800000), CONST64(0x0080000080800080), CONST64(0x0080008080800080), 
+  CONST64(0x0080800080800000), CONST64(0x0080808080800000), CONST64(0x0080800080800080), CONST64(0x0080808080800080), 
+  CONST64(0x0080000080808000), CONST64(0x0080008080808000), CONST64(0x0080000080808080), CONST64(0x0080008080808080), 
+  CONST64(0x0080800080808000), CONST64(0x0080808080808000), CONST64(0x0080800080808080), CONST64(0x0080808080808080), 
+  CONST64(0x8000000080000000), CONST64(0x8000008080000000), CONST64(0x8000000080000080), CONST64(0x8000008080000080), 
+  CONST64(0x8000800080000000), CONST64(0x8000808080000000), CONST64(0x8000800080000080), CONST64(0x8000808080000080), 
+  CONST64(0x8000000080008000), CONST64(0x8000008080008000), CONST64(0x8000000080008080), CONST64(0x8000008080008080), 
+  CONST64(0x8000800080008000), CONST64(0x8000808080008000), CONST64(0x8000800080008080), CONST64(0x8000808080008080), 
+  CONST64(0x8080000080000000), CONST64(0x8080008080000000), CONST64(0x8080000080000080), CONST64(0x8080008080000080), 
+  CONST64(0x8080800080000000), CONST64(0x8080808080000000), CONST64(0x8080800080000080), CONST64(0x8080808080000080), 
+  CONST64(0x8080000080008000), CONST64(0x8080008080008000), CONST64(0x8080000080008080), CONST64(0x8080008080008080), 
+  CONST64(0x8080800080008000), CONST64(0x8080808080008000), CONST64(0x8080800080008080), CONST64(0x8080808080008080), 
+  CONST64(0x8000000080800000), CONST64(0x8000008080800000), CONST64(0x8000000080800080), CONST64(0x8000008080800080), 
+  CONST64(0x8000800080800000), CONST64(0x8000808080800000), CONST64(0x8000800080800080), CONST64(0x8000808080800080), 
+  CONST64(0x8000000080808000), CONST64(0x8000008080808000), CONST64(0x8000000080808080), CONST64(0x8000008080808080), 
+  CONST64(0x8000800080808000), CONST64(0x8000808080808000), CONST64(0x8000800080808080), CONST64(0x8000808080808080), 
+  CONST64(0x8080000080800000), CONST64(0x8080008080800000), CONST64(0x8080000080800080), CONST64(0x8080008080800080), 
+  CONST64(0x8080800080800000), CONST64(0x8080808080800000), CONST64(0x8080800080800080), CONST64(0x8080808080800080), 
+  CONST64(0x8080000080808000), CONST64(0x8080008080808000), CONST64(0x8080000080808080), CONST64(0x8080008080808080), 
+  CONST64(0x8080800080808000), CONST64(0x8080808080808000), CONST64(0x8080800080808080), CONST64(0x8080808080808080)
+  }, 
+{ CONST64(0x0000000000000000), CONST64(0x0000004000000000), CONST64(0x0000000000000040), CONST64(0x0000004000000040), 
+  CONST64(0x0000400000000000), CONST64(0x0000404000000000), CONST64(0x0000400000000040), CONST64(0x0000404000000040), 
+  CONST64(0x0000000000004000), CONST64(0x0000004000004000), CONST64(0x0000000000004040), CONST64(0x0000004000004040), 
+  CONST64(0x0000400000004000), CONST64(0x0000404000004000), CONST64(0x0000400000004040), CONST64(0x0000404000004040), 
+  CONST64(0x0040000000000000), CONST64(0x0040004000000000), CONST64(0x0040000000000040), CONST64(0x0040004000000040), 
+  CONST64(0x0040400000000000), CONST64(0x0040404000000000), CONST64(0x0040400000000040), CONST64(0x0040404000000040), 
+  CONST64(0x0040000000004000), CONST64(0x0040004000004000), CONST64(0x0040000000004040), CONST64(0x0040004000004040), 
+  CONST64(0x0040400000004000), CONST64(0x0040404000004000), CONST64(0x0040400000004040), CONST64(0x0040404000004040), 
+  CONST64(0x0000000000400000), CONST64(0x0000004000400000), CONST64(0x0000000000400040), CONST64(0x0000004000400040), 
+  CONST64(0x0000400000400000), CONST64(0x0000404000400000), CONST64(0x0000400000400040), CONST64(0x0000404000400040), 
+  CONST64(0x0000000000404000), CONST64(0x0000004000404000), CONST64(0x0000000000404040), CONST64(0x0000004000404040), 
+  CONST64(0x0000400000404000), CONST64(0x0000404000404000), CONST64(0x0000400000404040), CONST64(0x0000404000404040), 
+  CONST64(0x0040000000400000), CONST64(0x0040004000400000), CONST64(0x0040000000400040), CONST64(0x0040004000400040), 
+  CONST64(0x0040400000400000), CONST64(0x0040404000400000), CONST64(0x0040400000400040), CONST64(0x0040404000400040), 
+  CONST64(0x0040000000404000), CONST64(0x0040004000404000), CONST64(0x0040000000404040), CONST64(0x0040004000404040), 
+  CONST64(0x0040400000404000), CONST64(0x0040404000404000), CONST64(0x0040400000404040), CONST64(0x0040404000404040), 
+  CONST64(0x4000000000000000), CONST64(0x4000004000000000), CONST64(0x4000000000000040), CONST64(0x4000004000000040), 
+  CONST64(0x4000400000000000), CONST64(0x4000404000000000), CONST64(0x4000400000000040), CONST64(0x4000404000000040), 
+  CONST64(0x4000000000004000), CONST64(0x4000004000004000), CONST64(0x4000000000004040), CONST64(0x4000004000004040), 
+  CONST64(0x4000400000004000), CONST64(0x4000404000004000), CONST64(0x4000400000004040), CONST64(0x4000404000004040), 
+  CONST64(0x4040000000000000), CONST64(0x4040004000000000), CONST64(0x4040000000000040), CONST64(0x4040004000000040), 
+  CONST64(0x4040400000000000), CONST64(0x4040404000000000), CONST64(0x4040400000000040), CONST64(0x4040404000000040), 
+  CONST64(0x4040000000004000), CONST64(0x4040004000004000), CONST64(0x4040000000004040), CONST64(0x4040004000004040), 
+  CONST64(0x4040400000004000), CONST64(0x4040404000004000), CONST64(0x4040400000004040), CONST64(0x4040404000004040), 
+  CONST64(0x4000000000400000), CONST64(0x4000004000400000), CONST64(0x4000000000400040), CONST64(0x4000004000400040), 
+  CONST64(0x4000400000400000), CONST64(0x4000404000400000), CONST64(0x4000400000400040), CONST64(0x4000404000400040), 
+  CONST64(0x4000000000404000), CONST64(0x4000004000404000), CONST64(0x4000000000404040), CONST64(0x4000004000404040), 
+  CONST64(0x4000400000404000), CONST64(0x4000404000404000), CONST64(0x4000400000404040), CONST64(0x4000404000404040), 
+  CONST64(0x4040000000400000), CONST64(0x4040004000400000), CONST64(0x4040000000400040), CONST64(0x4040004000400040), 
+  CONST64(0x4040400000400000), CONST64(0x4040404000400000), CONST64(0x4040400000400040), CONST64(0x4040404000400040), 
+  CONST64(0x4040000000404000), CONST64(0x4040004000404000), CONST64(0x4040000000404040), CONST64(0x4040004000404040), 
+  CONST64(0x4040400000404000), CONST64(0x4040404000404000), CONST64(0x4040400000404040), CONST64(0x4040404000404040), 
+  CONST64(0x0000000040000000), CONST64(0x0000004040000000), CONST64(0x0000000040000040), CONST64(0x0000004040000040), 
+  CONST64(0x0000400040000000), CONST64(0x0000404040000000), CONST64(0x0000400040000040), CONST64(0x0000404040000040), 
+  CONST64(0x0000000040004000), CONST64(0x0000004040004000), CONST64(0x0000000040004040), CONST64(0x0000004040004040), 
+  CONST64(0x0000400040004000), CONST64(0x0000404040004000), CONST64(0x0000400040004040), CONST64(0x0000404040004040), 
+  CONST64(0x0040000040000000), CONST64(0x0040004040000000), CONST64(0x0040000040000040), CONST64(0x0040004040000040), 
+  CONST64(0x0040400040000000), CONST64(0x0040404040000000), CONST64(0x0040400040000040), CONST64(0x0040404040000040), 
+  CONST64(0x0040000040004000), CONST64(0x0040004040004000), CONST64(0x0040000040004040), CONST64(0x0040004040004040), 
+  CONST64(0x0040400040004000), CONST64(0x0040404040004000), CONST64(0x0040400040004040), CONST64(0x0040404040004040), 
+  CONST64(0x0000000040400000), CONST64(0x0000004040400000), CONST64(0x0000000040400040), CONST64(0x0000004040400040), 
+  CONST64(0x0000400040400000), CONST64(0x0000404040400000), CONST64(0x0000400040400040), CONST64(0x0000404040400040), 
+  CONST64(0x0000000040404000), CONST64(0x0000004040404000), CONST64(0x0000000040404040), CONST64(0x0000004040404040), 
+  CONST64(0x0000400040404000), CONST64(0x0000404040404000), CONST64(0x0000400040404040), CONST64(0x0000404040404040), 
+  CONST64(0x0040000040400000), CONST64(0x0040004040400000), CONST64(0x0040000040400040), CONST64(0x0040004040400040), 
+  CONST64(0x0040400040400000), CONST64(0x0040404040400000), CONST64(0x0040400040400040), CONST64(0x0040404040400040), 
+  CONST64(0x0040000040404000), CONST64(0x0040004040404000), CONST64(0x0040000040404040), CONST64(0x0040004040404040), 
+  CONST64(0x0040400040404000), CONST64(0x0040404040404000), CONST64(0x0040400040404040), CONST64(0x0040404040404040), 
+  CONST64(0x4000000040000000), CONST64(0x4000004040000000), CONST64(0x4000000040000040), CONST64(0x4000004040000040), 
+  CONST64(0x4000400040000000), CONST64(0x4000404040000000), CONST64(0x4000400040000040), CONST64(0x4000404040000040), 
+  CONST64(0x4000000040004000), CONST64(0x4000004040004000), CONST64(0x4000000040004040), CONST64(0x4000004040004040), 
+  CONST64(0x4000400040004000), CONST64(0x4000404040004000), CONST64(0x4000400040004040), CONST64(0x4000404040004040), 
+  CONST64(0x4040000040000000), CONST64(0x4040004040000000), CONST64(0x4040000040000040), CONST64(0x4040004040000040), 
+  CONST64(0x4040400040000000), CONST64(0x4040404040000000), CONST64(0x4040400040000040), CONST64(0x4040404040000040), 
+  CONST64(0x4040000040004000), CONST64(0x4040004040004000), CONST64(0x4040000040004040), CONST64(0x4040004040004040), 
+  CONST64(0x4040400040004000), CONST64(0x4040404040004000), CONST64(0x4040400040004040), CONST64(0x4040404040004040), 
+  CONST64(0x4000000040400000), CONST64(0x4000004040400000), CONST64(0x4000000040400040), CONST64(0x4000004040400040), 
+  CONST64(0x4000400040400000), CONST64(0x4000404040400000), CONST64(0x4000400040400040), CONST64(0x4000404040400040), 
+  CONST64(0x4000000040404000), CONST64(0x4000004040404000), CONST64(0x4000000040404040), CONST64(0x4000004040404040), 
+  CONST64(0x4000400040404000), CONST64(0x4000404040404000), CONST64(0x4000400040404040), CONST64(0x4000404040404040), 
+  CONST64(0x4040000040400000), CONST64(0x4040004040400000), CONST64(0x4040000040400040), CONST64(0x4040004040400040), 
+  CONST64(0x4040400040400000), CONST64(0x4040404040400000), CONST64(0x4040400040400040), CONST64(0x4040404040400040), 
+  CONST64(0x4040000040404000), CONST64(0x4040004040404000), CONST64(0x4040000040404040), CONST64(0x4040004040404040), 
+  CONST64(0x4040400040404000), CONST64(0x4040404040404000), CONST64(0x4040400040404040), CONST64(0x4040404040404040)
+  }, 
+{ CONST64(0x0000000000000000), CONST64(0x0000002000000000), CONST64(0x0000000000000020), CONST64(0x0000002000000020), 
+  CONST64(0x0000200000000000), CONST64(0x0000202000000000), CONST64(0x0000200000000020), CONST64(0x0000202000000020), 
+  CONST64(0x0000000000002000), CONST64(0x0000002000002000), CONST64(0x0000000000002020), CONST64(0x0000002000002020), 
+  CONST64(0x0000200000002000), CONST64(0x0000202000002000), CONST64(0x0000200000002020), CONST64(0x0000202000002020), 
+  CONST64(0x0020000000000000), CONST64(0x0020002000000000), CONST64(0x0020000000000020), CONST64(0x0020002000000020), 
+  CONST64(0x0020200000000000), CONST64(0x0020202000000000), CONST64(0x0020200000000020), CONST64(0x0020202000000020), 
+  CONST64(0x0020000000002000), CONST64(0x0020002000002000), CONST64(0x0020000000002020), CONST64(0x0020002000002020), 
+  CONST64(0x0020200000002000), CONST64(0x0020202000002000), CONST64(0x0020200000002020), CONST64(0x0020202000002020), 
+  CONST64(0x0000000000200000), CONST64(0x0000002000200000), CONST64(0x0000000000200020), CONST64(0x0000002000200020), 
+  CONST64(0x0000200000200000), CONST64(0x0000202000200000), CONST64(0x0000200000200020), CONST64(0x0000202000200020), 
+  CONST64(0x0000000000202000), CONST64(0x0000002000202000), CONST64(0x0000000000202020), CONST64(0x0000002000202020), 
+  CONST64(0x0000200000202000), CONST64(0x0000202000202000), CONST64(0x0000200000202020), CONST64(0x0000202000202020), 
+  CONST64(0x0020000000200000), CONST64(0x0020002000200000), CONST64(0x0020000000200020), CONST64(0x0020002000200020), 
+  CONST64(0x0020200000200000), CONST64(0x0020202000200000), CONST64(0x0020200000200020), CONST64(0x0020202000200020), 
+  CONST64(0x0020000000202000), CONST64(0x0020002000202000), CONST64(0x0020000000202020), CONST64(0x0020002000202020), 
+  CONST64(0x0020200000202000), CONST64(0x0020202000202000), CONST64(0x0020200000202020), CONST64(0x0020202000202020), 
+  CONST64(0x2000000000000000), CONST64(0x2000002000000000), CONST64(0x2000000000000020), CONST64(0x2000002000000020), 
+  CONST64(0x2000200000000000), CONST64(0x2000202000000000), CONST64(0x2000200000000020), CONST64(0x2000202000000020), 
+  CONST64(0x2000000000002000), CONST64(0x2000002000002000), CONST64(0x2000000000002020), CONST64(0x2000002000002020), 
+  CONST64(0x2000200000002000), CONST64(0x2000202000002000), CONST64(0x2000200000002020), CONST64(0x2000202000002020), 
+  CONST64(0x2020000000000000), CONST64(0x2020002000000000), CONST64(0x2020000000000020), CONST64(0x2020002000000020), 
+  CONST64(0x2020200000000000), CONST64(0x2020202000000000), CONST64(0x2020200000000020), CONST64(0x2020202000000020), 
+  CONST64(0x2020000000002000), CONST64(0x2020002000002000), CONST64(0x2020000000002020), CONST64(0x2020002000002020), 
+  CONST64(0x2020200000002000), CONST64(0x2020202000002000), CONST64(0x2020200000002020), CONST64(0x2020202000002020), 
+  CONST64(0x2000000000200000), CONST64(0x2000002000200000), CONST64(0x2000000000200020), CONST64(0x2000002000200020), 
+  CONST64(0x2000200000200000), CONST64(0x2000202000200000), CONST64(0x2000200000200020), CONST64(0x2000202000200020), 
+  CONST64(0x2000000000202000), CONST64(0x2000002000202000), CONST64(0x2000000000202020), CONST64(0x2000002000202020), 
+  CONST64(0x2000200000202000), CONST64(0x2000202000202000), CONST64(0x2000200000202020), CONST64(0x2000202000202020), 
+  CONST64(0x2020000000200000), CONST64(0x2020002000200000), CONST64(0x2020000000200020), CONST64(0x2020002000200020), 
+  CONST64(0x2020200000200000), CONST64(0x2020202000200000), CONST64(0x2020200000200020), CONST64(0x2020202000200020), 
+  CONST64(0x2020000000202000), CONST64(0x2020002000202000), CONST64(0x2020000000202020), CONST64(0x2020002000202020), 
+  CONST64(0x2020200000202000), CONST64(0x2020202000202000), CONST64(0x2020200000202020), CONST64(0x2020202000202020), 
+  CONST64(0x0000000020000000), CONST64(0x0000002020000000), CONST64(0x0000000020000020), CONST64(0x0000002020000020), 
+  CONST64(0x0000200020000000), CONST64(0x0000202020000000), CONST64(0x0000200020000020), CONST64(0x0000202020000020), 
+  CONST64(0x0000000020002000), CONST64(0x0000002020002000), CONST64(0x0000000020002020), CONST64(0x0000002020002020), 
+  CONST64(0x0000200020002000), CONST64(0x0000202020002000), CONST64(0x0000200020002020), CONST64(0x0000202020002020), 
+  CONST64(0x0020000020000000), CONST64(0x0020002020000000), CONST64(0x0020000020000020), CONST64(0x0020002020000020), 
+  CONST64(0x0020200020000000), CONST64(0x0020202020000000), CONST64(0x0020200020000020), CONST64(0x0020202020000020), 
+  CONST64(0x0020000020002000), CONST64(0x0020002020002000), CONST64(0x0020000020002020), CONST64(0x0020002020002020), 
+  CONST64(0x0020200020002000), CONST64(0x0020202020002000), CONST64(0x0020200020002020), CONST64(0x0020202020002020), 
+  CONST64(0x0000000020200000), CONST64(0x0000002020200000), CONST64(0x0000000020200020), CONST64(0x0000002020200020), 
+  CONST64(0x0000200020200000), CONST64(0x0000202020200000), CONST64(0x0000200020200020), CONST64(0x0000202020200020), 
+  CONST64(0x0000000020202000), CONST64(0x0000002020202000), CONST64(0x0000000020202020), CONST64(0x0000002020202020), 
+  CONST64(0x0000200020202000), CONST64(0x0000202020202000), CONST64(0x0000200020202020), CONST64(0x0000202020202020), 
+  CONST64(0x0020000020200000), CONST64(0x0020002020200000), CONST64(0x0020000020200020), CONST64(0x0020002020200020), 
+  CONST64(0x0020200020200000), CONST64(0x0020202020200000), CONST64(0x0020200020200020), CONST64(0x0020202020200020), 
+  CONST64(0x0020000020202000), CONST64(0x0020002020202000), CONST64(0x0020000020202020), CONST64(0x0020002020202020), 
+  CONST64(0x0020200020202000), CONST64(0x0020202020202000), CONST64(0x0020200020202020), CONST64(0x0020202020202020), 
+  CONST64(0x2000000020000000), CONST64(0x2000002020000000), CONST64(0x2000000020000020), CONST64(0x2000002020000020), 
+  CONST64(0x2000200020000000), CONST64(0x2000202020000000), CONST64(0x2000200020000020), CONST64(0x2000202020000020), 
+  CONST64(0x2000000020002000), CONST64(0x2000002020002000), CONST64(0x2000000020002020), CONST64(0x2000002020002020), 
+  CONST64(0x2000200020002000), CONST64(0x2000202020002000), CONST64(0x2000200020002020), CONST64(0x2000202020002020), 
+  CONST64(0x2020000020000000), CONST64(0x2020002020000000), CONST64(0x2020000020000020), CONST64(0x2020002020000020), 
+  CONST64(0x2020200020000000), CONST64(0x2020202020000000), CONST64(0x2020200020000020), CONST64(0x2020202020000020), 
+  CONST64(0x2020000020002000), CONST64(0x2020002020002000), CONST64(0x2020000020002020), CONST64(0x2020002020002020), 
+  CONST64(0x2020200020002000), CONST64(0x2020202020002000), CONST64(0x2020200020002020), CONST64(0x2020202020002020), 
+  CONST64(0x2000000020200000), CONST64(0x2000002020200000), CONST64(0x2000000020200020), CONST64(0x2000002020200020), 
+  CONST64(0x2000200020200000), CONST64(0x2000202020200000), CONST64(0x2000200020200020), CONST64(0x2000202020200020), 
+  CONST64(0x2000000020202000), CONST64(0x2000002020202000), CONST64(0x2000000020202020), CONST64(0x2000002020202020), 
+  CONST64(0x2000200020202000), CONST64(0x2000202020202000), CONST64(0x2000200020202020), CONST64(0x2000202020202020), 
+  CONST64(0x2020000020200000), CONST64(0x2020002020200000), CONST64(0x2020000020200020), CONST64(0x2020002020200020), 
+  CONST64(0x2020200020200000), CONST64(0x2020202020200000), CONST64(0x2020200020200020), CONST64(0x2020202020200020), 
+  CONST64(0x2020000020202000), CONST64(0x2020002020202000), CONST64(0x2020000020202020), CONST64(0x2020002020202020), 
+  CONST64(0x2020200020202000), CONST64(0x2020202020202000), CONST64(0x2020200020202020), CONST64(0x2020202020202020)
+  }};
+  
+static const ulong64 des_fp[8][256] = {
+
+{ CONST64(0x0000000000000000), CONST64(0x0000008000000000), CONST64(0x0000000002000000), CONST64(0x0000008002000000), 
+  CONST64(0x0000000000020000), CONST64(0x0000008000020000), CONST64(0x0000000002020000), CONST64(0x0000008002020000), 
+  CONST64(0x0000000000000200), CONST64(0x0000008000000200), CONST64(0x0000000002000200), CONST64(0x0000008002000200), 
+  CONST64(0x0000000000020200), CONST64(0x0000008000020200), CONST64(0x0000000002020200), CONST64(0x0000008002020200), 
+  CONST64(0x0000000000000002), CONST64(0x0000008000000002), CONST64(0x0000000002000002), CONST64(0x0000008002000002), 
+  CONST64(0x0000000000020002), CONST64(0x0000008000020002), CONST64(0x0000000002020002), CONST64(0x0000008002020002), 
+  CONST64(0x0000000000000202), CONST64(0x0000008000000202), CONST64(0x0000000002000202), CONST64(0x0000008002000202), 
+  CONST64(0x0000000000020202), CONST64(0x0000008000020202), CONST64(0x0000000002020202), CONST64(0x0000008002020202), 
+  CONST64(0x0200000000000000), CONST64(0x0200008000000000), CONST64(0x0200000002000000), CONST64(0x0200008002000000), 
+  CONST64(0x0200000000020000), CONST64(0x0200008000020000), CONST64(0x0200000002020000), CONST64(0x0200008002020000), 
+  CONST64(0x0200000000000200), CONST64(0x0200008000000200), CONST64(0x0200000002000200), CONST64(0x0200008002000200), 
+  CONST64(0x0200000000020200), CONST64(0x0200008000020200), CONST64(0x0200000002020200), CONST64(0x0200008002020200), 
+  CONST64(0x0200000000000002), CONST64(0x0200008000000002), CONST64(0x0200000002000002), CONST64(0x0200008002000002), 
+  CONST64(0x0200000000020002), CONST64(0x0200008000020002), CONST64(0x0200000002020002), CONST64(0x0200008002020002), 
+  CONST64(0x0200000000000202), CONST64(0x0200008000000202), CONST64(0x0200000002000202), CONST64(0x0200008002000202), 
+  CONST64(0x0200000000020202), CONST64(0x0200008000020202), CONST64(0x0200000002020202), CONST64(0x0200008002020202), 
+  CONST64(0x0002000000000000), CONST64(0x0002008000000000), CONST64(0x0002000002000000), CONST64(0x0002008002000000), 
+  CONST64(0x0002000000020000), CONST64(0x0002008000020000), CONST64(0x0002000002020000), CONST64(0x0002008002020000), 
+  CONST64(0x0002000000000200), CONST64(0x0002008000000200), CONST64(0x0002000002000200), CONST64(0x0002008002000200), 
+  CONST64(0x0002000000020200), CONST64(0x0002008000020200), CONST64(0x0002000002020200), CONST64(0x0002008002020200), 
+  CONST64(0x0002000000000002), CONST64(0x0002008000000002), CONST64(0x0002000002000002), CONST64(0x0002008002000002), 
+  CONST64(0x0002000000020002), CONST64(0x0002008000020002), CONST64(0x0002000002020002), CONST64(0x0002008002020002), 
+  CONST64(0x0002000000000202), CONST64(0x0002008000000202), CONST64(0x0002000002000202), CONST64(0x0002008002000202), 
+  CONST64(0x0002000000020202), CONST64(0x0002008000020202), CONST64(0x0002000002020202), CONST64(0x0002008002020202), 
+  CONST64(0x0202000000000000), CONST64(0x0202008000000000), CONST64(0x0202000002000000), CONST64(0x0202008002000000), 
+  CONST64(0x0202000000020000), CONST64(0x0202008000020000), CONST64(0x0202000002020000), CONST64(0x0202008002020000), 
+  CONST64(0x0202000000000200), CONST64(0x0202008000000200), CONST64(0x0202000002000200), CONST64(0x0202008002000200), 
+  CONST64(0x0202000000020200), CONST64(0x0202008000020200), CONST64(0x0202000002020200), CONST64(0x0202008002020200), 
+  CONST64(0x0202000000000002), CONST64(0x0202008000000002), CONST64(0x0202000002000002), CONST64(0x0202008002000002), 
+  CONST64(0x0202000000020002), CONST64(0x0202008000020002), CONST64(0x0202000002020002), CONST64(0x0202008002020002), 
+  CONST64(0x0202000000000202), CONST64(0x0202008000000202), CONST64(0x0202000002000202), CONST64(0x0202008002000202), 
+  CONST64(0x0202000000020202), CONST64(0x0202008000020202), CONST64(0x0202000002020202), CONST64(0x0202008002020202), 
+  CONST64(0x0000020000000000), CONST64(0x0000028000000000), CONST64(0x0000020002000000), CONST64(0x0000028002000000), 
+  CONST64(0x0000020000020000), CONST64(0x0000028000020000), CONST64(0x0000020002020000), CONST64(0x0000028002020000), 
+  CONST64(0x0000020000000200), CONST64(0x0000028000000200), CONST64(0x0000020002000200), CONST64(0x0000028002000200), 
+  CONST64(0x0000020000020200), CONST64(0x0000028000020200), CONST64(0x0000020002020200), CONST64(0x0000028002020200), 
+  CONST64(0x0000020000000002), CONST64(0x0000028000000002), CONST64(0x0000020002000002), CONST64(0x0000028002000002), 
+  CONST64(0x0000020000020002), CONST64(0x0000028000020002), CONST64(0x0000020002020002), CONST64(0x0000028002020002), 
+  CONST64(0x0000020000000202), CONST64(0x0000028000000202), CONST64(0x0000020002000202), CONST64(0x0000028002000202), 
+  CONST64(0x0000020000020202), CONST64(0x0000028000020202), CONST64(0x0000020002020202), CONST64(0x0000028002020202), 
+  CONST64(0x0200020000000000), CONST64(0x0200028000000000), CONST64(0x0200020002000000), CONST64(0x0200028002000000), 
+  CONST64(0x0200020000020000), CONST64(0x0200028000020000), CONST64(0x0200020002020000), CONST64(0x0200028002020000), 
+  CONST64(0x0200020000000200), CONST64(0x0200028000000200), CONST64(0x0200020002000200), CONST64(0x0200028002000200), 
+  CONST64(0x0200020000020200), CONST64(0x0200028000020200), CONST64(0x0200020002020200), CONST64(0x0200028002020200), 
+  CONST64(0x0200020000000002), CONST64(0x0200028000000002), CONST64(0x0200020002000002), CONST64(0x0200028002000002), 
+  CONST64(0x0200020000020002), CONST64(0x0200028000020002), CONST64(0x0200020002020002), CONST64(0x0200028002020002), 
+  CONST64(0x0200020000000202), CONST64(0x0200028000000202), CONST64(0x0200020002000202), CONST64(0x0200028002000202), 
+  CONST64(0x0200020000020202), CONST64(0x0200028000020202), CONST64(0x0200020002020202), CONST64(0x0200028002020202), 
+  CONST64(0x0002020000000000), CONST64(0x0002028000000000), CONST64(0x0002020002000000), CONST64(0x0002028002000000), 
+  CONST64(0x0002020000020000), CONST64(0x0002028000020000), CONST64(0x0002020002020000), CONST64(0x0002028002020000), 
+  CONST64(0x0002020000000200), CONST64(0x0002028000000200), CONST64(0x0002020002000200), CONST64(0x0002028002000200), 
+  CONST64(0x0002020000020200), CONST64(0x0002028000020200), CONST64(0x0002020002020200), CONST64(0x0002028002020200), 
+  CONST64(0x0002020000000002), CONST64(0x0002028000000002), CONST64(0x0002020002000002), CONST64(0x0002028002000002), 
+  CONST64(0x0002020000020002), CONST64(0x0002028000020002), CONST64(0x0002020002020002), CONST64(0x0002028002020002), 
+  CONST64(0x0002020000000202), CONST64(0x0002028000000202), CONST64(0x0002020002000202), CONST64(0x0002028002000202), 
+  CONST64(0x0002020000020202), CONST64(0x0002028000020202), CONST64(0x0002020002020202), CONST64(0x0002028002020202), 
+  CONST64(0x0202020000000000), CONST64(0x0202028000000000), CONST64(0x0202020002000000), CONST64(0x0202028002000000), 
+  CONST64(0x0202020000020000), CONST64(0x0202028000020000), CONST64(0x0202020002020000), CONST64(0x0202028002020000), 
+  CONST64(0x0202020000000200), CONST64(0x0202028000000200), CONST64(0x0202020002000200), CONST64(0x0202028002000200), 
+  CONST64(0x0202020000020200), CONST64(0x0202028000020200), CONST64(0x0202020002020200), CONST64(0x0202028002020200), 
+  CONST64(0x0202020000000002), CONST64(0x0202028000000002), CONST64(0x0202020002000002), CONST64(0x0202028002000002), 
+  CONST64(0x0202020000020002), CONST64(0x0202028000020002), CONST64(0x0202020002020002), CONST64(0x0202028002020002), 
+  CONST64(0x0202020000000202), CONST64(0x0202028000000202), CONST64(0x0202020002000202), CONST64(0x0202028002000202), 
+  CONST64(0x0202020000020202), CONST64(0x0202028000020202), CONST64(0x0202020002020202), CONST64(0x0202028002020202)
+  }, 
+{ CONST64(0x0000000000000000), CONST64(0x0000000200000000), CONST64(0x0000000008000000), CONST64(0x0000000208000000), 
+  CONST64(0x0000000000080000), CONST64(0x0000000200080000), CONST64(0x0000000008080000), CONST64(0x0000000208080000), 
+  CONST64(0x0000000000000800), CONST64(0x0000000200000800), CONST64(0x0000000008000800), CONST64(0x0000000208000800), 
+  CONST64(0x0000000000080800), CONST64(0x0000000200080800), CONST64(0x0000000008080800), CONST64(0x0000000208080800), 
+  CONST64(0x0000000000000008), CONST64(0x0000000200000008), CONST64(0x0000000008000008), CONST64(0x0000000208000008), 
+  CONST64(0x0000000000080008), CONST64(0x0000000200080008), CONST64(0x0000000008080008), CONST64(0x0000000208080008), 
+  CONST64(0x0000000000000808), CONST64(0x0000000200000808), CONST64(0x0000000008000808), CONST64(0x0000000208000808), 
+  CONST64(0x0000000000080808), CONST64(0x0000000200080808), CONST64(0x0000000008080808), CONST64(0x0000000208080808), 
+  CONST64(0x0800000000000000), CONST64(0x0800000200000000), CONST64(0x0800000008000000), CONST64(0x0800000208000000), 
+  CONST64(0x0800000000080000), CONST64(0x0800000200080000), CONST64(0x0800000008080000), CONST64(0x0800000208080000), 
+  CONST64(0x0800000000000800), CONST64(0x0800000200000800), CONST64(0x0800000008000800), CONST64(0x0800000208000800), 
+  CONST64(0x0800000000080800), CONST64(0x0800000200080800), CONST64(0x0800000008080800), CONST64(0x0800000208080800), 
+  CONST64(0x0800000000000008), CONST64(0x0800000200000008), CONST64(0x0800000008000008), CONST64(0x0800000208000008), 
+  CONST64(0x0800000000080008), CONST64(0x0800000200080008), CONST64(0x0800000008080008), CONST64(0x0800000208080008), 
+  CONST64(0x0800000000000808), CONST64(0x0800000200000808), CONST64(0x0800000008000808), CONST64(0x0800000208000808), 
+  CONST64(0x0800000000080808), CONST64(0x0800000200080808), CONST64(0x0800000008080808), CONST64(0x0800000208080808), 
+  CONST64(0x0008000000000000), CONST64(0x0008000200000000), CONST64(0x0008000008000000), CONST64(0x0008000208000000), 
+  CONST64(0x0008000000080000), CONST64(0x0008000200080000), CONST64(0x0008000008080000), CONST64(0x0008000208080000), 
+  CONST64(0x0008000000000800), CONST64(0x0008000200000800), CONST64(0x0008000008000800), CONST64(0x0008000208000800), 
+  CONST64(0x0008000000080800), CONST64(0x0008000200080800), CONST64(0x0008000008080800), CONST64(0x0008000208080800), 
+  CONST64(0x0008000000000008), CONST64(0x0008000200000008), CONST64(0x0008000008000008), CONST64(0x0008000208000008), 
+  CONST64(0x0008000000080008), CONST64(0x0008000200080008), CONST64(0x0008000008080008), CONST64(0x0008000208080008), 
+  CONST64(0x0008000000000808), CONST64(0x0008000200000808), CONST64(0x0008000008000808), CONST64(0x0008000208000808), 
+  CONST64(0x0008000000080808), CONST64(0x0008000200080808), CONST64(0x0008000008080808), CONST64(0x0008000208080808), 
+  CONST64(0x0808000000000000), CONST64(0x0808000200000000), CONST64(0x0808000008000000), CONST64(0x0808000208000000), 
+  CONST64(0x0808000000080000), CONST64(0x0808000200080000), CONST64(0x0808000008080000), CONST64(0x0808000208080000), 
+  CONST64(0x0808000000000800), CONST64(0x0808000200000800), CONST64(0x0808000008000800), CONST64(0x0808000208000800), 
+  CONST64(0x0808000000080800), CONST64(0x0808000200080800), CONST64(0x0808000008080800), CONST64(0x0808000208080800), 
+  CONST64(0x0808000000000008), CONST64(0x0808000200000008), CONST64(0x0808000008000008), CONST64(0x0808000208000008), 
+  CONST64(0x0808000000080008), CONST64(0x0808000200080008), CONST64(0x0808000008080008), CONST64(0x0808000208080008), 
+  CONST64(0x0808000000000808), CONST64(0x0808000200000808), CONST64(0x0808000008000808), CONST64(0x0808000208000808), 
+  CONST64(0x0808000000080808), CONST64(0x0808000200080808), CONST64(0x0808000008080808), CONST64(0x0808000208080808), 
+  CONST64(0x0000080000000000), CONST64(0x0000080200000000), CONST64(0x0000080008000000), CONST64(0x0000080208000000), 
+  CONST64(0x0000080000080000), CONST64(0x0000080200080000), CONST64(0x0000080008080000), CONST64(0x0000080208080000), 
+  CONST64(0x0000080000000800), CONST64(0x0000080200000800), CONST64(0x0000080008000800), CONST64(0x0000080208000800), 
+  CONST64(0x0000080000080800), CONST64(0x0000080200080800), CONST64(0x0000080008080800), CONST64(0x0000080208080800), 
+  CONST64(0x0000080000000008), CONST64(0x0000080200000008), CONST64(0x0000080008000008), CONST64(0x0000080208000008), 
+  CONST64(0x0000080000080008), CONST64(0x0000080200080008), CONST64(0x0000080008080008), CONST64(0x0000080208080008), 
+  CONST64(0x0000080000000808), CONST64(0x0000080200000808), CONST64(0x0000080008000808), CONST64(0x0000080208000808), 
+  CONST64(0x0000080000080808), CONST64(0x0000080200080808), CONST64(0x0000080008080808), CONST64(0x0000080208080808), 
+  CONST64(0x0800080000000000), CONST64(0x0800080200000000), CONST64(0x0800080008000000), CONST64(0x0800080208000000), 
+  CONST64(0x0800080000080000), CONST64(0x0800080200080000), CONST64(0x0800080008080000), CONST64(0x0800080208080000), 
+  CONST64(0x0800080000000800), CONST64(0x0800080200000800), CONST64(0x0800080008000800), CONST64(0x0800080208000800), 
+  CONST64(0x0800080000080800), CONST64(0x0800080200080800), CONST64(0x0800080008080800), CONST64(0x0800080208080800), 
+  CONST64(0x0800080000000008), CONST64(0x0800080200000008), CONST64(0x0800080008000008), CONST64(0x0800080208000008), 
+  CONST64(0x0800080000080008), CONST64(0x0800080200080008), CONST64(0x0800080008080008), CONST64(0x0800080208080008), 
+  CONST64(0x0800080000000808), CONST64(0x0800080200000808), CONST64(0x0800080008000808), CONST64(0x0800080208000808), 
+  CONST64(0x0800080000080808), CONST64(0x0800080200080808), CONST64(0x0800080008080808), CONST64(0x0800080208080808), 
+  CONST64(0x0008080000000000), CONST64(0x0008080200000000), CONST64(0x0008080008000000), CONST64(0x0008080208000000), 
+  CONST64(0x0008080000080000), CONST64(0x0008080200080000), CONST64(0x0008080008080000), CONST64(0x0008080208080000), 
+  CONST64(0x0008080000000800), CONST64(0x0008080200000800), CONST64(0x0008080008000800), CONST64(0x0008080208000800), 
+  CONST64(0x0008080000080800), CONST64(0x0008080200080800), CONST64(0x0008080008080800), CONST64(0x0008080208080800), 
+  CONST64(0x0008080000000008), CONST64(0x0008080200000008), CONST64(0x0008080008000008), CONST64(0x0008080208000008), 
+  CONST64(0x0008080000080008), CONST64(0x0008080200080008), CONST64(0x0008080008080008), CONST64(0x0008080208080008), 
+  CONST64(0x0008080000000808), CONST64(0x0008080200000808), CONST64(0x0008080008000808), CONST64(0x0008080208000808), 
+  CONST64(0x0008080000080808), CONST64(0x0008080200080808), CONST64(0x0008080008080808), CONST64(0x0008080208080808), 
+  CONST64(0x0808080000000000), CONST64(0x0808080200000000), CONST64(0x0808080008000000), CONST64(0x0808080208000000), 
+  CONST64(0x0808080000080000), CONST64(0x0808080200080000), CONST64(0x0808080008080000), CONST64(0x0808080208080000), 
+  CONST64(0x0808080000000800), CONST64(0x0808080200000800), CONST64(0x0808080008000800), CONST64(0x0808080208000800), 
+  CONST64(0x0808080000080800), CONST64(0x0808080200080800), CONST64(0x0808080008080800), CONST64(0x0808080208080800), 
+  CONST64(0x0808080000000008), CONST64(0x0808080200000008), CONST64(0x0808080008000008), CONST64(0x0808080208000008), 
+  CONST64(0x0808080000080008), CONST64(0x0808080200080008), CONST64(0x0808080008080008), CONST64(0x0808080208080008), 
+  CONST64(0x0808080000000808), CONST64(0x0808080200000808), CONST64(0x0808080008000808), CONST64(0x0808080208000808), 
+  CONST64(0x0808080000080808), CONST64(0x0808080200080808), CONST64(0x0808080008080808), CONST64(0x0808080208080808)
+  }, 
+{ CONST64(0x0000000000000000), CONST64(0x0000000800000000), CONST64(0x0000000020000000), CONST64(0x0000000820000000), 
+  CONST64(0x0000000000200000), CONST64(0x0000000800200000), CONST64(0x0000000020200000), CONST64(0x0000000820200000), 
+  CONST64(0x0000000000002000), CONST64(0x0000000800002000), CONST64(0x0000000020002000), CONST64(0x0000000820002000), 
+  CONST64(0x0000000000202000), CONST64(0x0000000800202000), CONST64(0x0000000020202000), CONST64(0x0000000820202000), 
+  CONST64(0x0000000000000020), CONST64(0x0000000800000020), CONST64(0x0000000020000020), CONST64(0x0000000820000020), 
+  CONST64(0x0000000000200020), CONST64(0x0000000800200020), CONST64(0x0000000020200020), CONST64(0x0000000820200020), 
+  CONST64(0x0000000000002020), CONST64(0x0000000800002020), CONST64(0x0000000020002020), CONST64(0x0000000820002020), 
+  CONST64(0x0000000000202020), CONST64(0x0000000800202020), CONST64(0x0000000020202020), CONST64(0x0000000820202020), 
+  CONST64(0x2000000000000000), CONST64(0x2000000800000000), CONST64(0x2000000020000000), CONST64(0x2000000820000000), 
+  CONST64(0x2000000000200000), CONST64(0x2000000800200000), CONST64(0x2000000020200000), CONST64(0x2000000820200000), 
+  CONST64(0x2000000000002000), CONST64(0x2000000800002000), CONST64(0x2000000020002000), CONST64(0x2000000820002000), 
+  CONST64(0x2000000000202000), CONST64(0x2000000800202000), CONST64(0x2000000020202000), CONST64(0x2000000820202000), 
+  CONST64(0x2000000000000020), CONST64(0x2000000800000020), CONST64(0x2000000020000020), CONST64(0x2000000820000020), 
+  CONST64(0x2000000000200020), CONST64(0x2000000800200020), CONST64(0x2000000020200020), CONST64(0x2000000820200020), 
+  CONST64(0x2000000000002020), CONST64(0x2000000800002020), CONST64(0x2000000020002020), CONST64(0x2000000820002020), 
+  CONST64(0x2000000000202020), CONST64(0x2000000800202020), CONST64(0x2000000020202020), CONST64(0x2000000820202020), 
+  CONST64(0x0020000000000000), CONST64(0x0020000800000000), CONST64(0x0020000020000000), CONST64(0x0020000820000000), 
+  CONST64(0x0020000000200000), CONST64(0x0020000800200000), CONST64(0x0020000020200000), CONST64(0x0020000820200000), 
+  CONST64(0x0020000000002000), CONST64(0x0020000800002000), CONST64(0x0020000020002000), CONST64(0x0020000820002000), 
+  CONST64(0x0020000000202000), CONST64(0x0020000800202000), CONST64(0x0020000020202000), CONST64(0x0020000820202000), 
+  CONST64(0x0020000000000020), CONST64(0x0020000800000020), CONST64(0x0020000020000020), CONST64(0x0020000820000020), 
+  CONST64(0x0020000000200020), CONST64(0x0020000800200020), CONST64(0x0020000020200020), CONST64(0x0020000820200020), 
+  CONST64(0x0020000000002020), CONST64(0x0020000800002020), CONST64(0x0020000020002020), CONST64(0x0020000820002020), 
+  CONST64(0x0020000000202020), CONST64(0x0020000800202020), CONST64(0x0020000020202020), CONST64(0x0020000820202020), 
+  CONST64(0x2020000000000000), CONST64(0x2020000800000000), CONST64(0x2020000020000000), CONST64(0x2020000820000000), 
+  CONST64(0x2020000000200000), CONST64(0x2020000800200000), CONST64(0x2020000020200000), CONST64(0x2020000820200000), 
+  CONST64(0x2020000000002000), CONST64(0x2020000800002000), CONST64(0x2020000020002000), CONST64(0x2020000820002000), 
+  CONST64(0x2020000000202000), CONST64(0x2020000800202000), CONST64(0x2020000020202000), CONST64(0x2020000820202000), 
+  CONST64(0x2020000000000020), CONST64(0x2020000800000020), CONST64(0x2020000020000020), CONST64(0x2020000820000020), 
+  CONST64(0x2020000000200020), CONST64(0x2020000800200020), CONST64(0x2020000020200020), CONST64(0x2020000820200020), 
+  CONST64(0x2020000000002020), CONST64(0x2020000800002020), CONST64(0x2020000020002020), CONST64(0x2020000820002020), 
+  CONST64(0x2020000000202020), CONST64(0x2020000800202020), CONST64(0x2020000020202020), CONST64(0x2020000820202020), 
+  CONST64(0x0000200000000000), CONST64(0x0000200800000000), CONST64(0x0000200020000000), CONST64(0x0000200820000000), 
+  CONST64(0x0000200000200000), CONST64(0x0000200800200000), CONST64(0x0000200020200000), CONST64(0x0000200820200000), 
+  CONST64(0x0000200000002000), CONST64(0x0000200800002000), CONST64(0x0000200020002000), CONST64(0x0000200820002000), 
+  CONST64(0x0000200000202000), CONST64(0x0000200800202000), CONST64(0x0000200020202000), CONST64(0x0000200820202000), 
+  CONST64(0x0000200000000020), CONST64(0x0000200800000020), CONST64(0x0000200020000020), CONST64(0x0000200820000020), 
+  CONST64(0x0000200000200020), CONST64(0x0000200800200020), CONST64(0x0000200020200020), CONST64(0x0000200820200020), 
+  CONST64(0x0000200000002020), CONST64(0x0000200800002020), CONST64(0x0000200020002020), CONST64(0x0000200820002020), 
+  CONST64(0x0000200000202020), CONST64(0x0000200800202020), CONST64(0x0000200020202020), CONST64(0x0000200820202020), 
+  CONST64(0x2000200000000000), CONST64(0x2000200800000000), CONST64(0x2000200020000000), CONST64(0x2000200820000000), 
+  CONST64(0x2000200000200000), CONST64(0x2000200800200000), CONST64(0x2000200020200000), CONST64(0x2000200820200000), 
+  CONST64(0x2000200000002000), CONST64(0x2000200800002000), CONST64(0x2000200020002000), CONST64(0x2000200820002000), 
+  CONST64(0x2000200000202000), CONST64(0x2000200800202000), CONST64(0x2000200020202000), CONST64(0x2000200820202000), 
+  CONST64(0x2000200000000020), CONST64(0x2000200800000020), CONST64(0x2000200020000020), CONST64(0x2000200820000020), 
+  CONST64(0x2000200000200020), CONST64(0x2000200800200020), CONST64(0x2000200020200020), CONST64(0x2000200820200020), 
+  CONST64(0x2000200000002020), CONST64(0x2000200800002020), CONST64(0x2000200020002020), CONST64(0x2000200820002020), 
+  CONST64(0x2000200000202020), CONST64(0x2000200800202020), CONST64(0x2000200020202020), CONST64(0x2000200820202020), 
+  CONST64(0x0020200000000000), CONST64(0x0020200800000000), CONST64(0x0020200020000000), CONST64(0x0020200820000000), 
+  CONST64(0x0020200000200000), CONST64(0x0020200800200000), CONST64(0x0020200020200000), CONST64(0x0020200820200000), 
+  CONST64(0x0020200000002000), CONST64(0x0020200800002000), CONST64(0x0020200020002000), CONST64(0x0020200820002000), 
+  CONST64(0x0020200000202000), CONST64(0x0020200800202000), CONST64(0x0020200020202000), CONST64(0x0020200820202000), 
+  CONST64(0x0020200000000020), CONST64(0x0020200800000020), CONST64(0x0020200020000020), CONST64(0x0020200820000020), 
+  CONST64(0x0020200000200020), CONST64(0x0020200800200020), CONST64(0x0020200020200020), CONST64(0x0020200820200020), 
+  CONST64(0x0020200000002020), CONST64(0x0020200800002020), CONST64(0x0020200020002020), CONST64(0x0020200820002020), 
+  CONST64(0x0020200000202020), CONST64(0x0020200800202020), CONST64(0x0020200020202020), CONST64(0x0020200820202020), 
+  CONST64(0x2020200000000000), CONST64(0x2020200800000000), CONST64(0x2020200020000000), CONST64(0x2020200820000000), 
+  CONST64(0x2020200000200000), CONST64(0x2020200800200000), CONST64(0x2020200020200000), CONST64(0x2020200820200000), 
+  CONST64(0x2020200000002000), CONST64(0x2020200800002000), CONST64(0x2020200020002000), CONST64(0x2020200820002000), 
+  CONST64(0x2020200000202000), CONST64(0x2020200800202000), CONST64(0x2020200020202000), CONST64(0x2020200820202000), 
+  CONST64(0x2020200000000020), CONST64(0x2020200800000020), CONST64(0x2020200020000020), CONST64(0x2020200820000020), 
+  CONST64(0x2020200000200020), CONST64(0x2020200800200020), CONST64(0x2020200020200020), CONST64(0x2020200820200020), 
+  CONST64(0x2020200000002020), CONST64(0x2020200800002020), CONST64(0x2020200020002020), CONST64(0x2020200820002020), 
+  CONST64(0x2020200000202020), CONST64(0x2020200800202020), CONST64(0x2020200020202020), CONST64(0x2020200820202020)
+  }, 
+{ CONST64(0x0000000000000000), CONST64(0x0000002000000000), CONST64(0x0000000080000000), CONST64(0x0000002080000000), 
+  CONST64(0x0000000000800000), CONST64(0x0000002000800000), CONST64(0x0000000080800000), CONST64(0x0000002080800000), 
+  CONST64(0x0000000000008000), CONST64(0x0000002000008000), CONST64(0x0000000080008000), CONST64(0x0000002080008000), 
+  CONST64(0x0000000000808000), CONST64(0x0000002000808000), CONST64(0x0000000080808000), CONST64(0x0000002080808000), 
+  CONST64(0x0000000000000080), CONST64(0x0000002000000080), CONST64(0x0000000080000080), CONST64(0x0000002080000080), 
+  CONST64(0x0000000000800080), CONST64(0x0000002000800080), CONST64(0x0000000080800080), CONST64(0x0000002080800080), 
+  CONST64(0x0000000000008080), CONST64(0x0000002000008080), CONST64(0x0000000080008080), CONST64(0x0000002080008080), 
+  CONST64(0x0000000000808080), CONST64(0x0000002000808080), CONST64(0x0000000080808080), CONST64(0x0000002080808080), 
+  CONST64(0x8000000000000000), CONST64(0x8000002000000000), CONST64(0x8000000080000000), CONST64(0x8000002080000000), 
+  CONST64(0x8000000000800000), CONST64(0x8000002000800000), CONST64(0x8000000080800000), CONST64(0x8000002080800000), 
+  CONST64(0x8000000000008000), CONST64(0x8000002000008000), CONST64(0x8000000080008000), CONST64(0x8000002080008000), 
+  CONST64(0x8000000000808000), CONST64(0x8000002000808000), CONST64(0x8000000080808000), CONST64(0x8000002080808000), 
+  CONST64(0x8000000000000080), CONST64(0x8000002000000080), CONST64(0x8000000080000080), CONST64(0x8000002080000080), 
+  CONST64(0x8000000000800080), CONST64(0x8000002000800080), CONST64(0x8000000080800080), CONST64(0x8000002080800080), 
+  CONST64(0x8000000000008080), CONST64(0x8000002000008080), CONST64(0x8000000080008080), CONST64(0x8000002080008080), 
+  CONST64(0x8000000000808080), CONST64(0x8000002000808080), CONST64(0x8000000080808080), CONST64(0x8000002080808080), 
+  CONST64(0x0080000000000000), CONST64(0x0080002000000000), CONST64(0x0080000080000000), CONST64(0x0080002080000000), 
+  CONST64(0x0080000000800000), CONST64(0x0080002000800000), CONST64(0x0080000080800000), CONST64(0x0080002080800000), 
+  CONST64(0x0080000000008000), CONST64(0x0080002000008000), CONST64(0x0080000080008000), CONST64(0x0080002080008000), 
+  CONST64(0x0080000000808000), CONST64(0x0080002000808000), CONST64(0x0080000080808000), CONST64(0x0080002080808000), 
+  CONST64(0x0080000000000080), CONST64(0x0080002000000080), CONST64(0x0080000080000080), CONST64(0x0080002080000080), 
+  CONST64(0x0080000000800080), CONST64(0x0080002000800080), CONST64(0x0080000080800080), CONST64(0x0080002080800080), 
+  CONST64(0x0080000000008080), CONST64(0x0080002000008080), CONST64(0x0080000080008080), CONST64(0x0080002080008080), 
+  CONST64(0x0080000000808080), CONST64(0x0080002000808080), CONST64(0x0080000080808080), CONST64(0x0080002080808080), 
+  CONST64(0x8080000000000000), CONST64(0x8080002000000000), CONST64(0x8080000080000000), CONST64(0x8080002080000000), 
+  CONST64(0x8080000000800000), CONST64(0x8080002000800000), CONST64(0x8080000080800000), CONST64(0x8080002080800000), 
+  CONST64(0x8080000000008000), CONST64(0x8080002000008000), CONST64(0x8080000080008000), CONST64(0x8080002080008000), 
+  CONST64(0x8080000000808000), CONST64(0x8080002000808000), CONST64(0x8080000080808000), CONST64(0x8080002080808000), 
+  CONST64(0x8080000000000080), CONST64(0x8080002000000080), CONST64(0x8080000080000080), CONST64(0x8080002080000080), 
+  CONST64(0x8080000000800080), CONST64(0x8080002000800080), CONST64(0x8080000080800080), CONST64(0x8080002080800080), 
+  CONST64(0x8080000000008080), CONST64(0x8080002000008080), CONST64(0x8080000080008080), CONST64(0x8080002080008080), 
+  CONST64(0x8080000000808080), CONST64(0x8080002000808080), CONST64(0x8080000080808080), CONST64(0x8080002080808080), 
+  CONST64(0x0000800000000000), CONST64(0x0000802000000000), CONST64(0x0000800080000000), CONST64(0x0000802080000000), 
+  CONST64(0x0000800000800000), CONST64(0x0000802000800000), CONST64(0x0000800080800000), CONST64(0x0000802080800000), 
+  CONST64(0x0000800000008000), CONST64(0x0000802000008000), CONST64(0x0000800080008000), CONST64(0x0000802080008000), 
+  CONST64(0x0000800000808000), CONST64(0x0000802000808000), CONST64(0x0000800080808000), CONST64(0x0000802080808000), 
+  CONST64(0x0000800000000080), CONST64(0x0000802000000080), CONST64(0x0000800080000080), CONST64(0x0000802080000080), 
+  CONST64(0x0000800000800080), CONST64(0x0000802000800080), CONST64(0x0000800080800080), CONST64(0x0000802080800080), 
+  CONST64(0x0000800000008080), CONST64(0x0000802000008080), CONST64(0x0000800080008080), CONST64(0x0000802080008080), 
+  CONST64(0x0000800000808080), CONST64(0x0000802000808080), CONST64(0x0000800080808080), CONST64(0x0000802080808080), 
+  CONST64(0x8000800000000000), CONST64(0x8000802000000000), CONST64(0x8000800080000000), CONST64(0x8000802080000000), 
+  CONST64(0x8000800000800000), CONST64(0x8000802000800000), CONST64(0x8000800080800000), CONST64(0x8000802080800000), 
+  CONST64(0x8000800000008000), CONST64(0x8000802000008000), CONST64(0x8000800080008000), CONST64(0x8000802080008000), 
+  CONST64(0x8000800000808000), CONST64(0x8000802000808000), CONST64(0x8000800080808000), CONST64(0x8000802080808000), 
+  CONST64(0x8000800000000080), CONST64(0x8000802000000080), CONST64(0x8000800080000080), CONST64(0x8000802080000080), 
+  CONST64(0x8000800000800080), CONST64(0x8000802000800080), CONST64(0x8000800080800080), CONST64(0x8000802080800080), 
+  CONST64(0x8000800000008080), CONST64(0x8000802000008080), CONST64(0x8000800080008080), CONST64(0x8000802080008080), 
+  CONST64(0x8000800000808080), CONST64(0x8000802000808080), CONST64(0x8000800080808080), CONST64(0x8000802080808080), 
+  CONST64(0x0080800000000000), CONST64(0x0080802000000000), CONST64(0x0080800080000000), CONST64(0x0080802080000000), 
+  CONST64(0x0080800000800000), CONST64(0x0080802000800000), CONST64(0x0080800080800000), CONST64(0x0080802080800000), 
+  CONST64(0x0080800000008000), CONST64(0x0080802000008000), CONST64(0x0080800080008000), CONST64(0x0080802080008000), 
+  CONST64(0x0080800000808000), CONST64(0x0080802000808000), CONST64(0x0080800080808000), CONST64(0x0080802080808000), 
+  CONST64(0x0080800000000080), CONST64(0x0080802000000080), CONST64(0x0080800080000080), CONST64(0x0080802080000080), 
+  CONST64(0x0080800000800080), CONST64(0x0080802000800080), CONST64(0x0080800080800080), CONST64(0x0080802080800080), 
+  CONST64(0x0080800000008080), CONST64(0x0080802000008080), CONST64(0x0080800080008080), CONST64(0x0080802080008080), 
+  CONST64(0x0080800000808080), CONST64(0x0080802000808080), CONST64(0x0080800080808080), CONST64(0x0080802080808080), 
+  CONST64(0x8080800000000000), CONST64(0x8080802000000000), CONST64(0x8080800080000000), CONST64(0x8080802080000000), 
+  CONST64(0x8080800000800000), CONST64(0x8080802000800000), CONST64(0x8080800080800000), CONST64(0x8080802080800000), 
+  CONST64(0x8080800000008000), CONST64(0x8080802000008000), CONST64(0x8080800080008000), CONST64(0x8080802080008000), 
+  CONST64(0x8080800000808000), CONST64(0x8080802000808000), CONST64(0x8080800080808000), CONST64(0x8080802080808000), 
+  CONST64(0x8080800000000080), CONST64(0x8080802000000080), CONST64(0x8080800080000080), CONST64(0x8080802080000080), 
+  CONST64(0x8080800000800080), CONST64(0x8080802000800080), CONST64(0x8080800080800080), CONST64(0x8080802080800080), 
+  CONST64(0x8080800000008080), CONST64(0x8080802000008080), CONST64(0x8080800080008080), CONST64(0x8080802080008080), 
+  CONST64(0x8080800000808080), CONST64(0x8080802000808080), CONST64(0x8080800080808080), CONST64(0x8080802080808080)
+  }, 
+{ CONST64(0x0000000000000000), CONST64(0x0000004000000000), CONST64(0x0000000001000000), CONST64(0x0000004001000000), 
+  CONST64(0x0000000000010000), CONST64(0x0000004000010000), CONST64(0x0000000001010000), CONST64(0x0000004001010000), 
+  CONST64(0x0000000000000100), CONST64(0x0000004000000100), CONST64(0x0000000001000100), CONST64(0x0000004001000100), 
+  CONST64(0x0000000000010100), CONST64(0x0000004000010100), CONST64(0x0000000001010100), CONST64(0x0000004001010100), 
+  CONST64(0x0000000000000001), CONST64(0x0000004000000001), CONST64(0x0000000001000001), CONST64(0x0000004001000001), 
+  CONST64(0x0000000000010001), CONST64(0x0000004000010001), CONST64(0x0000000001010001), CONST64(0x0000004001010001), 
+  CONST64(0x0000000000000101), CONST64(0x0000004000000101), CONST64(0x0000000001000101), CONST64(0x0000004001000101), 
+  CONST64(0x0000000000010101), CONST64(0x0000004000010101), CONST64(0x0000000001010101), CONST64(0x0000004001010101), 
+  CONST64(0x0100000000000000), CONST64(0x0100004000000000), CONST64(0x0100000001000000), CONST64(0x0100004001000000), 
+  CONST64(0x0100000000010000), CONST64(0x0100004000010000), CONST64(0x0100000001010000), CONST64(0x0100004001010000), 
+  CONST64(0x0100000000000100), CONST64(0x0100004000000100), CONST64(0x0100000001000100), CONST64(0x0100004001000100), 
+  CONST64(0x0100000000010100), CONST64(0x0100004000010100), CONST64(0x0100000001010100), CONST64(0x0100004001010100), 
+  CONST64(0x0100000000000001), CONST64(0x0100004000000001), CONST64(0x0100000001000001), CONST64(0x0100004001000001), 
+  CONST64(0x0100000000010001), CONST64(0x0100004000010001), CONST64(0x0100000001010001), CONST64(0x0100004001010001), 
+  CONST64(0x0100000000000101), CONST64(0x0100004000000101), CONST64(0x0100000001000101), CONST64(0x0100004001000101), 
+  CONST64(0x0100000000010101), CONST64(0x0100004000010101), CONST64(0x0100000001010101), CONST64(0x0100004001010101), 
+  CONST64(0x0001000000000000), CONST64(0x0001004000000000), CONST64(0x0001000001000000), CONST64(0x0001004001000000), 
+  CONST64(0x0001000000010000), CONST64(0x0001004000010000), CONST64(0x0001000001010000), CONST64(0x0001004001010000), 
+  CONST64(0x0001000000000100), CONST64(0x0001004000000100), CONST64(0x0001000001000100), CONST64(0x0001004001000100), 
+  CONST64(0x0001000000010100), CONST64(0x0001004000010100), CONST64(0x0001000001010100), CONST64(0x0001004001010100), 
+  CONST64(0x0001000000000001), CONST64(0x0001004000000001), CONST64(0x0001000001000001), CONST64(0x0001004001000001), 
+  CONST64(0x0001000000010001), CONST64(0x0001004000010001), CONST64(0x0001000001010001), CONST64(0x0001004001010001), 
+  CONST64(0x0001000000000101), CONST64(0x0001004000000101), CONST64(0x0001000001000101), CONST64(0x0001004001000101), 
+  CONST64(0x0001000000010101), CONST64(0x0001004000010101), CONST64(0x0001000001010101), CONST64(0x0001004001010101), 
+  CONST64(0x0101000000000000), CONST64(0x0101004000000000), CONST64(0x0101000001000000), CONST64(0x0101004001000000), 
+  CONST64(0x0101000000010000), CONST64(0x0101004000010000), CONST64(0x0101000001010000), CONST64(0x0101004001010000), 
+  CONST64(0x0101000000000100), CONST64(0x0101004000000100), CONST64(0x0101000001000100), CONST64(0x0101004001000100), 
+  CONST64(0x0101000000010100), CONST64(0x0101004000010100), CONST64(0x0101000001010100), CONST64(0x0101004001010100), 
+  CONST64(0x0101000000000001), CONST64(0x0101004000000001), CONST64(0x0101000001000001), CONST64(0x0101004001000001), 
+  CONST64(0x0101000000010001), CONST64(0x0101004000010001), CONST64(0x0101000001010001), CONST64(0x0101004001010001), 
+  CONST64(0x0101000000000101), CONST64(0x0101004000000101), CONST64(0x0101000001000101), CONST64(0x0101004001000101), 
+  CONST64(0x0101000000010101), CONST64(0x0101004000010101), CONST64(0x0101000001010101), CONST64(0x0101004001010101), 
+  CONST64(0x0000010000000000), CONST64(0x0000014000000000), CONST64(0x0000010001000000), CONST64(0x0000014001000000), 
+  CONST64(0x0000010000010000), CONST64(0x0000014000010000), CONST64(0x0000010001010000), CONST64(0x0000014001010000), 
+  CONST64(0x0000010000000100), CONST64(0x0000014000000100), CONST64(0x0000010001000100), CONST64(0x0000014001000100), 
+  CONST64(0x0000010000010100), CONST64(0x0000014000010100), CONST64(0x0000010001010100), CONST64(0x0000014001010100), 
+  CONST64(0x0000010000000001), CONST64(0x0000014000000001), CONST64(0x0000010001000001), CONST64(0x0000014001000001), 
+  CONST64(0x0000010000010001), CONST64(0x0000014000010001), CONST64(0x0000010001010001), CONST64(0x0000014001010001), 
+  CONST64(0x0000010000000101), CONST64(0x0000014000000101), CONST64(0x0000010001000101), CONST64(0x0000014001000101), 
+  CONST64(0x0000010000010101), CONST64(0x0000014000010101), CONST64(0x0000010001010101), CONST64(0x0000014001010101), 
+  CONST64(0x0100010000000000), CONST64(0x0100014000000000), CONST64(0x0100010001000000), CONST64(0x0100014001000000), 
+  CONST64(0x0100010000010000), CONST64(0x0100014000010000), CONST64(0x0100010001010000), CONST64(0x0100014001010000), 
+  CONST64(0x0100010000000100), CONST64(0x0100014000000100), CONST64(0x0100010001000100), CONST64(0x0100014001000100), 
+  CONST64(0x0100010000010100), CONST64(0x0100014000010100), CONST64(0x0100010001010100), CONST64(0x0100014001010100), 
+  CONST64(0x0100010000000001), CONST64(0x0100014000000001), CONST64(0x0100010001000001), CONST64(0x0100014001000001), 
+  CONST64(0x0100010000010001), CONST64(0x0100014000010001), CONST64(0x0100010001010001), CONST64(0x0100014001010001), 
+  CONST64(0x0100010000000101), CONST64(0x0100014000000101), CONST64(0x0100010001000101), CONST64(0x0100014001000101), 
+  CONST64(0x0100010000010101), CONST64(0x0100014000010101), CONST64(0x0100010001010101), CONST64(0x0100014001010101), 
+  CONST64(0x0001010000000000), CONST64(0x0001014000000000), CONST64(0x0001010001000000), CONST64(0x0001014001000000), 
+  CONST64(0x0001010000010000), CONST64(0x0001014000010000), CONST64(0x0001010001010000), CONST64(0x0001014001010000), 
+  CONST64(0x0001010000000100), CONST64(0x0001014000000100), CONST64(0x0001010001000100), CONST64(0x0001014001000100), 
+  CONST64(0x0001010000010100), CONST64(0x0001014000010100), CONST64(0x0001010001010100), CONST64(0x0001014001010100), 
+  CONST64(0x0001010000000001), CONST64(0x0001014000000001), CONST64(0x0001010001000001), CONST64(0x0001014001000001), 
+  CONST64(0x0001010000010001), CONST64(0x0001014000010001), CONST64(0x0001010001010001), CONST64(0x0001014001010001), 
+  CONST64(0x0001010000000101), CONST64(0x0001014000000101), CONST64(0x0001010001000101), CONST64(0x0001014001000101), 
+  CONST64(0x0001010000010101), CONST64(0x0001014000010101), CONST64(0x0001010001010101), CONST64(0x0001014001010101), 
+  CONST64(0x0101010000000000), CONST64(0x0101014000000000), CONST64(0x0101010001000000), CONST64(0x0101014001000000), 
+  CONST64(0x0101010000010000), CONST64(0x0101014000010000), CONST64(0x0101010001010000), CONST64(0x0101014001010000), 
+  CONST64(0x0101010000000100), CONST64(0x0101014000000100), CONST64(0x0101010001000100), CONST64(0x0101014001000100), 
+  CONST64(0x0101010000010100), CONST64(0x0101014000010100), CONST64(0x0101010001010100), CONST64(0x0101014001010100), 
+  CONST64(0x0101010000000001), CONST64(0x0101014000000001), CONST64(0x0101010001000001), CONST64(0x0101014001000001), 
+  CONST64(0x0101010000010001), CONST64(0x0101014000010001), CONST64(0x0101010001010001), CONST64(0x0101014001010001), 
+  CONST64(0x0101010000000101), CONST64(0x0101014000000101), CONST64(0x0101010001000101), CONST64(0x0101014001000101), 
+  CONST64(0x0101010000010101), CONST64(0x0101014000010101), CONST64(0x0101010001010101), CONST64(0x0101014001010101)
+  }, 
+{ CONST64(0x0000000000000000), CONST64(0x0000000100000000), CONST64(0x0000000004000000), CONST64(0x0000000104000000), 
+  CONST64(0x0000000000040000), CONST64(0x0000000100040000), CONST64(0x0000000004040000), CONST64(0x0000000104040000), 
+  CONST64(0x0000000000000400), CONST64(0x0000000100000400), CONST64(0x0000000004000400), CONST64(0x0000000104000400), 
+  CONST64(0x0000000000040400), CONST64(0x0000000100040400), CONST64(0x0000000004040400), CONST64(0x0000000104040400), 
+  CONST64(0x0000000000000004), CONST64(0x0000000100000004), CONST64(0x0000000004000004), CONST64(0x0000000104000004), 
+  CONST64(0x0000000000040004), CONST64(0x0000000100040004), CONST64(0x0000000004040004), CONST64(0x0000000104040004), 
+  CONST64(0x0000000000000404), CONST64(0x0000000100000404), CONST64(0x0000000004000404), CONST64(0x0000000104000404), 
+  CONST64(0x0000000000040404), CONST64(0x0000000100040404), CONST64(0x0000000004040404), CONST64(0x0000000104040404), 
+  CONST64(0x0400000000000000), CONST64(0x0400000100000000), CONST64(0x0400000004000000), CONST64(0x0400000104000000), 
+  CONST64(0x0400000000040000), CONST64(0x0400000100040000), CONST64(0x0400000004040000), CONST64(0x0400000104040000), 
+  CONST64(0x0400000000000400), CONST64(0x0400000100000400), CONST64(0x0400000004000400), CONST64(0x0400000104000400), 
+  CONST64(0x0400000000040400), CONST64(0x0400000100040400), CONST64(0x0400000004040400), CONST64(0x0400000104040400), 
+  CONST64(0x0400000000000004), CONST64(0x0400000100000004), CONST64(0x0400000004000004), CONST64(0x0400000104000004), 
+  CONST64(0x0400000000040004), CONST64(0x0400000100040004), CONST64(0x0400000004040004), CONST64(0x0400000104040004), 
+  CONST64(0x0400000000000404), CONST64(0x0400000100000404), CONST64(0x0400000004000404), CONST64(0x0400000104000404), 
+  CONST64(0x0400000000040404), CONST64(0x0400000100040404), CONST64(0x0400000004040404), CONST64(0x0400000104040404), 
+  CONST64(0x0004000000000000), CONST64(0x0004000100000000), CONST64(0x0004000004000000), CONST64(0x0004000104000000), 
+  CONST64(0x0004000000040000), CONST64(0x0004000100040000), CONST64(0x0004000004040000), CONST64(0x0004000104040000), 
+  CONST64(0x0004000000000400), CONST64(0x0004000100000400), CONST64(0x0004000004000400), CONST64(0x0004000104000400), 
+  CONST64(0x0004000000040400), CONST64(0x0004000100040400), CONST64(0x0004000004040400), CONST64(0x0004000104040400), 
+  CONST64(0x0004000000000004), CONST64(0x0004000100000004), CONST64(0x0004000004000004), CONST64(0x0004000104000004), 
+  CONST64(0x0004000000040004), CONST64(0x0004000100040004), CONST64(0x0004000004040004), CONST64(0x0004000104040004), 
+  CONST64(0x0004000000000404), CONST64(0x0004000100000404), CONST64(0x0004000004000404), CONST64(0x0004000104000404), 
+  CONST64(0x0004000000040404), CONST64(0x0004000100040404), CONST64(0x0004000004040404), CONST64(0x0004000104040404), 
+  CONST64(0x0404000000000000), CONST64(0x0404000100000000), CONST64(0x0404000004000000), CONST64(0x0404000104000000), 
+  CONST64(0x0404000000040000), CONST64(0x0404000100040000), CONST64(0x0404000004040000), CONST64(0x0404000104040000), 
+  CONST64(0x0404000000000400), CONST64(0x0404000100000400), CONST64(0x0404000004000400), CONST64(0x0404000104000400), 
+  CONST64(0x0404000000040400), CONST64(0x0404000100040400), CONST64(0x0404000004040400), CONST64(0x0404000104040400), 
+  CONST64(0x0404000000000004), CONST64(0x0404000100000004), CONST64(0x0404000004000004), CONST64(0x0404000104000004), 
+  CONST64(0x0404000000040004), CONST64(0x0404000100040004), CONST64(0x0404000004040004), CONST64(0x0404000104040004), 
+  CONST64(0x0404000000000404), CONST64(0x0404000100000404), CONST64(0x0404000004000404), CONST64(0x0404000104000404), 
+  CONST64(0x0404000000040404), CONST64(0x0404000100040404), CONST64(0x0404000004040404), CONST64(0x0404000104040404), 
+  CONST64(0x0000040000000000), CONST64(0x0000040100000000), CONST64(0x0000040004000000), CONST64(0x0000040104000000), 
+  CONST64(0x0000040000040000), CONST64(0x0000040100040000), CONST64(0x0000040004040000), CONST64(0x0000040104040000), 
+  CONST64(0x0000040000000400), CONST64(0x0000040100000400), CONST64(0x0000040004000400), CONST64(0x0000040104000400), 
+  CONST64(0x0000040000040400), CONST64(0x0000040100040400), CONST64(0x0000040004040400), CONST64(0x0000040104040400), 
+  CONST64(0x0000040000000004), CONST64(0x0000040100000004), CONST64(0x0000040004000004), CONST64(0x0000040104000004), 
+  CONST64(0x0000040000040004), CONST64(0x0000040100040004), CONST64(0x0000040004040004), CONST64(0x0000040104040004), 
+  CONST64(0x0000040000000404), CONST64(0x0000040100000404), CONST64(0x0000040004000404), CONST64(0x0000040104000404), 
+  CONST64(0x0000040000040404), CONST64(0x0000040100040404), CONST64(0x0000040004040404), CONST64(0x0000040104040404), 
+  CONST64(0x0400040000000000), CONST64(0x0400040100000000), CONST64(0x0400040004000000), CONST64(0x0400040104000000), 
+  CONST64(0x0400040000040000), CONST64(0x0400040100040000), CONST64(0x0400040004040000), CONST64(0x0400040104040000), 
+  CONST64(0x0400040000000400), CONST64(0x0400040100000400), CONST64(0x0400040004000400), CONST64(0x0400040104000400), 
+  CONST64(0x0400040000040400), CONST64(0x0400040100040400), CONST64(0x0400040004040400), CONST64(0x0400040104040400), 
+  CONST64(0x0400040000000004), CONST64(0x0400040100000004), CONST64(0x0400040004000004), CONST64(0x0400040104000004), 
+  CONST64(0x0400040000040004), CONST64(0x0400040100040004), CONST64(0x0400040004040004), CONST64(0x0400040104040004), 
+  CONST64(0x0400040000000404), CONST64(0x0400040100000404), CONST64(0x0400040004000404), CONST64(0x0400040104000404), 
+  CONST64(0x0400040000040404), CONST64(0x0400040100040404), CONST64(0x0400040004040404), CONST64(0x0400040104040404), 
+  CONST64(0x0004040000000000), CONST64(0x0004040100000000), CONST64(0x0004040004000000), CONST64(0x0004040104000000), 
+  CONST64(0x0004040000040000), CONST64(0x0004040100040000), CONST64(0x0004040004040000), CONST64(0x0004040104040000), 
+  CONST64(0x0004040000000400), CONST64(0x0004040100000400), CONST64(0x0004040004000400), CONST64(0x0004040104000400), 
+  CONST64(0x0004040000040400), CONST64(0x0004040100040400), CONST64(0x0004040004040400), CONST64(0x0004040104040400), 
+  CONST64(0x0004040000000004), CONST64(0x0004040100000004), CONST64(0x0004040004000004), CONST64(0x0004040104000004), 
+  CONST64(0x0004040000040004), CONST64(0x0004040100040004), CONST64(0x0004040004040004), CONST64(0x0004040104040004), 
+  CONST64(0x0004040000000404), CONST64(0x0004040100000404), CONST64(0x0004040004000404), CONST64(0x0004040104000404), 
+  CONST64(0x0004040000040404), CONST64(0x0004040100040404), CONST64(0x0004040004040404), CONST64(0x0004040104040404), 
+  CONST64(0x0404040000000000), CONST64(0x0404040100000000), CONST64(0x0404040004000000), CONST64(0x0404040104000000), 
+  CONST64(0x0404040000040000), CONST64(0x0404040100040000), CONST64(0x0404040004040000), CONST64(0x0404040104040000), 
+  CONST64(0x0404040000000400), CONST64(0x0404040100000400), CONST64(0x0404040004000400), CONST64(0x0404040104000400), 
+  CONST64(0x0404040000040400), CONST64(0x0404040100040400), CONST64(0x0404040004040400), CONST64(0x0404040104040400), 
+  CONST64(0x0404040000000004), CONST64(0x0404040100000004), CONST64(0x0404040004000004), CONST64(0x0404040104000004), 
+  CONST64(0x0404040000040004), CONST64(0x0404040100040004), CONST64(0x0404040004040004), CONST64(0x0404040104040004), 
+  CONST64(0x0404040000000404), CONST64(0x0404040100000404), CONST64(0x0404040004000404), CONST64(0x0404040104000404), 
+  CONST64(0x0404040000040404), CONST64(0x0404040100040404), CONST64(0x0404040004040404), CONST64(0x0404040104040404)
+  }, 
+{ CONST64(0x0000000000000000), CONST64(0x0000000400000000), CONST64(0x0000000010000000), CONST64(0x0000000410000000), 
+  CONST64(0x0000000000100000), CONST64(0x0000000400100000), CONST64(0x0000000010100000), CONST64(0x0000000410100000), 
+  CONST64(0x0000000000001000), CONST64(0x0000000400001000), CONST64(0x0000000010001000), CONST64(0x0000000410001000), 
+  CONST64(0x0000000000101000), CONST64(0x0000000400101000), CONST64(0x0000000010101000), CONST64(0x0000000410101000), 
+  CONST64(0x0000000000000010), CONST64(0x0000000400000010), CONST64(0x0000000010000010), CONST64(0x0000000410000010), 
+  CONST64(0x0000000000100010), CONST64(0x0000000400100010), CONST64(0x0000000010100010), CONST64(0x0000000410100010), 
+  CONST64(0x0000000000001010), CONST64(0x0000000400001010), CONST64(0x0000000010001010), CONST64(0x0000000410001010), 
+  CONST64(0x0000000000101010), CONST64(0x0000000400101010), CONST64(0x0000000010101010), CONST64(0x0000000410101010), 
+  CONST64(0x1000000000000000), CONST64(0x1000000400000000), CONST64(0x1000000010000000), CONST64(0x1000000410000000), 
+  CONST64(0x1000000000100000), CONST64(0x1000000400100000), CONST64(0x1000000010100000), CONST64(0x1000000410100000), 
+  CONST64(0x1000000000001000), CONST64(0x1000000400001000), CONST64(0x1000000010001000), CONST64(0x1000000410001000), 
+  CONST64(0x1000000000101000), CONST64(0x1000000400101000), CONST64(0x1000000010101000), CONST64(0x1000000410101000), 
+  CONST64(0x1000000000000010), CONST64(0x1000000400000010), CONST64(0x1000000010000010), CONST64(0x1000000410000010), 
+  CONST64(0x1000000000100010), CONST64(0x1000000400100010), CONST64(0x1000000010100010), CONST64(0x1000000410100010), 
+  CONST64(0x1000000000001010), CONST64(0x1000000400001010), CONST64(0x1000000010001010), CONST64(0x1000000410001010), 
+  CONST64(0x1000000000101010), CONST64(0x1000000400101010), CONST64(0x1000000010101010), CONST64(0x1000000410101010), 
+  CONST64(0x0010000000000000), CONST64(0x0010000400000000), CONST64(0x0010000010000000), CONST64(0x0010000410000000), 
+  CONST64(0x0010000000100000), CONST64(0x0010000400100000), CONST64(0x0010000010100000), CONST64(0x0010000410100000), 
+  CONST64(0x0010000000001000), CONST64(0x0010000400001000), CONST64(0x0010000010001000), CONST64(0x0010000410001000), 
+  CONST64(0x0010000000101000), CONST64(0x0010000400101000), CONST64(0x0010000010101000), CONST64(0x0010000410101000), 
+  CONST64(0x0010000000000010), CONST64(0x0010000400000010), CONST64(0x0010000010000010), CONST64(0x0010000410000010), 
+  CONST64(0x0010000000100010), CONST64(0x0010000400100010), CONST64(0x0010000010100010), CONST64(0x0010000410100010), 
+  CONST64(0x0010000000001010), CONST64(0x0010000400001010), CONST64(0x0010000010001010), CONST64(0x0010000410001010), 
+  CONST64(0x0010000000101010), CONST64(0x0010000400101010), CONST64(0x0010000010101010), CONST64(0x0010000410101010), 
+  CONST64(0x1010000000000000), CONST64(0x1010000400000000), CONST64(0x1010000010000000), CONST64(0x1010000410000000), 
+  CONST64(0x1010000000100000), CONST64(0x1010000400100000), CONST64(0x1010000010100000), CONST64(0x1010000410100000), 
+  CONST64(0x1010000000001000), CONST64(0x1010000400001000), CONST64(0x1010000010001000), CONST64(0x1010000410001000), 
+  CONST64(0x1010000000101000), CONST64(0x1010000400101000), CONST64(0x1010000010101000), CONST64(0x1010000410101000), 
+  CONST64(0x1010000000000010), CONST64(0x1010000400000010), CONST64(0x1010000010000010), CONST64(0x1010000410000010), 
+  CONST64(0x1010000000100010), CONST64(0x1010000400100010), CONST64(0x1010000010100010), CONST64(0x1010000410100010), 
+  CONST64(0x1010000000001010), CONST64(0x1010000400001010), CONST64(0x1010000010001010), CONST64(0x1010000410001010), 
+  CONST64(0x1010000000101010), CONST64(0x1010000400101010), CONST64(0x1010000010101010), CONST64(0x1010000410101010), 
+  CONST64(0x0000100000000000), CONST64(0x0000100400000000), CONST64(0x0000100010000000), CONST64(0x0000100410000000), 
+  CONST64(0x0000100000100000), CONST64(0x0000100400100000), CONST64(0x0000100010100000), CONST64(0x0000100410100000), 
+  CONST64(0x0000100000001000), CONST64(0x0000100400001000), CONST64(0x0000100010001000), CONST64(0x0000100410001000), 
+  CONST64(0x0000100000101000), CONST64(0x0000100400101000), CONST64(0x0000100010101000), CONST64(0x0000100410101000), 
+  CONST64(0x0000100000000010), CONST64(0x0000100400000010), CONST64(0x0000100010000010), CONST64(0x0000100410000010), 
+  CONST64(0x0000100000100010), CONST64(0x0000100400100010), CONST64(0x0000100010100010), CONST64(0x0000100410100010), 
+  CONST64(0x0000100000001010), CONST64(0x0000100400001010), CONST64(0x0000100010001010), CONST64(0x0000100410001010), 
+  CONST64(0x0000100000101010), CONST64(0x0000100400101010), CONST64(0x0000100010101010), CONST64(0x0000100410101010), 
+  CONST64(0x1000100000000000), CONST64(0x1000100400000000), CONST64(0x1000100010000000), CONST64(0x1000100410000000), 
+  CONST64(0x1000100000100000), CONST64(0x1000100400100000), CONST64(0x1000100010100000), CONST64(0x1000100410100000), 
+  CONST64(0x1000100000001000), CONST64(0x1000100400001000), CONST64(0x1000100010001000), CONST64(0x1000100410001000), 
+  CONST64(0x1000100000101000), CONST64(0x1000100400101000), CONST64(0x1000100010101000), CONST64(0x1000100410101000), 
+  CONST64(0x1000100000000010), CONST64(0x1000100400000010), CONST64(0x1000100010000010), CONST64(0x1000100410000010), 
+  CONST64(0x1000100000100010), CONST64(0x1000100400100010), CONST64(0x1000100010100010), CONST64(0x1000100410100010), 
+  CONST64(0x1000100000001010), CONST64(0x1000100400001010), CONST64(0x1000100010001010), CONST64(0x1000100410001010), 
+  CONST64(0x1000100000101010), CONST64(0x1000100400101010), CONST64(0x1000100010101010), CONST64(0x1000100410101010), 
+  CONST64(0x0010100000000000), CONST64(0x0010100400000000), CONST64(0x0010100010000000), CONST64(0x0010100410000000), 
+  CONST64(0x0010100000100000), CONST64(0x0010100400100000), CONST64(0x0010100010100000), CONST64(0x0010100410100000), 
+  CONST64(0x0010100000001000), CONST64(0x0010100400001000), CONST64(0x0010100010001000), CONST64(0x0010100410001000), 
+  CONST64(0x0010100000101000), CONST64(0x0010100400101000), CONST64(0x0010100010101000), CONST64(0x0010100410101000), 
+  CONST64(0x0010100000000010), CONST64(0x0010100400000010), CONST64(0x0010100010000010), CONST64(0x0010100410000010), 
+  CONST64(0x0010100000100010), CONST64(0x0010100400100010), CONST64(0x0010100010100010), CONST64(0x0010100410100010), 
+  CONST64(0x0010100000001010), CONST64(0x0010100400001010), CONST64(0x0010100010001010), CONST64(0x0010100410001010), 
+  CONST64(0x0010100000101010), CONST64(0x0010100400101010), CONST64(0x0010100010101010), CONST64(0x0010100410101010), 
+  CONST64(0x1010100000000000), CONST64(0x1010100400000000), CONST64(0x1010100010000000), CONST64(0x1010100410000000), 
+  CONST64(0x1010100000100000), CONST64(0x1010100400100000), CONST64(0x1010100010100000), CONST64(0x1010100410100000), 
+  CONST64(0x1010100000001000), CONST64(0x1010100400001000), CONST64(0x1010100010001000), CONST64(0x1010100410001000), 
+  CONST64(0x1010100000101000), CONST64(0x1010100400101000), CONST64(0x1010100010101000), CONST64(0x1010100410101000), 
+  CONST64(0x1010100000000010), CONST64(0x1010100400000010), CONST64(0x1010100010000010), CONST64(0x1010100410000010), 
+  CONST64(0x1010100000100010), CONST64(0x1010100400100010), CONST64(0x1010100010100010), CONST64(0x1010100410100010), 
+  CONST64(0x1010100000001010), CONST64(0x1010100400001010), CONST64(0x1010100010001010), CONST64(0x1010100410001010), 
+  CONST64(0x1010100000101010), CONST64(0x1010100400101010), CONST64(0x1010100010101010), CONST64(0x1010100410101010)
+  }, 
+{ CONST64(0x0000000000000000), CONST64(0x0000001000000000), CONST64(0x0000000040000000), CONST64(0x0000001040000000), 
+  CONST64(0x0000000000400000), CONST64(0x0000001000400000), CONST64(0x0000000040400000), CONST64(0x0000001040400000), 
+  CONST64(0x0000000000004000), CONST64(0x0000001000004000), CONST64(0x0000000040004000), CONST64(0x0000001040004000), 
+  CONST64(0x0000000000404000), CONST64(0x0000001000404000), CONST64(0x0000000040404000), CONST64(0x0000001040404000), 
+  CONST64(0x0000000000000040), CONST64(0x0000001000000040), CONST64(0x0000000040000040), CONST64(0x0000001040000040), 
+  CONST64(0x0000000000400040), CONST64(0x0000001000400040), CONST64(0x0000000040400040), CONST64(0x0000001040400040), 
+  CONST64(0x0000000000004040), CONST64(0x0000001000004040), CONST64(0x0000000040004040), CONST64(0x0000001040004040), 
+  CONST64(0x0000000000404040), CONST64(0x0000001000404040), CONST64(0x0000000040404040), CONST64(0x0000001040404040), 
+  CONST64(0x4000000000000000), CONST64(0x4000001000000000), CONST64(0x4000000040000000), CONST64(0x4000001040000000), 
+  CONST64(0x4000000000400000), CONST64(0x4000001000400000), CONST64(0x4000000040400000), CONST64(0x4000001040400000), 
+  CONST64(0x4000000000004000), CONST64(0x4000001000004000), CONST64(0x4000000040004000), CONST64(0x4000001040004000), 
+  CONST64(0x4000000000404000), CONST64(0x4000001000404000), CONST64(0x4000000040404000), CONST64(0x4000001040404000), 
+  CONST64(0x4000000000000040), CONST64(0x4000001000000040), CONST64(0x4000000040000040), CONST64(0x4000001040000040), 
+  CONST64(0x4000000000400040), CONST64(0x4000001000400040), CONST64(0x4000000040400040), CONST64(0x4000001040400040), 
+  CONST64(0x4000000000004040), CONST64(0x4000001000004040), CONST64(0x4000000040004040), CONST64(0x4000001040004040), 
+  CONST64(0x4000000000404040), CONST64(0x4000001000404040), CONST64(0x4000000040404040), CONST64(0x4000001040404040), 
+  CONST64(0x0040000000000000), CONST64(0x0040001000000000), CONST64(0x0040000040000000), CONST64(0x0040001040000000), 
+  CONST64(0x0040000000400000), CONST64(0x0040001000400000), CONST64(0x0040000040400000), CONST64(0x0040001040400000), 
+  CONST64(0x0040000000004000), CONST64(0x0040001000004000), CONST64(0x0040000040004000), CONST64(0x0040001040004000), 
+  CONST64(0x0040000000404000), CONST64(0x0040001000404000), CONST64(0x0040000040404000), CONST64(0x0040001040404000), 
+  CONST64(0x0040000000000040), CONST64(0x0040001000000040), CONST64(0x0040000040000040), CONST64(0x0040001040000040), 
+  CONST64(0x0040000000400040), CONST64(0x0040001000400040), CONST64(0x0040000040400040), CONST64(0x0040001040400040), 
+  CONST64(0x0040000000004040), CONST64(0x0040001000004040), CONST64(0x0040000040004040), CONST64(0x0040001040004040), 
+  CONST64(0x0040000000404040), CONST64(0x0040001000404040), CONST64(0x0040000040404040), CONST64(0x0040001040404040), 
+  CONST64(0x4040000000000000), CONST64(0x4040001000000000), CONST64(0x4040000040000000), CONST64(0x4040001040000000), 
+  CONST64(0x4040000000400000), CONST64(0x4040001000400000), CONST64(0x4040000040400000), CONST64(0x4040001040400000), 
+  CONST64(0x4040000000004000), CONST64(0x4040001000004000), CONST64(0x4040000040004000), CONST64(0x4040001040004000), 
+  CONST64(0x4040000000404000), CONST64(0x4040001000404000), CONST64(0x4040000040404000), CONST64(0x4040001040404000), 
+  CONST64(0x4040000000000040), CONST64(0x4040001000000040), CONST64(0x4040000040000040), CONST64(0x4040001040000040), 
+  CONST64(0x4040000000400040), CONST64(0x4040001000400040), CONST64(0x4040000040400040), CONST64(0x4040001040400040), 
+  CONST64(0x4040000000004040), CONST64(0x4040001000004040), CONST64(0x4040000040004040), CONST64(0x4040001040004040), 
+  CONST64(0x4040000000404040), CONST64(0x4040001000404040), CONST64(0x4040000040404040), CONST64(0x4040001040404040), 
+  CONST64(0x0000400000000000), CONST64(0x0000401000000000), CONST64(0x0000400040000000), CONST64(0x0000401040000000), 
+  CONST64(0x0000400000400000), CONST64(0x0000401000400000), CONST64(0x0000400040400000), CONST64(0x0000401040400000), 
+  CONST64(0x0000400000004000), CONST64(0x0000401000004000), CONST64(0x0000400040004000), CONST64(0x0000401040004000), 
+  CONST64(0x0000400000404000), CONST64(0x0000401000404000), CONST64(0x0000400040404000), CONST64(0x0000401040404000), 
+  CONST64(0x0000400000000040), CONST64(0x0000401000000040), CONST64(0x0000400040000040), CONST64(0x0000401040000040), 
+  CONST64(0x0000400000400040), CONST64(0x0000401000400040), CONST64(0x0000400040400040), CONST64(0x0000401040400040), 
+  CONST64(0x0000400000004040), CONST64(0x0000401000004040), CONST64(0x0000400040004040), CONST64(0x0000401040004040), 
+  CONST64(0x0000400000404040), CONST64(0x0000401000404040), CONST64(0x0000400040404040), CONST64(0x0000401040404040), 
+  CONST64(0x4000400000000000), CONST64(0x4000401000000000), CONST64(0x4000400040000000), CONST64(0x4000401040000000), 
+  CONST64(0x4000400000400000), CONST64(0x4000401000400000), CONST64(0x4000400040400000), CONST64(0x4000401040400000), 
+  CONST64(0x4000400000004000), CONST64(0x4000401000004000), CONST64(0x4000400040004000), CONST64(0x4000401040004000), 
+  CONST64(0x4000400000404000), CONST64(0x4000401000404000), CONST64(0x4000400040404000), CONST64(0x4000401040404000), 
+  CONST64(0x4000400000000040), CONST64(0x4000401000000040), CONST64(0x4000400040000040), CONST64(0x4000401040000040), 
+  CONST64(0x4000400000400040), CONST64(0x4000401000400040), CONST64(0x4000400040400040), CONST64(0x4000401040400040), 
+  CONST64(0x4000400000004040), CONST64(0x4000401000004040), CONST64(0x4000400040004040), CONST64(0x4000401040004040), 
+  CONST64(0x4000400000404040), CONST64(0x4000401000404040), CONST64(0x4000400040404040), CONST64(0x4000401040404040), 
+  CONST64(0x0040400000000000), CONST64(0x0040401000000000), CONST64(0x0040400040000000), CONST64(0x0040401040000000), 
+  CONST64(0x0040400000400000), CONST64(0x0040401000400000), CONST64(0x0040400040400000), CONST64(0x0040401040400000), 
+  CONST64(0x0040400000004000), CONST64(0x0040401000004000), CONST64(0x0040400040004000), CONST64(0x0040401040004000), 
+  CONST64(0x0040400000404000), CONST64(0x0040401000404000), CONST64(0x0040400040404000), CONST64(0x0040401040404000), 
+  CONST64(0x0040400000000040), CONST64(0x0040401000000040), CONST64(0x0040400040000040), CONST64(0x0040401040000040), 
+  CONST64(0x0040400000400040), CONST64(0x0040401000400040), CONST64(0x0040400040400040), CONST64(0x0040401040400040), 
+  CONST64(0x0040400000004040), CONST64(0x0040401000004040), CONST64(0x0040400040004040), CONST64(0x0040401040004040), 
+  CONST64(0x0040400000404040), CONST64(0x0040401000404040), CONST64(0x0040400040404040), CONST64(0x0040401040404040), 
+  CONST64(0x4040400000000000), CONST64(0x4040401000000000), CONST64(0x4040400040000000), CONST64(0x4040401040000000), 
+  CONST64(0x4040400000400000), CONST64(0x4040401000400000), CONST64(0x4040400040400000), CONST64(0x4040401040400000), 
+  CONST64(0x4040400000004000), CONST64(0x4040401000004000), CONST64(0x4040400040004000), CONST64(0x4040401040004000), 
+  CONST64(0x4040400000404000), CONST64(0x4040401000404000), CONST64(0x4040400040404000), CONST64(0x4040401040404000), 
+  CONST64(0x4040400000000040), CONST64(0x4040401000000040), CONST64(0x4040400040000040), CONST64(0x4040401040000040), 
+  CONST64(0x4040400000400040), CONST64(0x4040401000400040), CONST64(0x4040400040400040), CONST64(0x4040401040400040), 
+  CONST64(0x4040400000004040), CONST64(0x4040401000004040), CONST64(0x4040400040004040), CONST64(0x4040401040004040), 
+  CONST64(0x4040400000404040), CONST64(0x4040401000404040), CONST64(0x4040400040404040), CONST64(0x4040401040404040)
+  }};
+  
+#endif
+
+
+static void cookey(const ulong32 *raw1, ulong32 *keyout); /* forward declaration; the definition follows deskey below */
 
 #ifdef CLEAN_STACK
-void _deskey(const unsigned char *key, short edf, unsigned long *keyout)
+void _deskey(const unsigned char *key, short edf, ulong32 *keyout)
 #else
-void deskey(const unsigned char *key, short edf, unsigned long *keyout)
+void deskey(const unsigned char *key, short edf, ulong32 *keyout)
 #endif
 {
-    unsigned long i, j, l, m, n, kn[32];
+    ulong32 i, j, l, m, n, kn[32];
     unsigned char pc1m[56], pcr[56];
 
     for (j=0; j < 56; j++) {
-        l = (unsigned long)pc1[j];
+        l = (ulong32)pc1[j];
         m = l & 7;
         pc1m[j] = (unsigned char)((key[l >> 3U] & bytebit[m]) == bytebit[m] ? 1 : 0);
     }
@@ -256,7 +1306,7 @@ void deskey(const unsigned char *key, short edf, unsigned long *keyout)
         n = m + 1;
         kn[m] = kn[n] = 0L;
         for (j=0; j < 28; j++) {
-            l = j + (unsigned long)totrot[i];
+            l = j + (ulong32)totrot[i];
             if (l < 28) {
                pcr[j] = pc1m[l];
             } else {
@@ -264,7 +1314,7 @@ void deskey(const unsigned char *key, short edf, unsigned long *keyout)
             }
         }
         for (/*j = 28*/; j < 56; j++) {
-            l = j + (unsigned long)totrot[i];
+            l = j + (ulong32)totrot[i];
             if (l < 56) {
                pcr[j] = pc1m[l];
             } else {
@@ -285,22 +1335,22 @@ void deskey(const unsigned char *key, short edf, unsigned long *keyout)
 }
 
 #ifdef CLEAN_STACK
-void deskey(const unsigned char *key, short edf, unsigned long *keyout)
+void deskey(const unsigned char *key, short edf, ulong32 *keyout) /* CLEAN_STACK build: forwards to _deskey(), then calls burn_stack() */
 {
    _deskey(key, edf, keyout);
-   burn_stack(sizeof(int)*5 + sizeof(unsigned long)*32 + sizeof(unsigned char)*112);
+   burn_stack(sizeof(int)*5 + sizeof(ulong32)*32 + sizeof(unsigned char)*112); /* byte count mirrors _deskey's locals: 5 scalars (i, j, l, m, n), kn[32], pc1m[56] + pcr[56]; NOTE(review): the scalars are declared ulong32 but counted as int -- confirm the sizes agree */
 }
 #endif
 
 #ifdef CLEAN_STACK
-static void _cookey(const unsigned long *raw1, unsigned long *keyout)
+static void _cookey(const ulong32 *raw1, ulong32 *keyout)
 #else
-static void cookey(const unsigned long *raw1, unsigned long *keyout)
+static void cookey(const ulong32 *raw1, ulong32 *keyout)
 #endif
 {
-    unsigned long *cook;
-    const unsigned long *raw0;
-    unsigned long dough[32];
+    ulong32 *cook;
+    const ulong32 *raw0;
+    ulong32 dough[32];
     int i;
 
     cook = dough;
@@ -321,25 +1371,26 @@ static void cookey(const unsigned long *raw1, unsigned long *keyout)
 }
 
 #ifdef CLEAN_STACK
-static void cookey(const unsigned long *raw1, unsigned long *keyout)
+static void cookey(const ulong32 *raw1, ulong32 *keyout) /* CLEAN_STACK build: forwards to _cookey(), then calls burn_stack() */
 {
    _cookey(raw1, keyout);
-   burn_stack(sizeof(unsigned long *) * 2 + sizeof(unsigned long)*32 + sizeof(int));
+   burn_stack(sizeof(ulong32 *) * 2 + sizeof(ulong32)*32 + sizeof(int)); /* byte count mirrors _cookey's locals: two pointers (cook, raw0), dough[32], and int i */
 }
 #endif
 
 #ifndef CLEAN_STACK
-static void desfunc(unsigned long *block, const unsigned long *keys)
+static void desfunc(ulong32 *block, const ulong32 *keys)
 #else
-static void _desfunc(unsigned long *block, const unsigned long *keys)
+static void _desfunc(ulong32 *block, const ulong32 *keys)
 #endif
 {
-    unsigned long work, right, leftt;
+    ulong32 work, right, leftt;
     int round;
 
     leftt = block[0];
     right = block[1];
 
+#ifdef SMALL_CODE
     work = ((leftt >> 4)  ^ right) & 0x0f0f0f0fL;
     right ^= work;
     leftt ^= (work << 4);
@@ -362,6 +1413,21 @@ static void _desfunc(unsigned long *block, const unsigned long *keys)
     leftt ^= work;
     right ^= work;
     leftt = ROL(leftt, 1);
+#else 
+   {
+      ulong64 tmp;
+      tmp = des_ip[0][byte(leftt, 0)] ^
+            des_ip[1][byte(leftt, 1)] ^
+            des_ip[2][byte(leftt, 2)] ^
+            des_ip[3][byte(leftt, 3)] ^
+            des_ip[4][byte(right, 0)] ^
+            des_ip[5][byte(right, 1)] ^
+            des_ip[6][byte(right, 2)] ^
+            des_ip[7][byte(right, 3)];
+      leftt = (ulong32)(tmp >> 32);
+      right = (ulong32)(tmp & 0xFFFFFFFFUL);
+   }
+#endif
 
     for (round = 0; round < 8; round++) {
         work  = ROR(right, 4) ^ *keys++;
@@ -386,11 +1452,13 @@ static void _desfunc(unsigned long *block, const unsigned long *keys)
               ^  SP4[(work >> 16) & 0x3fL]
               ^  SP2[(work >> 24) & 0x3fL];
     }
-    right = (right << 31) | (right >> 1);
+
+#ifdef SMALL_CODE    
+    right = ROR(right, 1);
     work = (leftt ^ right) & 0xaaaaaaaaL;
     leftt ^= work;
     right ^= work;
-    leftt = (leftt << 31) | (leftt >> 1);
+    leftt = ROR(leftt, 1);
     work = ((leftt >> 8) ^ right) & 0x00ff00ffL;
     right ^= work;
     leftt ^= (work << 8);
@@ -404,16 +1472,31 @@ static void _desfunc(unsigned long *block, const unsigned long *keys)
     work = ((right >> 4) ^ leftt) & 0x0f0f0f0fL;
     leftt ^= work;
     right ^= (work << 4);
+#else 
+   {
+      ulong64 tmp;
+      tmp = des_fp[0][byte(leftt, 0)] ^
+            des_fp[1][byte(leftt, 1)] ^
+            des_fp[2][byte(leftt, 2)] ^
+            des_fp[3][byte(leftt, 3)] ^
+            des_fp[4][byte(right, 0)] ^
+            des_fp[5][byte(right, 1)] ^
+            des_fp[6][byte(right, 2)] ^
+            des_fp[7][byte(right, 3)];
+      leftt = (ulong32)(tmp >> 32);
+      right = (ulong32)(tmp & 0xFFFFFFFFUL);
+   }
+#endif
     
     block[0] = right;
     block[1] = leftt;
 }
 
 #ifdef CLEAN_STACK
-static void desfunc(unsigned long *block, const unsigned long *keys)
+static void desfunc(ulong32 *block, const ulong32 *keys)
 {
    _desfunc(block, keys);
-   burn_stack(sizeof(unsigned long) * 4 + sizeof(int));
+   burn_stack(sizeof(ulong32) * 4 + sizeof(int)); /* NOTE(review): does not count the ulong64 tmp _desfunc declares when SMALL_CODE is undefined - confirm size */
 }
 #endif
 
@@ -462,7 +1545,7 @@ int des3_setup(const unsigned char *key, int keylen, int num_rounds, symmetric_k
 
 void des_ecb_encrypt(const unsigned char *pt, unsigned char *ct, symmetric_key *key)
 {
-    unsigned long work[2];
+    ulong32 work[2];
     _ARGCHK(pt != NULL);
     _ARGCHK(ct != NULL);
     _ARGCHK(key != NULL);
@@ -475,7 +1558,7 @@ void des_ecb_encrypt(const unsigned char *pt, unsigned char *ct, symmetric_key *
 
 void des_ecb_decrypt(const unsigned char *ct, unsigned char *pt, symmetric_key *key)
 {
-    unsigned long work[2];
+    ulong32 work[2];
     _ARGCHK(pt != NULL);
     _ARGCHK(ct != NULL);
     _ARGCHK(key != NULL);
@@ -488,7 +1571,7 @@ void des_ecb_decrypt(const unsigned char *ct, unsigned char *pt, symmetric_key *
 
 void des3_ecb_encrypt(const unsigned char *pt, unsigned char *ct, symmetric_key *key)
 {
-    unsigned long work[2];
+    ulong32 work[2];
     
     _ARGCHK(pt != NULL);
     _ARGCHK(ct != NULL);
@@ -504,7 +1587,7 @@ void des3_ecb_encrypt(const unsigned char *pt, unsigned char *ct, symmetric_key
 
 void des3_ecb_decrypt(const unsigned char *ct, unsigned char *pt, symmetric_key *key)
 {
-    unsigned long work[2];
+    ulong32 work[2];
     _ARGCHK(pt != NULL);
     _ARGCHK(ct != NULL);
     _ARGCHK(key != NULL);
diff --git a/dh.c b/dh.c
index 7540292..d23632d 100644
--- a/dh.c
+++ b/dh.c
@@ -155,8 +155,8 @@ int dh_test(void)
 #if 0
         printf("dh_test():testing size %d-bits\n", sets[x].size * 8);
 #endif
-        if (mp_read_radix(&g,(unsigned char *)sets[x].base, 64) != MP_OKAY)   { goto error; }
-        if (mp_read_radix(&p,(unsigned char *)sets[x].prime, 64) != MP_OKAY)  { goto error; }
+        if (mp_read_radix(&g,(char *)sets[x].base, 64) != MP_OKAY)   { goto error; }
+        if (mp_read_radix(&p,(char *)sets[x].prime, 64) != MP_OKAY)  { goto error; }
 
         /* ensure p is prime */
         if ((res = is_prime(&p, &primality)) != CRYPT_OK)             { goto done; }
@@ -464,7 +464,7 @@ int dh_shared_secret(dh_key *private_key, dh_key *public_key,
       return CRYPT_MEM;
    }
 
-   if (mp_read_radix(&p, (unsigned char *)sets[private_key->idx].prime, 64) != MP_OKAY)     { goto error; }
+   if (mp_read_radix(&p, (char *)sets[private_key->idx].prime, 64) != MP_OKAY)     { goto error; }
    if (mp_exptmod(&public_key->y, &private_key->x, &p, &tmp) != MP_OKAY)                    { goto error; }
 
    /* enough space for output? */
diff --git a/ecc.c b/ecc.c
index 13f55e8..5bc6398 100644
--- a/ecc.c
+++ b/ecc.c
@@ -528,8 +528,8 @@ int ecc_test(void)
        #if 0
           printf("Testing %d\n", sets[i].size);
        #endif
-       if (mp_read_radix(&modulus, (unsigned char *)sets[i].prime, 64) != MP_OKAY)   { goto error; }
-       if (mp_read_radix(&order, (unsigned char *)sets[i].order, 64) != MP_OKAY)     { goto error; }
+       if (mp_read_radix(&modulus, (char *)sets[i].prime, 64) != MP_OKAY)   { goto error; }
+       if (mp_read_radix(&order, (char *)sets[i].order, 64) != MP_OKAY)     { goto error; }
 
        /* is prime actually prime? */
        if (is_prime(&modulus, &primality) != CRYPT_OK)           { goto error; }
@@ -545,8 +545,8 @@ int ecc_test(void)
           goto done1;
        }
 
-       if (mp_read_radix(&G->x, (unsigned char *)sets[i].Gx, 64) != MP_OKAY) { goto error; }
-       if (mp_read_radix(&G->y, (unsigned char *)sets[i].Gy, 64) != MP_OKAY) { goto error; }
+       if (mp_read_radix(&G->x, (char *)sets[i].Gx, 64) != MP_OKAY) { goto error; }
+       if (mp_read_radix(&G->y, (char *)sets[i].Gy, 64) != MP_OKAY) { goto error; }
 
        /* then we should have G == (order + 1)G */
        if (mp_add_d(&order, 1, &order) != MP_OKAY)                  { goto error; }
@@ -624,9 +624,9 @@ int ecc_make_key(prng_state *prng, int wprng, int keysize, ecc_key *key)
    }
 
    /* read in the specs for this key */
-   if (mp_read_radix(&prime, (unsigned char *)sets[key->idx].prime, 64) != MP_OKAY)  { goto error; }
-   if (mp_read_radix(&base->x, (unsigned char *)sets[key->idx].Gx, 64) != MP_OKAY)   { goto error; }
-   if (mp_read_radix(&base->y, (unsigned char *)sets[key->idx].Gy, 64) != MP_OKAY)   { goto error; }
+   if (mp_read_radix(&prime, (char *)sets[key->idx].prime, 64) != MP_OKAY)  { goto error; }
+   if (mp_read_radix(&base->x, (char *)sets[key->idx].Gx, 64) != MP_OKAY)   { goto error; }
+   if (mp_read_radix(&base->y, (char *)sets[key->idx].Gy, 64) != MP_OKAY)   { goto error; }
    if (mp_read_unsigned_bin(&key->k, (unsigned char *)buf, keysize) != MP_OKAY)      { goto error; }
 
    /* make the public key */
@@ -671,12 +671,12 @@ static int compress_y_point(ecc_point *pt, int idx, int *result)
    }
 
    /* get x^3 - 3x + b */
-   if (mp_read_radix(&p, (unsigned char *)sets[idx].B, 64) != MP_OKAY) { goto error; } /* p = B */
+   if (mp_read_radix(&p, (char *)sets[idx].B, 64) != MP_OKAY) { goto error; } /* p = B */
    if (mp_expt_d(&pt->x, 3, &tmp) != MP_OKAY)              { goto error; } /* tmp = pX^3  */
    if (mp_mul_d(&pt->x, 3, &tmp2) != MP_OKAY)              { goto error; } /* tmp2 = 3*pX^3 */
    if (mp_sub(&tmp, &tmp2, &tmp) != MP_OKAY)               { goto error; } /* tmp = tmp - tmp2 */
    if (mp_add(&tmp, &p, &tmp) != MP_OKAY)                  { goto error; } /* tmp = tmp + p */
-   if (mp_read_radix(&p, (unsigned char *)sets[idx].prime, 64) != MP_OKAY)  { goto error; } /* p = prime */
+   if (mp_read_radix(&p, (char *)sets[idx].prime, 64) != MP_OKAY)  { goto error; } /* p = prime */
    if (mp_mod(&tmp, &p, &tmp) != MP_OKAY)                  { goto error; } /* tmp = tmp mod p */
 
    /* now find square root */
@@ -713,12 +713,12 @@ static int expand_y_point(ecc_point *pt, int idx, int result)
    }
 
    /* get x^3 - 3x + b */
-   if (mp_read_radix(&p, (unsigned char *)sets[idx].B, 64) != MP_OKAY) { goto error; } /* p = B */
+   if (mp_read_radix(&p, (char *)sets[idx].B, 64) != MP_OKAY) { goto error; } /* p = B */
    if (mp_expt_d(&pt->x, 3, &tmp) != MP_OKAY)              { goto error; } /* tmp = pX^3 */
    if (mp_mul_d(&pt->x, 3, &tmp2) != MP_OKAY)              { goto error; } /* tmp2 = 3*pX^3 */
    if (mp_sub(&tmp, &tmp2, &tmp) != MP_OKAY)               { goto error; } /* tmp = tmp - tmp2 */
    if (mp_add(&tmp, &p, &tmp) != MP_OKAY)                  { goto error; } /* tmp = tmp + p */
-   if (mp_read_radix(&p, (unsigned char *)sets[idx].prime, 64) != MP_OKAY)  { goto error; } /* p = prime */
+   if (mp_read_radix(&p, (char *)sets[idx].prime, 64) != MP_OKAY)  { goto error; } /* p = prime */
    if (mp_mod(&tmp, &p, &tmp) != MP_OKAY)                  { goto error; } /* tmp = tmp mod p */
 
    /* now find square root */
@@ -935,7 +935,7 @@ int ecc_shared_secret(ecc_key *private_key, ecc_key *public_key,
       return CRYPT_MEM;
    }
 
-   if (mp_read_radix(&prime, (unsigned char *)sets[private_key->idx].prime, 64) != MP_OKAY)  { goto error; }
+   if (mp_read_radix(&prime, (char *)sets[private_key->idx].prime, 64) != MP_OKAY)  { goto error; }
    if ((res = ecc_mulmod(&private_key->k, &public_key->pubkey, result, &prime)) != CRYPT_OK) { goto done1; }
 
    x = (unsigned long)mp_unsigned_bin_size(&result->x);
diff --git a/ecc_sys.c b/ecc_sys.c
index e15e473..1dd19e1 100644
--- a/ecc_sys.c
+++ b/ecc_sys.c
@@ -238,7 +238,7 @@ int ecc_sign_hash(const unsigned char *in,  unsigned long inlen,
       ecc_free(&pubkey);
       return CRYPT_MEM;
    }
-   if (mp_read_radix(&p, (unsigned char *)sets[key->idx].order, 64) != MP_OKAY)     { goto error; }
+   if (mp_read_radix(&p, (char *)sets[key->idx].order, 64) != MP_OKAY)     { goto error; }
    if (mp_read_unsigned_bin(&b, (unsigned char *)in, (int)inlen) != MP_OKAY)        { goto error; }
 
    /* find b = (m - x)/k */
@@ -389,7 +389,7 @@ int ecc_verify_hash(const unsigned char *sig, unsigned long siglen,
    if (mp_read_unsigned_bin(&m, (unsigned char *)hash, (int)inlen) != MP_OKAY)     { goto error; }
    
    /* load prime */
-   if (mp_read_radix(&p, (unsigned char *)sets[key->idx].prime, 64) != MP_OKAY)    { goto error; }
+   if (mp_read_radix(&p, (char *)sets[key->idx].prime, 64) != MP_OKAY)    { goto error; }
    
    /* calculate barrett stuff */
    mp_set(&mu, 1); 
@@ -406,8 +406,8 @@ int ecc_verify_hash(const unsigned char *sig, unsigned long siglen,
    if (add_point(&pubkey.pubkey, &key->pubkey, &pubkey.pubkey, &p, &mu) != CRYPT_OK)    { goto error; }
 
    /* get mG */
-   if (mp_read_radix(&mG->x, (unsigned char *)sets[key->idx].Gx, 64) != MP_OKAY)   { goto error; }
-   if (mp_read_radix(&mG->y, (unsigned char *)sets[key->idx].Gy, 64) != MP_OKAY)   { goto error; }
+   if (mp_read_radix(&mG->x, (char *)sets[key->idx].Gx, 64) != MP_OKAY)   { goto error; }
+   if (mp_read_radix(&mG->y, (char *)sets[key->idx].Gy, 64) != MP_OKAY)   { goto error; }
    if (ecc_mulmod(&m, mG, mG, &p) != CRYPT_OK)                                     { goto error; }
 
    /* compare mG to bA + Y */
diff --git a/makefile b/makefile
index cfec020..a5b2ead 100644
--- a/makefile
+++ b/makefile
@@ -9,7 +9,7 @@
 # a build. This is easy to remedy though, for those that have problems.
 
 # The version
-VERSION=0.87
+VERSION=0.88
 
 #ch1-01-1
 # Compiler and Linker Names
@@ -28,9 +28,8 @@ CFLAGS += -c -I./ -Wall -Wsign-compare -W -Wno-unused -Wshadow -Werror
 # optimize for SPEED
 #CFLAGS += -O3 -funroll-loops
 
-#add -fomit-frame-pointer.  v3.2 is buggy for certain platforms so this is used for files it is known to work for
-#default is off but you may enable this to get further performance [make sure you run the test suite!]
-#EXT_CFLAGS = -fomit-frame-pointer
+#add -fomit-frame-pointer.  v3.2 is buggy for certain platforms!
+#CFLAGS += -fomit-frame-pointer
 
 # optimize for SIZE
 CFLAGS += -Os
@@ -48,6 +47,8 @@ HASH=hashsum
 CRYPT=encrypt
 SMALL=small
 PROF=x86_prof
+TV=tv_gen
+
 
 #LIBPATH-The directory for libtomcrypt to be installed to.
 #INCPATH-The directory to install the header files for libtomcrypt.
@@ -58,17 +59,22 @@ INCPATH=/usr/include
 DATAPATH=/usr/share/doc/libtomcrypt/pdf
 
 #List of objects to compile.
+
+#Leave MPI built-in or force developer to link against libtommath?
+MPIOBJECT=mpi.o
+
 OBJECTS=keyring.o gf.o mem.o sprng.o ecc.o base64.o dh.o rsa.o \
 bits.o yarrow.o cfb.o ofb.o ecb.o ctr.o cbc.o hash.o tiger.o sha1.o \
 md5.o md4.o md2.o sha256.o sha512.o xtea.o aes.o des.o \
 safer_tab.o safer.o safer+.o rc4.o rc2.o rc6.o rc5.o cast5.o noekeon.o blowfish.o crypt.o \
-mpi.o prime.o twofish.o packet.o hmac.o strings.o 
+prime.o twofish.o packet.o hmac.o strings.o $(MPIOBJECT)
 
 TESTOBJECTS=demos/test.o
 HASHOBJECTS=demos/hashsum.o
 CRYPTOBJECTS=demos/encrypt.o
 SMALLOBJECTS=demos/small.o
 PROFS=demos/x86_prof.o
+TVS=demos/tv_gen.o
 
 #Files left over from making the crypt.pdf.
 LEFTOVERS=*.dvi *.log *.aux *.toc *.idx *.ilg *.ind
@@ -91,45 +97,8 @@ dh.o: dh.c dh_sys.c
 aes.o: aes.c aes_tab.c
 sha512.o: sha512.c sha384.c
 
-#These are objects that are known to build with -fomit-frame-pointer successfully [RISK!]
-aes.o: aes.c
-	$(CC) $(CFLAGS) $(EXT_CFLAGS) -c aes.c
-
-blowfish.o: blowfish.c
-	$(CC) $(CFLAGS) $(EXT_CFLAGS) -c blowfish.c
-	
-cast5.o: cast5.c
-	$(CC) $(CFLAGS) $(EXT_CFLAGS) -c cast5.c
-	
-des.o: des.c
-	$(CC) $(CFLAGS) $(EXT_CFLAGS) -c des.c
-	
-twofish.o: twofish.c
-	$(CC) $(CFLAGS) $(EXT_CFLAGS) -c twofish.c
-	
-md2.o: md2.c
-	$(CC) $(CFLAGS) $(EXT_CFLAGS) -c md2.c
-
-md4.o: md4.c
-	$(CC) $(CFLAGS) $(EXT_CFLAGS) -c md4.c
-	
-md5.o: md5.c
-	$(CC) $(CFLAGS) $(EXT_CFLAGS) -c md5.c
-
-sha1.o: sha1.c
-	$(CC) $(CFLAGS) $(EXT_CFLAGS) -c sha1.c
-	
-sha256.o: sha256.c
-	$(CC) $(CFLAGS) $(EXT_CFLAGS) -c sha256.c
-
-sha512.o: sha512.c
-	$(CC) $(CFLAGS) $(EXT_CFLAGS) -c sha512.c
-	
-tiger.o: tiger.c
-	$(CC) $(CFLAGS) $(EXT_CFLAGS) -c tiger.c
-
 #This rule makes the libtomcrypt library.
-library: $(OBJECTS) 
+library: $(OBJECTS)
 	$(AR) $(ARFLAGS) $(LIBNAME) $(OBJECTS)
 	ranlib $(LIBNAME)
 
@@ -150,7 +119,10 @@ small: library $(SMALLOBJECTS)
 	$(CC) $(SMALLOBJECTS) $(LIBNAME) -o $(SMALL) $(WARN)
 	
 x86_prof: library $(PROFS)
-	$(CC) demos/x86_prof.o $(LIBNAME) -o $(PROF)
+	$(CC) $(PROFS) $(LIBNAME) -o $(PROF)
+
+tv_gen: library $(TVS)
+	$(CC) $(TVS) $(LIBNAME) -o $(TV)
 
 #This rule installs the library and the header files. This must be run
 #as root in order to have a high enough permission to write to the correct
@@ -167,8 +139,8 @@ install: library docs
 #documentation.
 clean:
 	rm -f $(OBJECTS) $(TESTOBJECTS) $(HASHOBJECTS) $(CRYPTOBJECTS) $(SMALLOBJECTS) $(LEFTOVERS) $(LIBNAME)
-	rm -f $(TEST) $(HASH) $(COMPRESSED)
-	rm -f *stackdump *.lib *.exe *.obj demos/*.obj demos/*.o *.bat
+	rm -f $(TEST) $(HASH) $(COMPRESSED) $(PROFS) $(PROF) $(TVS) $(TV)
+	rm -f *stackdump *.lib *.exe *.obj demos/*.obj demos/*.o *.bat hash_tv.txt cipher_tv.txt
 
 #This builds the crypt.pdf file. Note that the rm -f *.pdf has been removed
 #from the clean command! This is because most people would like to keep the
@@ -181,6 +153,12 @@ docs: crypt.tex
 	pdflatex crypt > /dev/null
 	rm -f $(LEFTOVERS)
        
+#Builds the beta release archives (.tar.bz2 and .zip) from a cleaned source tree.
+beta: clean
+	cd .. ; rm -rf crypt* libtomcrypt-$(VERSION)-beta ; mkdir libtomcrypt-$(VERSION)-beta ; \
+	cp -R ./libtomcrypt/* ./libtomcrypt-$(VERSION)-beta/ ; tar -c libtomcrypt-$(VERSION)-beta/* > crypt-$(VERSION)-beta.tar ; \
+	bzip2 -9vv crypt-$(VERSION)-beta.tar ; zip -9 -r crypt-$(VERSION)-beta.zip libtomcrypt-$(VERSION)-beta/*
+
 #zipup the project (take that!)
 zipup: clean docs
 	cd .. ; rm -rf crypt* libtomcrypt-$(VERSION) ; mkdir libtomcrypt-$(VERSION) ; \
diff --git a/makefile.msvc b/makefile.msvc
index 3af1440..c5ab49c 100644
--- a/makefile.msvc
+++ b/makefile.msvc
@@ -12,7 +12,7 @@ OBJECTS=keyring.obj gf.obj mem.obj sprng.obj ecc.obj base64.obj dh.obj rsa.obj \
 bits.obj yarrow.obj cfb.obj ofb.obj ecb.obj ctr.obj cbc.obj hash.obj tiger.obj sha1.obj \
 md5.obj md4.obj md2.obj sha256.obj sha512.obj xtea.obj aes.obj des.obj \
 safer_tab.obj safer.obj safer+.obj rc4.obj rc2.obj rc6.obj rc5.obj cast5.obj noekeon.obj \
-blowfish.obj crypt.obj mpi.obj prime.obj twofish.obj packet.obj hmac.obj strings.obj 
+blowfish.obj crypt.obj mpi.obj prime.obj twofish.obj packet.obj hmac.obj strings.obj
 
 library: $(OBJECTS)
 	lib /out:tomcrypt.lib $(OBJECTS)
@@ -24,4 +24,7 @@ test: library test.obj
 	cl test.obj tomcrypt.lib advapi32.lib	
 	
 x86_prof: demos/x86_prof.c library
-	cl $(CFLAGS) demos/x86_prof.c tomcrypt.lib advapi32.lib 
\ No newline at end of file
+	cl $(CFLAGS) demos/x86_prof.c tomcrypt.lib advapi32.lib 
+
+tv_gen: demos/tv_gen.c library
+	cl $(CFLAGS) demos/tv_gen.c tomcrypt.lib advapi32.lib 
diff --git a/md5.c b/md5.c
index 71481ae..1a4dc95 100644
--- a/md5.c
+++ b/md5.c
@@ -20,16 +20,16 @@ const struct _hash_descriptor md5_desc =
 #define I(x,y,z)  (y^(x|(~z)))
 
 #define FF(a,b,c,d,M,s,t) \
-    a = (a + F(b,c,d) + M + t); a = ROL(a, s); a = (b + a);
+    a = (a + F(b,c,d) + M + t); a = ROL(a, s) + b;
 
 #define GG(a,b,c,d,M,s,t) \
-    a = (a + G(b,c,d) + M + t); a = ROL(a, s); a = (b + a);
+    a = (a + G(b,c,d) + M + t); a = ROL(a, s) + b;
 
 #define HH(a,b,c,d,M,s,t) \
-    a = (a + H(b,c,d) + M + t); a = ROL(a, s); a = (b + a);
+    a = (a + H(b,c,d) + M + t); a = ROL(a, s) + b;
 
 #define II(a,b,c,d,M,s,t) \
-    a = (a + I(b,c,d) + M + t); a = ROL(a, s); a = (b + a);
+    a = (a + I(b,c,d) + M + t); a = ROL(a, s) + b;
 
 #ifdef CLEAN_STACK
 static void _md5_compress(hash_state *md)
diff --git a/mpi.c b/mpi.c
index d427eaa..b92e915 100644
--- a/mpi.c
+++ b/mpi.c
@@ -14,6 +14,7 @@
  * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
  */
 #include "mycrypt.h"
+#include <tommath.h>
 
 /* computes the modular inverse via binary extended euclidean algorithm, 
  * that is c = 1/a mod b 
@@ -67,8 +68,8 @@ top:
     if ((res = mp_div_2 (&u, &u)) != MP_OKAY) {
       goto __ERR;
     }
-    /* 4.2 if A or B is odd then */
-    if (mp_iseven (&B) == 0) {
+    /* 4.2 if B is odd then */
+    if (mp_isodd (&B) == 1) {
       if ((res = mp_sub (&B, &x, &B)) != MP_OKAY) {
         goto __ERR;
       }
@@ -85,8 +86,8 @@ top:
     if ((res = mp_div_2 (&v, &v)) != MP_OKAY) {
       goto __ERR;
     }
-    /* 5.2 if C,D are even then */
-    if (mp_iseven (&D) == 0) {
+    /* 5.2 if D is odd then */
+    if (mp_isodd (&D) == 1) {
       /* D = (D-x)/2 */
       if ((res = mp_sub (&D, &x, &D)) != MP_OKAY) {
         goto __ERR;
@@ -216,7 +217,7 @@ fast_mp_montgomery_reduce (mp_int * x, mp_int * n, mp_digit rho)
      * that W[ix-1] have  the carry cleared (see after the inner loop)
      */
     register mp_digit mu;
-    mu = (((mp_digit) (W[ix] & MP_MASK)) * rho) & MP_MASK;
+    mu = MULT(W[ix] & MP_MASK, rho) & MP_MASK;
 
     /* a = a + mu * m * b**i
      *
@@ -245,7 +246,7 @@ fast_mp_montgomery_reduce (mp_int * x, mp_int * n, mp_digit rho)
 
       /* inner loop */
       for (iy = 0; iy < n->used; iy++) {
-          *_W++ += ((mp_word) mu) * ((mp_word) * tmpn++);
+          *_W++ += MULT(mu, *tmpn++);
       }
     }
 
@@ -383,7 +384,7 @@ fast_s_mp_mul_digs (mp_int * a, mp_int * b, mp_int * c, int digs)
       pb = MIN (b->used, digs - ix);
 
       for (iy = 0; iy < pb; iy++) {
-        *_W++ += ((mp_word) tmpx) * ((mp_word) * tmpy++);
+        *_W++ += MULT(tmpx, *tmpy++);
       }
     }
 
@@ -500,7 +501,7 @@ fast_s_mp_mul_high_digs (mp_int * a, mp_int * b, mp_int * c, int digs)
 
       /* compute column products for digits above the minimum */
       for (; iy < pb; iy++) {
-    *_W++ += ((mp_word) tmpx) * ((mp_word) * tmpy++);
+         *_W++ += MULT(tmpx, *tmpy++);
       }
     }
   }
@@ -596,7 +597,7 @@ fast_s_mp_sqr (mp_int * a, mp_int * b)
      * for a particular column only once which means that
      * there is no need todo a double precision addition
      */
-    W2[ix + ix] = ((mp_word) a->dp[ix]) * ((mp_word) a->dp[ix]);
+    W2[ix + ix] = MULT(a->dp[ix], a->dp[ix]);
 
     {
       register mp_digit tmpx, *tmpy;
@@ -614,7 +615,7 @@ fast_s_mp_sqr (mp_int * a, mp_int * b)
 
       /* inner products */
       for (iy = ix + 1; iy < pa; iy++) {
-          *_W++ += ((mp_word) tmpx) * ((mp_word) * tmpy++);
+          *_W++ += MULT(tmpx, *tmpy++);
       }
     }
   }
@@ -1116,6 +1117,50 @@ mp_cmp_mag (mp_int * a, mp_int * b)
 
 /* End: bn_mp_cmp_mag.c */
 
+/* Start: bn_mp_cnt_lsb.c */
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is library that provides for multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library is designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+#include <tommath.h>
+
+/* Counts the zero bits below the lowest set bit of a (returns 0 when a is zero) */
+int mp_cnt_lsb(mp_int *a)
+{
+   int x;
+   mp_digit q;
+
+   if (mp_iszero(a) == 1) {   /* no set bit to stop at, so report 0 */
+      return 0;
+   }
+
+   /* skip the low-order digits that are entirely zero */
+   for (x = 0; x < a->used && a->dp[x] == 0; x++);
+   q = a->dp[x];              /* first non-zero digit (assumes one exists below a->used - TODO confirm a is clamped) */
+   x *= DIGIT_BIT;            /* each skipped digit accounts for DIGIT_BIT zero bits */
+
+   /* count the remaining zero bits inside that digit, stopping at its lowest 1 bit */
+   while ((q & 1) == 0) {
+      q >>= 1;
+      x  += 1;
+   }
+
+   return x;
+}
+
+
+/* End: bn_mp_cnt_lsb.c */
+
 /* Start: bn_mp_copy.c */
 /* LibTomMath, multiple-precision integer library -- Tom St Denis
  *
@@ -1234,14 +1279,18 @@ mp_count_bits (mp_int * a)
  */
 #include <tommath.h>
 
-/* integer signed division. c*b + d == a [e.g. a/b, c=quotient, d=remainder]
+/* integer signed division. 
+ * c*b + d == a [e.g. a/b, c=quotient, d=remainder]
  * HAC pp.598 Algorithm 14.20
  *
- * Note that the description in HAC is horribly incomplete.  For example,
- * it doesn't consider the case where digits are removed from 'x' in the inner
- * loop.  It also doesn't consider the case that y has fewer than three digits, etc..
+ * Note that the description in HAC is horribly 
+ * incomplete.  For example, it doesn't consider 
+ * the case where digits are removed from 'x' in 
+ * the inner loop.  It also doesn't consider the 
+ * case that y has fewer than three digits, etc..
  *
- * The overall algorithm is as described as 14.20 from HAC but fixed to treat these cases.
+ * The overall algorithm is as described as 
+ * 14.20 from HAC but fixed to treat these cases.
 */
 int
 mp_div (mp_int * a, mp_int * b, mp_int * c, mp_int * d)
@@ -1249,7 +1298,6 @@ mp_div (mp_int * a, mp_int * b, mp_int * c, mp_int * d)
   mp_int  q, x, y, t1, t2;
   int     res, n, t, i, norm, neg;
 
-
   /* is divisor zero ? */
   if (mp_iszero (b) == 1) {
     return MP_VAL;
@@ -1293,7 +1341,7 @@ mp_div (mp_int * a, mp_int * b, mp_int * c, mp_int * d)
   neg = (a->sign == b->sign) ? MP_ZPOS : MP_NEG;
   x.sign = y.sign = MP_ZPOS;
 
-  /* normalize both x and y, ensure that y >= b/2, [b == 2^DIGIT_BIT] */
+  /* normalize both x and y, ensure that y >= b/2, [b == 2**DIGIT_BIT] */
   norm = mp_count_bits(&y) % DIGIT_BIT;
   if (norm < (int)(DIGIT_BIT-1)) {
      norm = (DIGIT_BIT-1) - norm;
@@ -1311,8 +1359,8 @@ mp_div (mp_int * a, mp_int * b, mp_int * c, mp_int * d)
   n = x.used - 1;
   t = y.used - 1;
 
-  /* step 2. while (x >= y*b^n-t) do { q[n-t] += 1; x -= y*b^{n-t} } */
-  if ((res = mp_lshd (&y, n - t)) != MP_OKAY) { /* y = y*b^{n-t} */
+  /* while (x >= y*b**{n-t}) do { q[n-t] += 1; x -= y*b**{n-t} } */
+  if ((res = mp_lshd (&y, n - t)) != MP_OKAY) { /* y = y*b**{n-t} */
     goto __Y;
   }
 
@@ -1331,7 +1379,8 @@ mp_div (mp_int * a, mp_int * b, mp_int * c, mp_int * d)
     if (i > x.used)
       continue;
 
-    /* step 3.1 if xi == yt then set q{i-t-1} to b-1, otherwise set q{i-t-1} to (xi*b + x{i-1})/yt */
+    /* step 3.1 if xi == yt then set q{i-t-1} to b-1, 
+     * otherwise set q{i-t-1} to (xi*b + x{i-1})/yt */
     if (x.dp[i] == y.dp[t]) {
       q.dp[i - t - 1] = ((((mp_digit)1) << DIGIT_BIT) - 1);
     } else {
@@ -1344,7 +1393,11 @@ mp_div (mp_int * a, mp_int * b, mp_int * c, mp_int * d)
       q.dp[i - t - 1] = (mp_digit) (tmp & (mp_word) (MP_MASK));
     }
 
-    /* step 3.2 while (q{i-t-1} * (yt * b + y{t-1})) > xi * b^2 + xi-1 * b + xi-2 do q{i-t-1} -= 1; */
+    /* while (q{i-t-1} * (yt * b + y{t-1})) > 
+             xi * b**2 + xi-1 * b + xi-2 
+     
+       do q{i-t-1} -= 1; 
+    */
     q.dp[i - t - 1] = (q.dp[i - t - 1] + 1) & MP_MASK;
     do {
       q.dp[i - t - 1] = (q.dp[i - t - 1] - 1) & MP_MASK;
@@ -1365,7 +1418,7 @@ mp_div (mp_int * a, mp_int * b, mp_int * c, mp_int * d)
       t2.used = 3;
     } while (mp_cmp_mag(&t1, &t2) == MP_GT);
 
-    /* step 3.3 x = x - q{i-t-1} * y * b^{i-t-1} */
+    /* step 3.3 x = x - q{i-t-1} * y * b**{i-t-1} */
     if ((res = mp_mul_d (&y, q.dp[i - t - 1], &t1)) != MP_OKAY) {
       goto __Y;
     }
@@ -1378,7 +1431,7 @@ mp_div (mp_int * a, mp_int * b, mp_int * c, mp_int * d)
       goto __Y;
     }
 
-    /* step 3.4 if x < 0 then { x = x + y*b^{i-t-1}; q{i-t-1} -= 1; } */
+    /* if x < 0 then { x = x + y*b**{i-t-1}; q{i-t-1} -= 1; } */
     if (x.sign == MP_NEG) {
       if ((res = mp_copy (&y, &t1)) != MP_OKAY) {
         goto __Y;
@@ -1394,7 +1447,10 @@ mp_div (mp_int * a, mp_int * b, mp_int * c, mp_int * d)
     }
   }
 
-  /* now q is the quotient and x is the remainder [which we have to normalize] */
+  /* now q is the quotient and x is the remainder 
+   * [which we have to normalize] 
+   */
+  
   /* get sign before writing to c */
   x.sign = a->sign;
 
@@ -1549,11 +1605,14 @@ mp_div_2d (mp_int * a, int b, mp_int * c, mp_int * d)
   /* shift any bit count < DIGIT_BIT */
   D = (mp_digit) (b % DIGIT_BIT);
   if (D != 0) {
-    register mp_digit *tmpc, mask;
+    register mp_digit *tmpc, mask, shift;
 
     /* mask */
     mask = (((mp_digit)1) << D) - 1;
 
+    /* shift for lsb */
+    shift = DIGIT_BIT - D;
+
     /* alias */
     tmpc = c->dp + (c->used - 1);
 
@@ -1564,7 +1623,7 @@ mp_div_2d (mp_int * a, int b, mp_int * c, mp_int * d)
       rr = *tmpc & mask;
 
       /* shift the current word and mix in the carry bits from the previous word */
-      *tmpc = (*tmpc >> D) | (r << (DIGIT_BIT - D));
+      *tmpc = (*tmpc >> D) | (r << shift);
       --tmpc;
 
       /* set the carry to the carry bits of the current word found above */
@@ -1582,70 +1641,70 @@ mp_div_2d (mp_int * a, int b, mp_int * c, mp_int * d)
 /* End: bn_mp_div_2d.c */
 
 /* Start: bn_mp_div_3.c */
-/* LibTomMath, multiple-precision integer library -- Tom St Denis
- *
- * LibTomMath is library that provides for multiple-precision
- * integer arithmetic as well as number theoretic functionality.
- *
- * The library is designed directly after the MPI library by
- * Michael Fromberger but has been written from scratch with
- * additional optimizations in place.
- *
- * The library is free for all purposes without any express
- * guarantee it works.
- *
- * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
- */
-#include <tommath.h>
-
-/* divide by three (based on routine from MPI and the GMP manual) */
-int
-mp_div_3 (mp_int * a, mp_int *c, mp_digit * d)
-{
-  mp_int   q;
-  mp_word  w, t;
-  mp_digit b;
-  int      res, ix;
-  
-  /* b = 2**DIGIT_BIT / 3 */
-  b = (((mp_word)1) << ((mp_word)DIGIT_BIT)) / ((mp_word)3);
-
-  if ((res = mp_init_size(&q, a->used)) != MP_OKAY) {
-     return res;
-  }
-  
-  q.used = a->used;
-  q.sign = a->sign;
-  w = 0;
-  for (ix = a->used - 1; ix >= 0; ix--) {
-     w = (w << ((mp_word)DIGIT_BIT)) | ((mp_word)a->dp[ix]);
-     
-     if (w >= 3) {
-        t = (w * ((mp_word)b)) >> ((mp_word)DIGIT_BIT);
-        w -= (t << ((mp_word)1)) + t;
-        while (w >= 3) {
-           t += 1;
-           w -= 3;
-        }
-      } else {
-        t = 0;
-      }
-      q.dp[ix] = (mp_digit)t;
-  }
-  
-  if (d != NULL) {
-     *d = (mp_digit)w;
-  }
-  
-  if (c != NULL) {
-     mp_clamp(&q);
-     mp_exch(&q, c);
-  }
-  mp_clear(&q);
-  
-  return res;
-}
-
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is library that provides for multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library is designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+#include <tommath.h>
+
+/* divide by three (based on routine from MPI and the GMP manual) */
+int
+mp_div_3 (mp_int * a, mp_int *c, mp_digit * d)
+{
+  mp_int   q;
+  mp_word  w, t;
+  mp_digit b;
+  int      res, ix;
+  
+  /* b = 2**DIGIT_BIT / 3 */
+  b = (((mp_word)1) << ((mp_word)DIGIT_BIT)) / ((mp_word)3);
+
+  if ((res = mp_init_size(&q, a->used)) != MP_OKAY) {
+     return res;
+  }
+  
+  q.used = a->used;
+  q.sign = a->sign;
+  w = 0;
+  for (ix = a->used - 1; ix >= 0; ix--) {
+     w = (w << ((mp_word)DIGIT_BIT)) | ((mp_word)a->dp[ix]);
+     
+     if (w >= 3) {
+        t = (w * ((mp_word)b)) >> ((mp_word)DIGIT_BIT);
+        w -= (t << ((mp_word)1)) + t;
+        while (w >= 3) {
+           t += 1;
+           w -= 3;
+        }
+      } else {
+        t = 0;
+      }
+      q.dp[ix] = (mp_digit)t;
+  }
+  
+  if (d != NULL) {
+     *d = (mp_digit)w;
+  }
+  
+  if (c != NULL) {
+     mp_clamp(&q);
+     mp_exch(&q, c);
+  }
+  mp_clear(&q);
+  
+  return res;
+}
+
 
 /* End: bn_mp_div_3.c */
 
@@ -1671,7 +1730,8 @@ int
 mp_div_d (mp_int * a, mp_digit b, mp_int * c, mp_digit * d)
 {
   mp_int  q;
-  mp_word w, t;
+  mp_word w;
+  mp_digit t;
   int     res, ix;
   
   if (b == 0) {
@@ -1693,7 +1753,7 @@ mp_div_d (mp_int * a, mp_digit b, mp_int * c, mp_digit * d)
      w = (w << ((mp_word)DIGIT_BIT)) | ((mp_word)a->dp[ix]);
      
      if (w >= b) {
-        t = w / b;
+        t = (mp_digit)(w / b);
         w = w % b;
       } else {
         t = 0;
@@ -1718,40 +1778,40 @@ mp_div_d (mp_int * a, mp_digit b, mp_int * c, mp_digit * d)
 /* End: bn_mp_div_d.c */
 
 /* Start: bn_mp_dr_is_modulus.c */
-/* LibTomMath, multiple-precision integer library -- Tom St Denis
- *
- * LibTomMath is library that provides for multiple-precision
- * integer arithmetic as well as number theoretic functionality.
- *
- * The library is designed directly after the MPI library by
- * Michael Fromberger but has been written from scratch with
- * additional optimizations in place.
- *
- * The library is free for all purposes without any express
- * guarantee it works.
- *
- * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
- */
-#include <tommath.h>
-
-/* determines if a number is a valid DR modulus */
-int mp_dr_is_modulus(mp_int *a)
-{
-   int ix;
-
-   /* must be at least two digits */
-   if (a->used < 2) {
-      return 0;
-   }
-
-   for (ix = 1; ix < a->used; ix++) {
-       if (a->dp[ix] != MP_MASK) {
-          return 0;
-       }
-   }
-   return 1;
-}
-
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is library that provides for multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library is designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+#include <tommath.h>
+
+/* determines if a number is a valid DR modulus */
+int mp_dr_is_modulus(mp_int *a)
+{
+   int ix;
+
+   /* must be at least two digits */
+   if (a->used < 2) {
+      return 0;
+   }
+
+   for (ix = 1; ix < a->used; ix++) {
+       if (a->dp[ix] != MP_MASK) {
+          return 0;
+       }
+   }
+   return 1;
+}
+
 
 /* End: bn_mp_dr_is_modulus.c */
 
@@ -1817,7 +1877,7 @@ top:
   
   /* compute (x mod B**m) + mp * [x/B**m] inline and inplace */
   for (i = 0; i < m; i++) {
-      r         = ((mp_word)*tmpx2++) * ((mp_word)k) + *tmpx1 + mu;
+      r         = MULT(*tmpx2++, k) + *tmpx1 + mu;
       *tmpx1++  = (mp_digit)(r & MP_MASK);
       mu        = (mp_digit)(r >> ((mp_word)DIGIT_BIT));
   }
@@ -1846,32 +1906,32 @@ top:
 /* End: bn_mp_dr_reduce.c */
 
 /* Start: bn_mp_dr_setup.c */
-/* LibTomMath, multiple-precision integer library -- Tom St Denis
- *
- * LibTomMath is library that provides for multiple-precision
- * integer arithmetic as well as number theoretic functionality.
- *
- * The library is designed directly after the MPI library by
- * Michael Fromberger but has been written from scratch with
- * additional optimizations in place.
- *
- * The library is free for all purposes without any express
- * guarantee it works.
- *
- * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
- */
-#include <tommath.h>
-
-/* determines the setup value */
-void mp_dr_setup(mp_int *a, mp_digit *d)
-{
-   /* the casts are required if DIGIT_BIT is one less than
-    * the number of bits in a mp_digit [e.g. DIGIT_BIT==31]
-    */
-   *d = (mp_digit)((((mp_word)1) << ((mp_word)DIGIT_BIT)) - 
-        ((mp_word)a->dp[0]));
-}
-
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is library that provides for multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library is designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+#include <tommath.h>
+
+/* determines the setup value: stores 2**DIGIT_BIT - a->dp[0] in *d */
+void mp_dr_setup(mp_int *a, mp_digit *d)
+{
+   /* the casts to mp_word are required if DIGIT_BIT is one less than
+    * the number of bits in a mp_digit [e.g. DIGIT_BIT==31]
+    */
+   *d = (mp_digit)((((mp_word)1) << ((mp_word)DIGIT_BIT)) - 
+        ((mp_word)a->dp[0]));
+}
+
 
 /* End: bn_mp_dr_setup.c */
 
@@ -2066,10 +2126,17 @@ mp_exptmod (mp_int * G, mp_int * X, mp_int * P, mp_int * Y)
  *
  * Uses Montgomery or Diminished Radix reduction [whichever appropriate]
  */
+
+#ifdef MP_LOW_MEM
+   #define TAB_SIZE 32
+#else
+   #define TAB_SIZE 256
+#endif
+
 int
 mp_exptmod_fast (mp_int * G, mp_int * X, mp_int * P, mp_int * Y, int redmode)
 {
-  mp_int  M[256], res;
+  mp_int  M[TAB_SIZE], res;
   mp_digit buf, mp;
   int     err, bitbuf, bitcpy, bitcnt, mode, digidx, x, y, winsize;
   
@@ -2103,13 +2170,19 @@ mp_exptmod_fast (mp_int * G, mp_int * X, mp_int * P, mp_int * Y, int redmode)
   }
 #endif
 
+  /* init M array */
+  /* init first cell */
+  if ((err = mp_init(&M[1])) != MP_OKAY) {
+     return err; 
+  }
 
-  /* init G array */
-  for (x = 0; x < (1 << winsize); x++) {
-    if ((err = mp_init (&M[x])) != MP_OKAY) {
-      for (y = 0; y < x; y++) {
+  /* now init the second half of the array */
+  for (x = 1<<(winsize-1); x < (1 << winsize); x++) {
+    if ((err = mp_init(&M[x])) != MP_OKAY) {
+      for (y = 1<<(winsize-1); y < x; y++) {
         mp_clear (&M[y]);
       }
+      mp_clear(&M[1]);
       return err;
     }
   }
@@ -2302,7 +2375,8 @@ mp_exptmod_fast (mp_int * G, mp_int * X, mp_int * P, mp_int * Y, int redmode)
   err = MP_OKAY;
 __RES:mp_clear (&res);
 __M:
-  for (x = 0; x < (1 << winsize); x++) {
+  mp_clear(&M[1]);
+  for (x = 1<<(winsize-1); x < (1 << winsize); x++) {
     mp_clear (&M[x]);
   }
   return err;
@@ -2310,6 +2384,122 @@ __M:
 
 /* End: bn_mp_exptmod_fast.c */
 
+/* Start: bn_mp_fread.c */
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is library that provides for multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library is designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+#include <tommath.h>
+
+/* read a bigint from a file stream in ASCII [optional '-', then digits
+ * matched case sensitively against mp_s_rmap]; MP_VAL unless 2 <= radix <= 64 */
+int mp_fread(mp_int *a, int radix, FILE *stream)
+{
+   int err, ch, neg, y;
+
+   /* make sure the radix is ok [else the map scan overruns or never ends] */
+   if (radix < 2 || radix > 64) {
+      return MP_VAL;
+   }
+
+   /* clear a, then if first digit is - set negative */
+   mp_zero(a);
+   neg = MP_ZPOS;
+   ch  = fgetc(stream);
+   if (ch == '-') {
+      neg = MP_NEG;
+      ch  = fgetc(stream);
+   }
+
+   for (;;) {
+      /* find y in the radix map [EOF matches nothing, ending the number] */
+      for (y = 0; y < radix; y++) {
+          if (mp_s_rmap[y] == ch) {
+             break;
+          }
+      }
+      if (y == radix) {
+         break;
+      }
+      /* shift up and add */
+      if ((err = mp_mul_d(a, (mp_digit) radix, a)) != MP_OKAY) {
+         return err;
+      }
+      if ((err = mp_add_d(a, (mp_digit) y, a)) != MP_OKAY) {
+         return err;
+      }
+      ch = fgetc(stream);
+   }
+   if (mp_cmp_d(a, 0) != MP_EQ) {
+      a->sign = neg;
+   }
+   return MP_OKAY;
+}
+
+
+/* End: bn_mp_fread.c */
+
+/* Start: bn_mp_fwrite.c */
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is library that provides for multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library is designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+#include <tommath.h>
+/* writes a to stream as ASCII in the given radix; MP_OKAY, or MP_VAL/MP_MEM/a conversion error */
+int mp_fwrite(mp_int *a, int radix, FILE *stream)
+{
+   char *buf;
+   int err, len, x;
+   /* ask for the buffer size; mp_radix_size reports its failures as 0 */
+   len = mp_radix_size(a, radix);
+   if (len == 0) {
+      return MP_VAL;
+   }
+   /* scratch buffer that mp_toradix fills in below */
+   buf = malloc(len);
+   if (buf == NULL) {
+      return MP_MEM;
+   }
+   /* NOTE(review): presumably leaves the NUL terminated string in buf -- confirm */
+   if ((err = mp_toradix(a, buf, radix)) != MP_OKAY) {
+      free(buf);
+      return err;
+   }
+   /* all len bytes go out; NOTE(review): len seems to count the terminator, so it is written too -- confirm */
+   for (x = 0; x < len; x++) {
+       if (fputc(buf[x], stream) == EOF) {
+          free(buf);
+          return MP_VAL;
+       }
+   }
+   /* success: release the scratch buffer */
+   free(buf);
+   return MP_OKAY;
+}
+
+
+/* End: bn_mp_fwrite.c */
+
 /* Start: bn_mp_gcd.c */
 /* LibTomMath, multiple-precision integer library -- Tom St Denis
  *
@@ -2327,13 +2517,12 @@ __M:
  */
 #include <tommath.h>
 
-/* Greatest Common Divisor using the binary method [Algorithm B, page 338, vol2 of TAOCP]
- */
+/* Greatest Common Divisor using the binary method */
 int
 mp_gcd (mp_int * a, mp_int * b, mp_int * c)
 {
-  mp_int  u, v, t;
-  int     k, res, neg;
+  mp_int  u, v;
+  int     k, u_lsb, v_lsb, res;
 
   /* either zero than gcd is the largest */
   if (mp_iszero (a) == 1 && mp_iszero (b) == 0) {
@@ -2347,9 +2536,6 @@ mp_gcd (mp_int * a, mp_int * b, mp_int * c)
     return MP_OKAY;
   }
 
-  /* if both are negative they share (-1) as a common divisor */
-  neg = (a->sign == b->sign) ? a->sign : MP_ZPOS;
-
   if ((res = mp_init_copy (&u, a)) != MP_OKAY) {
     return res;
   }
@@ -2361,71 +2547,55 @@ mp_gcd (mp_int * a, mp_int * b, mp_int * c)
   /* must be positive for the remainder of the algorithm */
   u.sign = v.sign = MP_ZPOS;
 
-  if ((res = mp_init (&t)) != MP_OKAY) {
+  /* B1.  Find the common power of two for u and v */
+  u_lsb = mp_cnt_lsb(&u);
+  v_lsb = mp_cnt_lsb(&v);
+  k     = MIN(u_lsb, v_lsb);
+
+  if ((res = mp_div_2d(&u, k, &u, NULL)) != MP_OKAY) {
+     goto __V;
+  }
+
+  if ((res = mp_div_2d(&v, k, &v, NULL)) != MP_OKAY) {
+     goto __V;
+  }
+
+  /* divide any remaining factors of two out */
+  if (u_lsb != k) {
+     if ((res = mp_div_2d(&u, u_lsb - k, &u, NULL)) != MP_OKAY) {
+        goto __V;
+     }
+  }
+
+  if (v_lsb != k) {
+     if ((res = mp_div_2d(&v, v_lsb - k, &v, NULL)) != MP_OKAY) {
+        goto __V;
+     }
+  }
+  
+  while (mp_iszero(&v) == 0) {
+     /* make sure v is the largest */
+     if (mp_cmp_mag(&u, &v) == MP_GT) {
+        mp_exch(&u, &v);
+     }
+     
+     /* subtract smallest from largest */
+     if ((res = s_mp_sub(&v, &u, &v)) != MP_OKAY) {
+        goto __V;
+     }
+     
+     /* Divide out all factors of two */
+     if ((res = mp_div_2d(&v, mp_cnt_lsb(&v), &v, NULL)) != MP_OKAY) {
+        goto __V;
+     } 
+  } 
+  
+  /* multiply by 2**k which we divided out at the beginning */ 
+  if ((res = mp_mul_2d (&u, k, c)) != MP_OKAY) {
     goto __V;
   }
-
-  /* B1.  Find power of two */
-  k = 0;
-  while (mp_iseven(&u) == 1 && mp_iseven(&v) == 1) {
-    ++k;
-    if ((res = mp_div_2 (&u, &u)) != MP_OKAY) {
-      goto __T;
-    }
-    if ((res = mp_div_2 (&v, &v)) != MP_OKAY) {
-      goto __T;
-    }
-  }
-
-  /* B2.  Initialize */
-  if (mp_isodd(&u) == 1) {
-    /* t = -v */
-    if ((res = mp_copy (&v, &t)) != MP_OKAY) {
-      goto __T;
-    }
-    t.sign = MP_NEG;
-  } else {
-    /* t = u */
-    if ((res = mp_copy (&u, &t)) != MP_OKAY) {
-      goto __T;
-    }
-  }
-
-  do {
-    /* B3 (and B4).  Halve t, if even */
-    while (t.used != 0 && mp_iseven(&t) == 1) {
-      if ((res = mp_div_2 (&t, &t)) != MP_OKAY) {
-        goto __T;
-      }
-    }
-
-    /* B5.  if t>0 then u=t otherwise v=-t */
-    if (t.used != 0 && t.sign != MP_NEG) {
-      if ((res = mp_copy (&t, &u)) != MP_OKAY) {
-        goto __T;
-      }
-    } else {
-      if ((res = mp_copy (&t, &v)) != MP_OKAY) {
-        goto __T;
-      }
-      v.sign = (v.sign == MP_ZPOS) ? MP_NEG : MP_ZPOS;
-    }
-
-    /* B6.  t = u - v, if t != 0 loop otherwise terminate */
-    if ((res = mp_sub (&u, &v, &t)) != MP_OKAY) {
-      goto __T;
-    }
-  } while (mp_iszero(&t) == 0);
-
-  /* multiply by 2^k which we divided out at the beginning */ 
-  if ((res = mp_mul_2d (&u, k, &u)) != MP_OKAY) {
-    goto __T;
-  }
-
-  mp_exch (&u, c);
-  c->sign = neg;
+  c->sign = MP_ZPOS;
   res = MP_OKAY;
-__T:mp_clear (&t);
 __V:mp_clear (&u);
 __U:mp_clear (&v);
   return res;
@@ -2570,7 +2740,7 @@ mp_init_size (mp_int * a, int size)
 {
 
   /* pad size so there are always extra digits */
-  size += (MP_PREC * 2) - (size & (MP_PREC - 1));   
+  size += (MP_PREC * 2) - (size & (MP_PREC - 1));	
   
   /* alloc mem */
   a->dp = OPT_CAST calloc (sizeof (mp_digit), size);
@@ -2603,6 +2773,7 @@ mp_init_size (mp_int * a, int size)
  */
 #include <tommath.h>
 
+/* hac 14.61, pp608 */
 int
 mp_invmod (mp_int * a, mp_int * b, mp_int * c)
 {
@@ -2610,17 +2781,18 @@ mp_invmod (mp_int * a, mp_int * b, mp_int * c)
   int     res;
 
   /* b cannot be negative */
-  if (b->sign == MP_NEG) {
+  if (b->sign == MP_NEG || mp_iszero(b) == 1) {
     return MP_VAL;
   }
 
   /* if the modulus is odd we can use a faster routine instead */
-  if (mp_iseven (b) == 0) {
+  if (mp_isodd (b) == 1) {
     return fast_mp_invmod (a, b, c);
   }
   
   /* init temps */
-  if ((res = mp_init_multi(&x, &y, &u, &v, &A, &B, &C, &D, NULL)) != MP_OKAY) {
+  if ((res = mp_init_multi(&x, &y, &u, &v, 
+                           &A, &B, &C, &D, NULL)) != MP_OKAY) {
      return res;
   }
 
@@ -2632,10 +2804,6 @@ mp_invmod (mp_int * a, mp_int * b, mp_int * c)
     goto __ERR;
   }
 
-  if ((res = mp_abs (&x, &x)) != MP_OKAY) {
-    goto __ERR;
-  }
-
   /* 2. [modified] if x,y are both even then return an error! */
   if (mp_iseven (&x) == 1 && mp_iseven (&y) == 1) {
     res = MP_VAL;
@@ -2652,7 +2820,6 @@ mp_invmod (mp_int * a, mp_int * b, mp_int * c)
   mp_set (&A, 1);
   mp_set (&D, 1);
 
-
 top:
   /* 4.  while u is even do */
   while (mp_iseven (&u) == 1) {
@@ -2661,13 +2828,13 @@ top:
       goto __ERR;
     }
     /* 4.2 if A or B is odd then */
-    if (mp_iseven (&A) == 0 || mp_iseven (&B) == 0) {
+    if (mp_isodd (&A) == 1 || mp_isodd (&B) == 1) {
       /* A = (A+y)/2, B = (B-x)/2 */
       if ((res = mp_add (&A, &y, &A)) != MP_OKAY) {
-    goto __ERR;
+         goto __ERR;
       }
       if ((res = mp_sub (&B, &x, &B)) != MP_OKAY) {
-    goto __ERR;
+         goto __ERR;
       }
     }
     /* A = A/2, B = B/2 */
@@ -2679,21 +2846,20 @@ top:
     }
   }
 
-
   /* 5.  while v is even do */
   while (mp_iseven (&v) == 1) {
     /* 5.1 v = v/2 */
     if ((res = mp_div_2 (&v, &v)) != MP_OKAY) {
       goto __ERR;
     }
-    /* 5.2 if C,D are even then */
-    if (mp_iseven (&C) == 0 || mp_iseven (&D) == 0) {
+    /* 5.2 if C or D is odd then */
+    if (mp_isodd (&C) == 1 || mp_isodd (&D) == 1) {
       /* C = (C+y)/2, D = (D-x)/2 */
       if ((res = mp_add (&C, &y, &C)) != MP_OKAY) {
-    goto __ERR;
+         goto __ERR;
       }
       if ((res = mp_sub (&D, &x, &D)) != MP_OKAY) {
-    goto __ERR;
+         goto __ERR;
       }
     }
     /* C = C/2, D = D/2 */
@@ -2746,10 +2912,23 @@ top:
     goto __ERR;
   }
 
-  /* a is now the inverse */
+  /* if it's too low [negative], add the modulus until it is not */
+  while (mp_cmp_d(&C, 0) == MP_LT) {
+      if ((res = mp_add(&C, b, &C)) != MP_OKAY) {
+         goto __ERR;
+      }
+  }
+  
+  /* too big [magnitude >= b], subtract the modulus until it fits */
+  while (mp_cmp_mag(&C, b) != MP_LT) {
+      if ((res = mp_sub(&C, b, &C)) != MP_OKAY) {
+         goto __ERR;
+      }
+  }
+  
+  /* C is now the inverse */
   mp_exch (&C, c);
   res = MP_OKAY;
-
 __ERR:mp_clear_multi (&x, &y, &u, &v, &A, &B, &C, &D, NULL);
   return res;
 }
@@ -2777,10 +2956,10 @@ __ERR:mp_clear_multi (&x, &y, &u, &v, &A, &B, &C, &D, NULL);
  * HAC pp. 73 Algorithm 2.149
  */
 int
-mp_jacobi (mp_int * a, mp_int * n, int *c)
+mp_jacobi (mp_int * a, mp_int * p, int *c)
 {
-  mp_int  a1, n1, e;
-  int     s, r, res;
+  mp_int  a1, p1;
+  int     k, s, r, res;
   mp_digit residue;
 
   /* step 1.  if a == 0, return 0 */
@@ -2796,39 +2975,30 @@ mp_jacobi (mp_int * a, mp_int * n, int *c)
   }
 
   /* default */
-  s = 0;
+  k = s = 0;
 
-  /* step 3.  write a = a1 * 2^e  */
+  /* step 3.  write a = a1 * 2**k  */
   if ((res = mp_init_copy (&a1, a)) != MP_OKAY) {
     return res;
   }
 
-  if ((res = mp_init (&n1)) != MP_OKAY) {
+  if ((res = mp_init (&p1)) != MP_OKAY) {
     goto __A1;
   }
 
-  if ((res = mp_init (&e)) != MP_OKAY) {
-    goto __N1;
-  }
-
   while (mp_iseven (&a1) == 1) {
-    if ((res = mp_add_d (&e, 1, &e)) != MP_OKAY) {
-      goto __E;
-    }
-
+    k = k + 1;
     if ((res = mp_div_2 (&a1, &a1)) != MP_OKAY) {
-      goto __E;
+      goto __P1;
     }
   }
 
   /* step 4.  if e is even set s=1 */
-  if (mp_iseven (&e) == 1) {
+  if ((k & 1) == 0) {
     s = 1;
   } else {
-    /* else set s=1 if n = 1/7 (mod 8) or s=-1 if n = 3/5 (mod 8) */
-    if ((res = mp_mod_d (n, 8, &residue)) != MP_OKAY) {
-      goto __E;
-    }
+    /* else set s=1 if p = 1/7 (mod 8) or s=-1 if p = 3/5 (mod 8) */
+    residue = p->dp[0] & 7;
 
     if (residue == 1 || residue == 7) {
       s = 1;
@@ -2837,17 +3007,9 @@ mp_jacobi (mp_int * a, mp_int * n, int *c)
     }
   }
 
-  /* step 5.  if n == 3 (mod 4) *and* a1 == 3 (mod 4) then s = -s */
-  if ((res = mp_mod_d (n, 4, &residue)) != MP_OKAY) {
-    goto __E;
-  }
-  if (residue == 3) {
-    if ((res = mp_mod_d (&a1, 4, &residue)) != MP_OKAY) {
-      goto __E;
-    }
-    if (residue == 3) {
-      s = -s;
-    }
+  /* step 5.  if p == 3 (mod 4) *and* a1 == 3 (mod 4) then s = -s */
+  if ( ((p->dp[0] & 3) == 3) && ((a1.dp[0] & 3) == 3)) {
+    s = -s;
   }
 
   /* if a1 == 1 we're done */
@@ -2855,19 +3017,18 @@ mp_jacobi (mp_int * a, mp_int * n, int *c)
     *c = s;
   } else {
     /* n1 = n mod a1 */
-    if ((res = mp_mod (n, &a1, &n1)) != MP_OKAY) {
-      goto __E;
+    if ((res = mp_mod (p, &a1, &p1)) != MP_OKAY) {
+      goto __P1;
     }
-    if ((res = mp_jacobi (&n1, &a1, &r)) != MP_OKAY) {
-      goto __E;
+    if ((res = mp_jacobi (&p1, &a1, &r)) != MP_OKAY) {
+      goto __P1;
     }
     *c = s * r;
   }
 
   /* done */
   res = MP_OKAY;
-__E:mp_clear (&e);
-__N1:mp_clear (&n1);
+__P1:mp_clear (&p1);
 __A1:mp_clear (&a1);
   return res;
 }
@@ -3505,7 +3666,7 @@ mp_montgomery_reduce (mp_int * x, mp_int * n, mp_digit rho)
 
   for (ix = 0; ix < n->used; ix++) {
     /* mu = ai * m' mod b */
-    mu = (x->dp[ix] * rho) & MP_MASK;
+    mu = MULT(x->dp[ix], rho) & MP_MASK;
 
     /* a = a + mu * m * b**i */
     {
@@ -3522,7 +3683,7 @@ mp_montgomery_reduce (mp_int * x, mp_int * n, mp_digit rho)
       
       /* Multiply and add in place */
       for (iy = 0; iy < n->used; iy++) {
-        r       = ((mp_word) mu) * ((mp_word) * tmpn++) + 
+        r       = MULT(mu, *tmpn++) +
                   ((mp_word) u) + ((mp_word) * tmpx);
         u       = (mp_digit)(r >> ((mp_word) DIGIT_BIT));
         *tmpx++ = (mp_digit)(r & ((mp_word) MP_MASK));
@@ -3758,12 +3919,6 @@ mp_mul_2 (mp_int * a, mp_int * b)
  */
 #include <tommath.h>
 
-/* NOTE:  This routine requires updating.  For instance the c->used = c->alloc bit
-   is wrong.  We should just shift c->used digits then set the carry as c->dp[c->used] = carry
- 
-   To be fixed for LTM 0.18
- */
-
 /* shift left by a certain bit count */
 int
 mp_mul_2d (mp_int * a, int b, mp_int * c)
@@ -3778,8 +3933,8 @@ mp_mul_2d (mp_int * a, int b, mp_int * c)
      }
   }
 
-  if (c->alloc < (int)(c->used + b/DIGIT_BIT + 2)) {
-     if ((res = mp_grow (c, c->used + b / DIGIT_BIT + 2)) != MP_OKAY) {
+  if (c->alloc < (int)(c->used + b/DIGIT_BIT + 1)) {
+     if ((res = mp_grow (c, c->used + b / DIGIT_BIT + 1)) != MP_OKAY) {
        return res;
      }
   }
@@ -3790,17 +3945,19 @@ mp_mul_2d (mp_int * a, int b, mp_int * c)
       return res;
     }
   }
-  c->used = c->alloc;
 
   /* shift any bit count < DIGIT_BIT */
   d = (mp_digit) (b % DIGIT_BIT);
   if (d != 0) {
-    register mp_digit *tmpc, mask, r, rr;
+    register mp_digit *tmpc, shift, mask, r, rr;
     register int x;
 
     /* bitmask for carries */
     mask = (((mp_digit)1) << d) - 1;
 
+    /* shift for msbs */
+    shift = DIGIT_BIT - d;
+
     /* alias */
     tmpc = c->dp;
 
@@ -3808,7 +3965,7 @@ mp_mul_2d (mp_int * a, int b, mp_int * c)
     r    = 0;
     for (x = 0; x < c->used; x++) {
       /* get the higher bits of the current word */
-      rr = (*tmpc >> (DIGIT_BIT - d)) & mask;
+      rr = (*tmpc >> shift) & mask;
 
       /* shift the current word and OR in the carry */
       *tmpc = ((*tmpc << d) | r) & MP_MASK;
@@ -3817,6 +3974,11 @@ mp_mul_2d (mp_int * a, int b, mp_int * c)
       /* set the carry to the carry bits of the current word */
       r = rr;
     }
+    
+    /* set final carry */
+    if (r != 0) {
+       c->dp[c->used++] = r;
+    }
   }
   mp_clamp (c);
   return MP_OKAY;
@@ -3860,6 +4022,7 @@ mp_mul_d (mp_int * a, mp_digit b, mp_int * c)
 
   /* set the new temporary used count */
   c->used = pa + 1;
+  c->sign = a->sign;
 
   {
     register mp_digit u, *tmpa, *tmpc;
@@ -3876,7 +4039,7 @@ mp_mul_d (mp_int * a, mp_digit b, mp_int * c)
     u = 0;
     for (ix = 0; ix < pa; ix++) {
       /* compute product and carry sum for this term */
-      r = ((mp_word) u) + ((mp_word) * tmpa++) * ((mp_word) b);
+      r = ((mp_word) u) + MULT(*tmpa++, b);
 
       /* mask off higher bits to get a single digit */
       *tmpc++ = (mp_digit) (r & ((mp_word) MP_MASK));
@@ -4026,11 +4189,13 @@ void mp_clear_multi(mp_int *mp, ...)
 
 /* find the n'th root of an integer 
  *
- * Result found such that (c)^b <= a and (c+1)^b > a 
+ * Result found such that (c)**b <= a and (c+1)**b > a 
  *
- * This algorithm uses Newton's approximation x[i+1] = x[i] - f(x[i])/f'(x[i]) 
- * which will find the root in log(N) time where each step involves a fair bit.  This
- * is not meant to find huge roots [square and cube at most].
+ * This algorithm uses Newton's approximation 
+ * x[i+1] = x[i] - f(x[i])/f'(x[i]) 
+ * which will find the root in log(N) time where 
+ * each step involves a fair bit.  This is not meant to 
+ * find huge roots [square and cube, etc].
  */
 int
 mp_n_root (mp_int * a, mp_digit b, mp_int * c)
@@ -4068,33 +4233,39 @@ mp_n_root (mp_int * a, mp_digit b, mp_int * c)
       goto __T3;
     }
 
-    /* t2 = t1 - ((t1^b - a) / (b * t1^(b-1))) */
-    if ((res = mp_expt_d (&t1, b - 1, &t3)) != MP_OKAY) {   /* t3 = t1^(b-1) */
+    /* t2 = t1 - ((t1**b - a) / (b * t1**(b-1))) */
+    
+    /* t3 = t1**(b-1) */
+    if ((res = mp_expt_d (&t1, b - 1, &t3)) != MP_OKAY) {   
       goto __T3;
     }
 
     /* numerator */
-    if ((res = mp_mul (&t3, &t1, &t2)) != MP_OKAY) {    /* t2 = t1^b */
+    /* t2 = t1**b */
+    if ((res = mp_mul (&t3, &t1, &t2)) != MP_OKAY) {    
       goto __T3;
     }
 
-    if ((res = mp_sub (&t2, a, &t2)) != MP_OKAY) {  /* t2 = t1^b - a */
+    /* t2 = t1**b - a */
+    if ((res = mp_sub (&t2, a, &t2)) != MP_OKAY) {  
       goto __T3;
     }
 
-    if ((res = mp_mul_d (&t3, b, &t3)) != MP_OKAY) {    /* t3 = t1^(b-1) * b  */
+    /* denominator */
+    /* t3 = t1**(b-1) * b  */
+    if ((res = mp_mul_d (&t3, b, &t3)) != MP_OKAY) {    
       goto __T3;
     }
 
-    if ((res = mp_div (&t2, &t3, &t3, NULL)) != MP_OKAY) {  /* t3 = (t1^b - a)/(b * t1^(b-1)) */
+    /* t3 = (t1**b - a)/(b * t1**(b-1)) */
+    if ((res = mp_div (&t2, &t3, &t3, NULL)) != MP_OKAY) {  
       goto __T3;
     }
 
     if ((res = mp_sub (&t1, &t3, &t2)) != MP_OKAY) {
       goto __T3;
     }
-  }
-  while (mp_cmp (&t1, &t2) != MP_EQ);
+  }  while (mp_cmp (&t1, &t2) != MP_EQ);
 
   /* result can be off by a few so check */
   for (;;) {
@@ -4229,9 +4400,9 @@ mp_or (mp_int * a, mp_int * b, mp_int * c)
 
 /* performs one Fermat test.
  * 
- * If "a" were prime then b^a == b (mod a) since the order of
+ * If "a" were prime then b**a == b (mod a) since the order of
  * the multiplicative sub-group would be phi(a) = a-1.  That means
- * it would be the same as b^(a mod (a-1)) == b^1 == b (mod a).
+ * it would be the same as b**(a mod (a-1)) == b**1 == b (mod a).
  *
  * Sets result to 1 if the congruence holds, or zero otherwise.
  */
@@ -4249,7 +4420,7 @@ mp_prime_fermat (mp_int * a, mp_int * b, int *result)
     return err;
   }
 
-  /* compute t = b^a mod a */
+  /* compute t = b**a mod a */
   if ((err = mp_exptmod (b, a, a, &t)) != MP_OKAY) {
     goto __T;
   }
@@ -4283,7 +4454,8 @@ __T:mp_clear (&t);
  */
 #include <tommath.h>
 
-/* determines if an integers is divisible by one of the first 256 primes or not
+/* determines if an integer is divisible by one
+ * of the first PRIME_SIZE primes or not
  *
  * sets result to 0 if not, 1 if yes
  */
@@ -4297,12 +4469,6 @@ mp_prime_is_divisible (mp_int * a, int *result)
   *result = 0;
 
   for (ix = 0; ix < PRIME_SIZE; ix++) {
-    /* is it equal to the prime? */
-    if (mp_cmp_d (a, __prime_tab[ix]) == MP_EQ) {
-      *result = 1;
-      return MP_OKAY;
-    }
-
     /* what is a mod __prime_tab[ix] */
     if ((err = mp_mod_d (a, __prime_tab[ix], &res)) != MP_OKAY) {
       return err;
@@ -4441,19 +4607,17 @@ mp_prime_miller_rabin (mp_int * a, mp_int * b, int *result)
     goto __N1;
   }
 
-  /* set 2^s * r = n1 */
+  /* set 2**s * r = n1 */
   if ((err = mp_init_copy (&r, &n1)) != MP_OKAY) {
     goto __N1;
   }
-  s = 0;
-  while (mp_iseven (&r) == 1) {
-    ++s;
-    if ((err = mp_div_2 (&r, &r)) != MP_OKAY) {
-      goto __R;
-    }
+ 
+  s = mp_cnt_lsb(&r);
+  if ((err = mp_div_2d (&r, s, &r, NULL)) != MP_OKAY) {
+    goto __R;
   }
 
-  /* compute y = b^r mod a */
+  /* compute y = b**r mod a */
   if ((err = mp_init (&y)) != MP_OKAY) {
     goto __R;
   }
@@ -4467,12 +4631,12 @@ mp_prime_miller_rabin (mp_int * a, mp_int * b, int *result)
     /* while j <= s-1 and y != n1 */
     while ((j <= (s - 1)) && mp_cmp (&y, &n1) != MP_EQ) {
       if ((err = mp_sqrmod (&y, a, &y)) != MP_OKAY) {
-    goto __Y;
+         goto __Y;
       }
 
       /* if y == 1 then composite */
       if (mp_cmp_d (&y, 1) == MP_EQ) {
-    goto __Y;
+         goto __Y;
       }
 
       ++j;
@@ -4552,6 +4716,86 @@ int mp_prime_next_prime(mp_int *a, int t)
 
 /* End: bn_mp_prime_next_prime.c */
 
+/* Start: bn_mp_radix_size.c */
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is library that provides for multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library is designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+#include <tommath.h>
+
+/* returns size of ASCII representation [chars plus one, presumably for the NUL], 0 on error */
+int
+mp_radix_size (mp_int * a, int radix)
+{
+  int     res, digs;
+  mp_int  t;
+  mp_digit d;
+
+  if (radix < 2 || radix > 64) {
+    return 0;
+  }
+  /* zero has no bits or digits to count but still needs room for "0" */
+  if (mp_iszero (a) == 1) {
+    return 2;
+  }
+  /* special case for binary */
+  if (radix == 2) {
+    return mp_count_bits (a) + (a->sign == MP_NEG ? 1 : 0) + 1;
+  }
+  if ((res = mp_init_copy (&t, a)) != MP_OKAY) {
+    return 0;
+  }
+  digs = 0;
+  if (t.sign == MP_NEG) {
+    ++digs;
+    t.sign = MP_ZPOS;
+  }
+  while (mp_iszero (&t) == 0) {
+    if ((res = mp_div_d (&t, (mp_digit) radix, &t, &d)) != MP_OKAY) {
+      mp_clear (&t);
+      return 0;
+    }
+    ++digs;
+  }
+  mp_clear (&t);
+  return digs + 1;
+}
+
+
+/* End: bn_mp_radix_size.c */
+
+/* Start: bn_mp_radix_smap.c */
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is library that provides for multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library is designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+#include <tommath.h>
+
+/* chars used in radix conversions: the char at index i stands for digit value i [0..63] */
+const char *mp_s_rmap = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz+/";
+
+/* End: bn_mp_radix_smap.c */
+
 /* Start: bn_mp_rand.c */
 /* LibTomMath, multiple-precision integer library -- Tom St Denis
  *
@@ -4605,6 +4849,87 @@ mp_rand (mp_int * a, int digits)
 
 /* End: bn_mp_rand.c */
 
+/* Start: bn_mp_read_radix.c */
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is library that provides for multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library is designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+#include <tommath.h>
+
+/* read a string [ASCII] in a given radix, MP_VAL if the radix is not in 2..64 */
+int
+mp_read_radix (mp_int * a, char *str, int radix)
+{
+  int     y, res, neg;
+  char    ch;
+
+  /* make sure the radix is ok */
+  if (radix < 2 || radix > 64) {
+    return MP_VAL;
+  }
+
+  /* if the leading digit is a
+   * minus set the sign to negative.
+   */
+  if (*str == '-') {
+    ++str;
+    neg = MP_NEG;
+  } else {
+    neg = MP_ZPOS;
+  }
+
+  /* set the integer to the default of zero */
+  mp_zero (a);
+
+  /* process each digit of the string */
+  while (*str) {
+    /* if the radix <= 36 the conversion is case insensitive [the digits
+     * 0-9A-Z have no lower case twins below index 36], so 1AB and 1ab are
+     * the same value.  toupper needs an unsigned char value, hence the cast
+     */
+    ch = (char) ((radix <= 36) ? toupper ((unsigned char) *str) : *str);
+    for (y = 0; y < 64; y++) {
+      if (ch == mp_s_rmap[y]) {
+         break;
+      }
+    }
+
+    /* if the char was found in the map
+     * and is less than the given radix add it
+     * to the number, otherwise exit the loop.
+     */
+    if (y < radix) {
+      if ((res = mp_mul_d (a, (mp_digit) radix, a)) != MP_OKAY) {
+         return res;
+      }
+      if ((res = mp_add_d (a, (mp_digit) y, a)) != MP_OKAY) {
+         return res;
+      }
+    } else {
+      break;
+    }
+    ++str;
+  }
+
+  /* set the sign only if a != 0 */
+  if (mp_iszero(a) != 1) {
+     a->sign = neg;
+  }
+  return MP_OKAY;
+}
+
+/* End: bn_mp_read_radix.c */
+
 /* Start: bn_mp_read_signed_bin.c */
 /* LibTomMath, multiple-precision integer library -- Tom St Denis
  *
@@ -4769,183 +5094,183 @@ CLEANUP:
 /* End: bn_mp_reduce.c */
 
 /* Start: bn_mp_reduce_2k.c */
-/* LibTomMath, multiple-precision integer library -- Tom St Denis
- *
- * LibTomMath is library that provides for multiple-precision
- * integer arithmetic as well as number theoretic functionality.
- *
- * The library is designed directly after the MPI library by
- * Michael Fromberger but has been written from scratch with
- * additional optimizations in place.
- *
- * The library is free for all purposes without any express
- * guarantee it works.
- *
- * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
- */
-#include <tommath.h>
-
-/* reduces a modulo n where n is of the form 2**p - k */
-int
-mp_reduce_2k(mp_int *a, mp_int *n, mp_digit k)
-{
-   mp_int q;
-   int    p, res;
-   
-   if ((res = mp_init(&q)) != MP_OKAY) {
-      return res;
-   }
-   
-   p = mp_count_bits(n);    
-top:
-   /* q = a/2**p, a = a mod 2**p */
-   if ((res = mp_div_2d(a, p, &q, a)) != MP_OKAY) {
-      goto ERR;
-   }
-   
-   if (k != 1) {
-      /* q = q * k */
-      if ((res = mp_mul_d(&q, k, &q)) != MP_OKAY) { 
-         goto ERR;
-      }
-   }
-   
-   /* a = a + q */
-   if ((res = s_mp_add(a, &q, a)) != MP_OKAY) {
-      goto ERR;
-   }
-   
-   if (mp_cmp_mag(a, n) != MP_LT) {
-      s_mp_sub(a, n, a);
-      goto top;
-   }
-   
-ERR:
-   mp_clear(&q);
-   return res;
-}
-
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is library that provides for multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library is designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+#include <tommath.h>
+
+/* reduces a modulo n where n is of the form 2**p - k [result left in a] */
+int
+mp_reduce_2k(mp_int *a, mp_int *n, mp_digit k)
+{
+   mp_int q;
+   int    p, res;
+
+   if ((res = mp_init(&q)) != MP_OKAY) {
+      return res;
+   }
+
+   p = mp_count_bits(n);
+top:
+   /* q = a/2**p, a = a mod 2**p */
+   if ((res = mp_div_2d(a, p, &q, a)) != MP_OKAY) {
+      goto ERR;
+   }
+
+   if (k != 1) {
+      /* q = q * k */
+      if ((res = mp_mul_d(&q, k, &q)) != MP_OKAY) {
+         goto ERR;
+      }
+   }
+
+   /* a = a + q */
+   if ((res = s_mp_add(a, &q, a)) != MP_OKAY) {
+      goto ERR;
+   }
+   /* still >= n: take n off and fold again, unless the subtraction failed */
+   if (mp_cmp_mag(a, n) != MP_LT) {
+      if ((res = s_mp_sub(a, n, a)) == MP_OKAY) {
+         goto top;
+      }
+   }
+ERR:
+   mp_clear(&q);
+   return res;
+}
+
 
 /* End: bn_mp_reduce_2k.c */
 
 /* Start: bn_mp_reduce_2k_setup.c */
-/* LibTomMath, multiple-precision integer library -- Tom St Denis
- *
- * LibTomMath is library that provides for multiple-precision
- * integer arithmetic as well as number theoretic functionality.
- *
- * The library is designed directly after the MPI library by
- * Michael Fromberger but has been written from scratch with
- * additional optimizations in place.
- *
- * The library is free for all purposes without any express
- * guarantee it works.
- *
- * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
- */
-#include <tommath.h>
-
-/* determines the setup value */
-int 
-mp_reduce_2k_setup(mp_int *a, mp_digit *d)
-{
-   int res, p;
-   mp_int tmp;
-   
-   if ((res = mp_init(&tmp)) != MP_OKAY) {
-      return res;
-   }
-   
-   p = mp_count_bits(a);
-   if ((res = mp_2expt(&tmp, p)) != MP_OKAY) {
-      mp_clear(&tmp);
-      return res;
-   }
-   
-   if ((res = s_mp_sub(&tmp, a, &tmp)) != MP_OKAY) {
-      mp_clear(&tmp);
-      return res;
-   }
-   
-   *d = tmp.dp[0];
-   mp_clear(&tmp);
-   return MP_OKAY;
-}
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is library that provides for multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library is designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+#include <tommath.h>
+
+/* determines the setup value for mp_reduce_2k: stores in *d the lowest digit of (tmp - a), where tmp is built by mp_2expt from the bit count of a */
+int 
+mp_reduce_2k_setup(mp_int *a, mp_digit *d)
+{
+   int res, p;
+   mp_int tmp;
+   
+   if ((res = mp_init(&tmp)) != MP_OKAY) {
+      return res;
+   }
+   /* tmp = 2**p with p = mp_count_bits(a) (presumably what mp_2expt computes -- confirm) */
+   p = mp_count_bits(a);
+   if ((res = mp_2expt(&tmp, p)) != MP_OKAY) {
+      mp_clear(&tmp);
+      return res;
+   }
+   /* tmp = tmp - a, i.e. the k in a = 2**p - k */
+   if ((res = s_mp_sub(&tmp, a, &tmp)) != MP_OKAY) {
+      mp_clear(&tmp);
+      return res;
+   }
+   /* only digit 0 of the difference is handed back; any higher digits of tmp are dropped */
+   *d = tmp.dp[0];
+   mp_clear(&tmp);
+   return MP_OKAY;
+}
 
 /* End: bn_mp_reduce_2k_setup.c */
 
 /* Start: bn_mp_reduce_is_2k.c */
-/* LibTomMath, multiple-precision integer library -- Tom St Denis
- *
- * LibTomMath is library that provides for multiple-precision
- * integer arithmetic as well as number theoretic functionality.
- *
- * The library is designed directly after the MPI library by
- * Michael Fromberger but has been written from scratch with
- * additional optimizations in place.
- *
- * The library is free for all purposes without any express
- * guarantee it works.
- *
- * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
- */
-#include <tommath.h>
-
-/* determines if mp_reduce_2k can be used */
-int 
-mp_reduce_is_2k(mp_int *a)
-{
-   int ix, iy;
-   
-   if (a->used == 0) {
-      return 0;
-   } else if (a->used == 1) {
-      return 1;
-   } else if (a->used > 1) {
-      iy = mp_count_bits(a);
-      for (ix = DIGIT_BIT; ix < iy; ix++) {
-          if ((a->dp[ix/DIGIT_BIT] & 
-              ((mp_digit)1 << (mp_digit)(ix % DIGIT_BIT))) == 0) {
-             return 0;
-          }
-      }
-   }
-   return 1;
-}
-
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is library that provides for multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library is designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+#include <tommath.h>
+
+/* determines if mp_reduce_2k can be used: returns 1 for a single-digit a, or when every bit position from DIGIT_BIT up to mp_count_bits(a) - 1 is set; otherwise 0 */
+int 
+mp_reduce_is_2k(mp_int *a)
+{
+   int ix, iy;
+   /* ix walks bit positions; iy is the bit count reported by mp_count_bits */
+   if (a->used == 0) {
+      return 0;   /* no digits in use */
+   } else if (a->used == 1) {
+      return 1;   /* a single digit is accepted without inspecting its bits */
+   } else if (a->used > 1) {
+      iy = mp_count_bits(a);
+      for (ix = DIGIT_BIT; ix < iy; ix++) {   /* start at the first bit of dp[1] */
+          if ((a->dp[ix/DIGIT_BIT] & 
+              ((mp_digit)1 << (mp_digit)(ix % DIGIT_BIT))) == 0) {
+             return 0;   /* a clear bit above the lowest digit rules a out */
+          }
+      }
+   }
+   return 1;
+}
+
 
 /* End: bn_mp_reduce_is_2k.c */
 
 /* Start: bn_mp_reduce_setup.c */
-/* LibTomMath, multiple-precision integer library -- Tom St Denis
- *
- * LibTomMath is library that provides for multiple-precision
- * integer arithmetic as well as number theoretic functionality.
- *
- * The library is designed directly after the MPI library by
- * Michael Fromberger but has been written from scratch with
- * additional optimizations in place.
- *
- * The library is free for all purposes without any express
- * guarantee it works.
- *
- * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
- */
-#include <tommath.h>
-
-/* pre-calculate the value required for Barrett reduction
- * For a given modulus "b" it calulates the value required in "a"
- */
-int
-mp_reduce_setup (mp_int * a, mp_int * b)
-{
-  int     res;
-  
-  if ((res = mp_2expt (a, b->used * 2 * DIGIT_BIT)) != MP_OKAY) {
-    return res;
-  }
-  return mp_div (a, b, a, NULL);
-}
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is library that provides for multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library is designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+#include <tommath.h>
+
+/* pre-calculate the value required for Barrett reduction
+ * For a given modulus "b" it calculates the value required in "a"; returns the status of mp_2expt on failure, else that of mp_div
+ */
+int
+mp_reduce_setup (mp_int * a, mp_int * b)
+{
+  int     res;
+  /* a = 2**(2 * b->used * DIGIT_BIT) (presumably what mp_2expt computes -- confirm) */
+  if ((res = mp_2expt (a, b->used * 2 * DIGIT_BIT)) != MP_OKAY) {
+    return res;
+  }
+  return mp_div (a, b, a, NULL);   /* a is both dividend and output; the fourth (NULL) output is not requested */
+}
 
 /* End: bn_mp_reduce_setup.c */
 
@@ -5449,506 +5774,580 @@ mp_to_unsigned_bin (mp_int * a, unsigned char *b)
 /* End: bn_mp_to_unsigned_bin.c */
 
 /* Start: bn_mp_toom_mul.c */
-/* LibTomMath, multiple-precision integer library -- Tom St Denis
- *
- * LibTomMath is library that provides for multiple-precision
- * integer arithmetic as well as number theoretic functionality.
- *
- * The library is designed directly after the MPI library by
- * Michael Fromberger but has been written from scratch with
- * additional optimizations in place.
- *
- * The library is free for all purposes without any express
- * guarantee it works.
- *
- * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
- */
-#include <tommath.h>
-
-/* multiplication using the Toom-Cook 3-way algorithm */
-int 
-mp_toom_mul(mp_int *a, mp_int *b, mp_int *c)
-{
-    mp_int w0, w1, w2, w3, w4, tmp1, tmp2, a0, a1, a2, b0, b1, b2;
-    int res, B;
-        
-    /* init temps */
-    if ((res = mp_init_multi(&w0, &w1, &w2, &w3, &w4, 
-                             &a0, &a1, &a2, &b0, &b1, 
-                             &b2, &tmp1, &tmp2, NULL)) != MP_OKAY) {
-       return res;
-    }
-    
-    /* B */
-    B = MIN(a->used, b->used) / 3;
-    
-    /* a = a2 * B**2 + a1 * B + a0 */
-    if ((res = mp_mod_2d(a, DIGIT_BIT * B, &a0)) != MP_OKAY) {
-       goto ERR;
-    }
-
-    if ((res = mp_copy(a, &a1)) != MP_OKAY) {
-       goto ERR;
-    }
-    mp_rshd(&a1, B);
-    mp_mod_2d(&a1, DIGIT_BIT * B, &a1);
-
-    if ((res = mp_copy(a, &a2)) != MP_OKAY) {
-       goto ERR;
-    }
-    mp_rshd(&a2, B*2);
-    
-    /* b = b2 * B**2 + b1 * B + b0 */
-    if ((res = mp_mod_2d(b, DIGIT_BIT * B, &b0)) != MP_OKAY) {
-       goto ERR;
-    }
-
-    if ((res = mp_copy(b, &b1)) != MP_OKAY) {
-       goto ERR;
-    }
-    mp_rshd(&b1, B);
-    mp_mod_2d(&b1, DIGIT_BIT * B, &b1);
-
-    if ((res = mp_copy(b, &b2)) != MP_OKAY) {
-       goto ERR;
-    }
-    mp_rshd(&b2, B*2);
-    
-    /* w0 = a0*b0 */
-    if ((res = mp_mul(&a0, &b0, &w0)) != MP_OKAY) {
-       goto ERR;
-    }
-    
-    /* w4 = a2 * b2 */
-    if ((res = mp_mul(&a2, &b2, &w4)) != MP_OKAY) {
-       goto ERR;
-    }
-    
-    /* w1 = (a2 + 2(a1 + 2a0))(b2 + 2(b1 + 2b0)) */
-    if ((res = mp_mul_2(&a0, &tmp1)) != MP_OKAY) {
-       goto ERR;
-    }
-    if ((res = mp_add(&tmp1, &a1, &tmp1)) != MP_OKAY) {
-       goto ERR;
-    }
-    if ((res = mp_mul_2(&tmp1, &tmp1)) != MP_OKAY) {
-       goto ERR;
-    }
-    if ((res = mp_add(&tmp1, &a2, &tmp1)) != MP_OKAY) {
-       goto ERR;
-    }
-    
-    if ((res = mp_mul_2(&b0, &tmp2)) != MP_OKAY) {
-       goto ERR;
-    }
-    if ((res = mp_add(&tmp2, &b1, &tmp2)) != MP_OKAY) {
-       goto ERR;
-    }
-    if ((res = mp_mul_2(&tmp2, &tmp2)) != MP_OKAY) {
-       goto ERR;
-    }
-    if ((res = mp_add(&tmp2, &b2, &tmp2)) != MP_OKAY) {
-       goto ERR;
-    }
-    
-    if ((res = mp_mul(&tmp1, &tmp2, &w1)) != MP_OKAY) {
-       goto ERR;
-    }
-    
-    /* w3 = (a0 + 2(a1 + 2a2))(b0 + 2(b1 + 2b2)) */
-    if ((res = mp_mul_2(&a2, &tmp1)) != MP_OKAY) {
-       goto ERR;
-    }
-    if ((res = mp_add(&tmp1, &a1, &tmp1)) != MP_OKAY) {
-       goto ERR;
-    }
-    if ((res = mp_mul_2(&tmp1, &tmp1)) != MP_OKAY) {
-       goto ERR;
-    }
-    if ((res = mp_add(&tmp1, &a0, &tmp1)) != MP_OKAY) {
-       goto ERR;
-    }
-    
-    if ((res = mp_mul_2(&b2, &tmp2)) != MP_OKAY) {
-       goto ERR;
-    }
-    if ((res = mp_add(&tmp2, &b1, &tmp2)) != MP_OKAY) {
-       goto ERR;
-    }
-    if ((res = mp_mul_2(&tmp2, &tmp2)) != MP_OKAY) {
-       goto ERR;
-    }
-    if ((res = mp_add(&tmp2, &b0, &tmp2)) != MP_OKAY) {
-       goto ERR;
-    }
-    
-    if ((res = mp_mul(&tmp1, &tmp2, &w3)) != MP_OKAY) {
-       goto ERR;
-    }
-    
-
-    /* w2 = (a2 + a1 + a0)(b2 + b1 + b0) */
-    if ((res = mp_add(&a2, &a1, &tmp1)) != MP_OKAY) {
-       goto ERR;
-    }
-    if ((res = mp_add(&tmp1, &a0, &tmp1)) != MP_OKAY) {
-       goto ERR;
-    }
-    if ((res = mp_add(&b2, &b1, &tmp2)) != MP_OKAY) {
-       goto ERR;
-    }
-    if ((res = mp_add(&tmp2, &b0, &tmp2)) != MP_OKAY) {
-       goto ERR;
-    }
-    if ((res = mp_mul(&tmp1, &tmp2, &w2)) != MP_OKAY) {
-       goto ERR;
-    }
-    
-    /* now solve the matrix 
-    
-       0  0  0  0  1
-       1  2  4  8  16
-       1  1  1  1  1
-       16 8  4  2  1
-       1  0  0  0  0
-       
-       using 12 subtractions, 4 shifts, 
-              2 small divisions and 1 small multiplication 
-     */
-     
-     /* r1 - r4 */
-     if ((res = mp_sub(&w1, &w4, &w1)) != MP_OKAY) {
-        goto ERR;
-     }
-     /* r3 - r0 */
-     if ((res = mp_sub(&w3, &w0, &w3)) != MP_OKAY) {
-        goto ERR;
-     }
-     /* r1/2 */
-     if ((res = mp_div_2(&w1, &w1)) != MP_OKAY) {
-        goto ERR;
-     }
-     /* r3/2 */
-     if ((res = mp_div_2(&w3, &w3)) != MP_OKAY) {
-        goto ERR;
-     }
-     /* r2 - r0 - r4 */
-     if ((res = mp_sub(&w2, &w0, &w2)) != MP_OKAY) {
-        goto ERR;
-     }
-     if ((res = mp_sub(&w2, &w4, &w2)) != MP_OKAY) {
-        goto ERR;
-     }
-     /* r1 - r2 */
-     if ((res = mp_sub(&w1, &w2, &w1)) != MP_OKAY) {
-        goto ERR;
-     }
-     /* r3 - r2 */
-     if ((res = mp_sub(&w3, &w2, &w3)) != MP_OKAY) {
-        goto ERR;
-     }
-     /* r1 - 8r0 */
-     if ((res = mp_mul_2d(&w0, 3, &tmp1)) != MP_OKAY) {
-        goto ERR;
-     }
-     if ((res = mp_sub(&w1, &tmp1, &w1)) != MP_OKAY) {
-        goto ERR;
-     }
-     /* r3 - 8r4 */
-     if ((res = mp_mul_2d(&w4, 3, &tmp1)) != MP_OKAY) {
-        goto ERR;
-     }
-     if ((res = mp_sub(&w3, &tmp1, &w3)) != MP_OKAY) {
-        goto ERR;
-     }
-     /* 3r2 - r1 - r3 */
-     if ((res = mp_mul_d(&w2, 3, &w2)) != MP_OKAY) {
-        goto ERR;
-     }
-     if ((res = mp_sub(&w2, &w1, &w2)) != MP_OKAY) {
-        goto ERR;
-     }
-     if ((res = mp_sub(&w2, &w3, &w2)) != MP_OKAY) {
-        goto ERR;
-     }
-     /* r1 - r2 */
-     if ((res = mp_sub(&w1, &w2, &w1)) != MP_OKAY) {
-        goto ERR;
-     }
-     /* r3 - r2 */
-     if ((res = mp_sub(&w3, &w2, &w3)) != MP_OKAY) {
-        goto ERR;
-     }
-     /* r1/3 */
-     if ((res = mp_div_3(&w1, &w1, NULL)) != MP_OKAY) {
-        goto ERR;
-     }
-     /* r3/3 */
-     if ((res = mp_div_3(&w3, &w3, NULL)) != MP_OKAY) {
-        goto ERR;
-     }
-     
-     /* at this point shift W[n] by B*n */
-     if ((res = mp_lshd(&w1, 1*B)) != MP_OKAY) {
-        goto ERR;
-     }
-     if ((res = mp_lshd(&w2, 2*B)) != MP_OKAY) {
-        goto ERR;
-     }
-     if ((res = mp_lshd(&w3, 3*B)) != MP_OKAY) {
-        goto ERR;
-     }
-     if ((res = mp_lshd(&w4, 4*B)) != MP_OKAY) {
-        goto ERR;
-     }     
-     
-     if ((res = mp_add(&w0, &w1, c)) != MP_OKAY) {
-        goto ERR;
-     }
-     if ((res = mp_add(&w2, &w3, &tmp1)) != MP_OKAY) {
-        goto ERR;
-     }
-     if ((res = mp_add(&w4, &tmp1, &tmp1)) != MP_OKAY) {
-        goto ERR;
-     }
-     if ((res = mp_add(&tmp1, c, c)) != MP_OKAY) {
-        goto ERR;
-     }     
-     
-ERR:
-     mp_clear_multi(&w0, &w1, &w2, &w3, &w4, 
-                    &a0, &a1, &a2, &b0, &b1, 
-                    &b2, &tmp1, &tmp2, NULL);
-     return res;
-}     
-     
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is library that provides for multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library is designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+#include <tommath.h>
+
+/* multiplication using the Toom-Cook 3-way algorithm: the result is written to c; returns MP_OKAY or the first failing helper's code */
+int 
+mp_toom_mul(mp_int *a, mp_int *b, mp_int *c)
+{
+    mp_int w0, w1, w2, w3, w4, tmp1, tmp2, a0, a1, a2, b0, b1, b2;
+    int res, B;
+        
+    /* init temps */
+    if ((res = mp_init_multi(&w0, &w1, &w2, &w3, &w4, 
+                             &a0, &a1, &a2, &b0, &b1, 
+                             &b2, &tmp1, &tmp2, NULL)) != MP_OKAY) {
+       return res;
+    }
+    /* each operand is cut into three pieces; B is the piece size in digits, taken from the shorter operand */
+    /* B */
+    B = MIN(a->used, b->used) / 3;
+    
+    /* a = a2 * B**2 + a1 * B + a0 */
+    if ((res = mp_mod_2d(a, DIGIT_BIT * B, &a0)) != MP_OKAY) {
+       goto ERR;
+    }
+
+    if ((res = mp_copy(a, &a1)) != MP_OKAY) {
+       goto ERR;
+    }
+    mp_rshd(&a1, B);
+    mp_mod_2d(&a1, DIGIT_BIT * B, &a1);   /* result code is not checked here */
+
+    if ((res = mp_copy(a, &a2)) != MP_OKAY) {
+       goto ERR;
+    }
+    mp_rshd(&a2, B*2);
+    
+    /* b = b2 * B**2 + b1 * B + b0 */
+    if ((res = mp_mod_2d(b, DIGIT_BIT * B, &b0)) != MP_OKAY) {
+       goto ERR;
+    }
+
+    if ((res = mp_copy(b, &b1)) != MP_OKAY) {
+       goto ERR;
+    }
+    mp_rshd(&b1, B);
+    mp_mod_2d(&b1, DIGIT_BIT * B, &b1);   /* result code is not checked here */
+
+    if ((res = mp_copy(b, &b2)) != MP_OKAY) {
+       goto ERR;
+    }
+    mp_rshd(&b2, B*2);
+    
+    /* w0 = a0*b0 */
+    if ((res = mp_mul(&a0, &b0, &w0)) != MP_OKAY) {
+       goto ERR;
+    }
+    
+    /* w4 = a2 * b2 */
+    if ((res = mp_mul(&a2, &b2, &w4)) != MP_OKAY) {
+       goto ERR;
+    }
+    
+    /* w1 = (a2 + 2(a1 + 2a0))(b2 + 2(b1 + 2b0)) */
+    if ((res = mp_mul_2(&a0, &tmp1)) != MP_OKAY) {
+       goto ERR;
+    }
+    if ((res = mp_add(&tmp1, &a1, &tmp1)) != MP_OKAY) {
+       goto ERR;
+    }
+    if ((res = mp_mul_2(&tmp1, &tmp1)) != MP_OKAY) {
+       goto ERR;
+    }
+    if ((res = mp_add(&tmp1, &a2, &tmp1)) != MP_OKAY) {
+       goto ERR;
+    }
+    
+    if ((res = mp_mul_2(&b0, &tmp2)) != MP_OKAY) {
+       goto ERR;
+    }
+    if ((res = mp_add(&tmp2, &b1, &tmp2)) != MP_OKAY) {
+       goto ERR;
+    }
+    if ((res = mp_mul_2(&tmp2, &tmp2)) != MP_OKAY) {
+       goto ERR;
+    }
+    if ((res = mp_add(&tmp2, &b2, &tmp2)) != MP_OKAY) {
+       goto ERR;
+    }
+    
+    if ((res = mp_mul(&tmp1, &tmp2, &w1)) != MP_OKAY) {
+       goto ERR;
+    }
+    
+    /* w3 = (a0 + 2(a1 + 2a2))(b0 + 2(b1 + 2b2)) */
+    if ((res = mp_mul_2(&a2, &tmp1)) != MP_OKAY) {
+       goto ERR;
+    }
+    if ((res = mp_add(&tmp1, &a1, &tmp1)) != MP_OKAY) {
+       goto ERR;
+    }
+    if ((res = mp_mul_2(&tmp1, &tmp1)) != MP_OKAY) {
+       goto ERR;
+    }
+    if ((res = mp_add(&tmp1, &a0, &tmp1)) != MP_OKAY) {
+       goto ERR;
+    }
+    
+    if ((res = mp_mul_2(&b2, &tmp2)) != MP_OKAY) {
+       goto ERR;
+    }
+    if ((res = mp_add(&tmp2, &b1, &tmp2)) != MP_OKAY) {
+       goto ERR;
+    }
+    if ((res = mp_mul_2(&tmp2, &tmp2)) != MP_OKAY) {
+       goto ERR;
+    }
+    if ((res = mp_add(&tmp2, &b0, &tmp2)) != MP_OKAY) {
+       goto ERR;
+    }
+    
+    if ((res = mp_mul(&tmp1, &tmp2, &w3)) != MP_OKAY) {
+       goto ERR;
+    }
+    
+
+    /* w2 = (a2 + a1 + a0)(b2 + b1 + b0) */
+    if ((res = mp_add(&a2, &a1, &tmp1)) != MP_OKAY) {
+       goto ERR;
+    }
+    if ((res = mp_add(&tmp1, &a0, &tmp1)) != MP_OKAY) {
+       goto ERR;
+    }
+    if ((res = mp_add(&b2, &b1, &tmp2)) != MP_OKAY) {
+       goto ERR;
+    }
+    if ((res = mp_add(&tmp2, &b0, &tmp2)) != MP_OKAY) {
+       goto ERR;
+    }
+    if ((res = mp_mul(&tmp1, &tmp2, &w2)) != MP_OKAY) {
+       goto ERR;
+    }
+    /* w0..w4 now hold the five products; the steps below solve for the result in place, overwriting w1, w2 and w3 */
+    /* now solve the matrix 
+    
+       0  0  0  0  1
+       1  2  4  8  16
+       1  1  1  1  1
+       16 8  4  2  1
+       1  0  0  0  0
+       
+       using 12 subtractions, 4 shifts, 
+              2 small divisions and 1 small multiplication 
+     */
+     
+     /* r1 - r4 */
+     if ((res = mp_sub(&w1, &w4, &w1)) != MP_OKAY) {
+        goto ERR;
+     }
+     /* r3 - r0 */
+     if ((res = mp_sub(&w3, &w0, &w3)) != MP_OKAY) {
+        goto ERR;
+     }
+     /* r1/2 */
+     if ((res = mp_div_2(&w1, &w1)) != MP_OKAY) {
+        goto ERR;
+     }
+     /* r3/2 */
+     if ((res = mp_div_2(&w3, &w3)) != MP_OKAY) {
+        goto ERR;
+     }
+     /* r2 - r0 - r4 */
+     if ((res = mp_sub(&w2, &w0, &w2)) != MP_OKAY) {
+        goto ERR;
+     }
+     if ((res = mp_sub(&w2, &w4, &w2)) != MP_OKAY) {
+        goto ERR;
+     }
+     /* r1 - r2 */
+     if ((res = mp_sub(&w1, &w2, &w1)) != MP_OKAY) {
+        goto ERR;
+     }
+     /* r3 - r2 */
+     if ((res = mp_sub(&w3, &w2, &w3)) != MP_OKAY) {
+        goto ERR;
+     }
+     /* r1 - 8r0 */
+     if ((res = mp_mul_2d(&w0, 3, &tmp1)) != MP_OKAY) {
+        goto ERR;
+     }
+     if ((res = mp_sub(&w1, &tmp1, &w1)) != MP_OKAY) {
+        goto ERR;
+     }
+     /* r3 - 8r4 */
+     if ((res = mp_mul_2d(&w4, 3, &tmp1)) != MP_OKAY) {
+        goto ERR;
+     }
+     if ((res = mp_sub(&w3, &tmp1, &w3)) != MP_OKAY) {
+        goto ERR;
+     }
+     /* 3r2 - r1 - r3 */
+     if ((res = mp_mul_d(&w2, 3, &w2)) != MP_OKAY) {
+        goto ERR;
+     }
+     if ((res = mp_sub(&w2, &w1, &w2)) != MP_OKAY) {
+        goto ERR;
+     }
+     if ((res = mp_sub(&w2, &w3, &w2)) != MP_OKAY) {
+        goto ERR;
+     }
+     /* r1 - r2 */
+     if ((res = mp_sub(&w1, &w2, &w1)) != MP_OKAY) {
+        goto ERR;
+     }
+     /* r3 - r2 */
+     if ((res = mp_sub(&w3, &w2, &w3)) != MP_OKAY) {
+        goto ERR;
+     }
+     /* r1/3 */
+     if ((res = mp_div_3(&w1, &w1, NULL)) != MP_OKAY) {
+        goto ERR;
+     }
+     /* r3/3 */
+     if ((res = mp_div_3(&w3, &w3, NULL)) != MP_OKAY) {
+        goto ERR;
+     }
+     
+     /* at this point shift W[n] by B*n */
+     if ((res = mp_lshd(&w1, 1*B)) != MP_OKAY) {
+        goto ERR;
+     }
+     if ((res = mp_lshd(&w2, 2*B)) != MP_OKAY) {
+        goto ERR;
+     }
+     if ((res = mp_lshd(&w3, 3*B)) != MP_OKAY) {
+        goto ERR;
+     }
+     if ((res = mp_lshd(&w4, 4*B)) != MP_OKAY) {
+        goto ERR;
+     }     
+     /* c = w0 + w1 + w2 + w3 + w4, with tmp1 carrying the partial sum w2 + w3 + w4 */
+     if ((res = mp_add(&w0, &w1, c)) != MP_OKAY) {
+        goto ERR;
+     }
+     if ((res = mp_add(&w2, &w3, &tmp1)) != MP_OKAY) {
+        goto ERR;
+     }
+     if ((res = mp_add(&w4, &tmp1, &tmp1)) != MP_OKAY) {
+        goto ERR;
+     }
+     if ((res = mp_add(&tmp1, c, c)) != MP_OKAY) {
+        goto ERR;
+     }     
+     /* reached on success and on any checked failure; res holds the status to return */
+ERR:
+     mp_clear_multi(&w0, &w1, &w2, &w3, &w4, 
+                    &a0, &a1, &a2, &b0, &b1, 
+                    &b2, &tmp1, &tmp2, NULL);
+     return res;
+}     
+     
 
 /* End: bn_mp_toom_mul.c */
 
 /* Start: bn_mp_toom_sqr.c */
-/* LibTomMath, multiple-precision integer library -- Tom St Denis
- *
- * LibTomMath is library that provides for multiple-precision
- * integer arithmetic as well as number theoretic functionality.
- *
- * The library is designed directly after the MPI library by
- * Michael Fromberger but has been written from scratch with
- * additional optimizations in place.
- *
- * The library is free for all purposes without any express
- * guarantee it works.
- *
- * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
- */
-#include <tommath.h>
-
-/* squaring using Toom-Cook 3-way algorithm */
-int 
-mp_toom_sqr(mp_int *a, mp_int *b)
-{
-    mp_int w0, w1, w2, w3, w4, tmp1, a0, a1, a2;
-    int res, B;
-        
-    /* init temps */
-    if ((res = mp_init_multi(&w0, &w1, &w2, &w3, &w4, &a0, &a1, &a2, &tmp1, NULL)) != MP_OKAY) {
-       return res;
-    }
-
-    /* B */
-    B = a->used / 3;
-    
-    /* a = a2 * B^2 + a1 * B + a0 */
-    if ((res = mp_mod_2d(a, DIGIT_BIT * B, &a0)) != MP_OKAY) {
-       goto ERR;
-    }
-
-    if ((res = mp_copy(a, &a1)) != MP_OKAY) {
-       goto ERR;
-    }
-    mp_rshd(&a1, B);
-    mp_mod_2d(&a1, DIGIT_BIT * B, &a1);
-
-    if ((res = mp_copy(a, &a2)) != MP_OKAY) {
-       goto ERR;
-    }
-    mp_rshd(&a2, B*2);
-        
-    /* w0 = a0*a0 */
-    if ((res = mp_sqr(&a0, &w0)) != MP_OKAY) {
-       goto ERR;
-    }
-    
-    /* w4 = a2 * a2 */
-    if ((res = mp_sqr(&a2, &w4)) != MP_OKAY) {
-       goto ERR;
-    }
-    
-    /* w1 = (a2 + 2(a1 + 2a0))**2 */
-    if ((res = mp_mul_2(&a0, &tmp1)) != MP_OKAY) {
-       goto ERR;
-    }
-    if ((res = mp_add(&tmp1, &a1, &tmp1)) != MP_OKAY) {
-       goto ERR;
-    }
-    if ((res = mp_mul_2(&tmp1, &tmp1)) != MP_OKAY) {
-       goto ERR;
-    }
-    if ((res = mp_add(&tmp1, &a2, &tmp1)) != MP_OKAY) {
-       goto ERR;
-    }
-    
-    if ((res = mp_sqr(&tmp1, &w1)) != MP_OKAY) {
-       goto ERR;
-    }
-    
-    /* w3 = (a0 + 2(a1 + 2a2))**2 */
-    if ((res = mp_mul_2(&a2, &tmp1)) != MP_OKAY) {
-       goto ERR;
-    }
-    if ((res = mp_add(&tmp1, &a1, &tmp1)) != MP_OKAY) {
-       goto ERR;
-    }
-    if ((res = mp_mul_2(&tmp1, &tmp1)) != MP_OKAY) {
-       goto ERR;
-    }
-    if ((res = mp_add(&tmp1, &a0, &tmp1)) != MP_OKAY) {
-       goto ERR;
-    }
-    
-    if ((res = mp_sqr(&tmp1, &w3)) != MP_OKAY) {
-       goto ERR;
-    }
-    
-
-    /* w2 = (a2 + a1 + a0)**2 */
-    if ((res = mp_add(&a2, &a1, &tmp1)) != MP_OKAY) {
-       goto ERR;
-    }
-    if ((res = mp_add(&tmp1, &a0, &tmp1)) != MP_OKAY) {
-       goto ERR;
-    }
-    if ((res = mp_sqr(&tmp1, &w2)) != MP_OKAY) {
-       goto ERR;
-    }
-    
-    /* now solve the matrix 
-    
-       0  0  0  0  1
-       1  2  4  8  16
-       1  1  1  1  1
-       16 8  4  2  1
-       1  0  0  0  0
-       
-       using 12 subtractions, 4 shifts, 2 small divisions and 1 small multiplication.
-     */
-     
-     /* r1 - r4 */
-     if ((res = mp_sub(&w1, &w4, &w1)) != MP_OKAY) {
-        goto ERR;
-     }
-     /* r3 - r0 */
-     if ((res = mp_sub(&w3, &w0, &w3)) != MP_OKAY) {
-        goto ERR;
-     }
-     /* r1/2 */
-     if ((res = mp_div_2(&w1, &w1)) != MP_OKAY) {
-        goto ERR;
-     }
-     /* r3/2 */
-     if ((res = mp_div_2(&w3, &w3)) != MP_OKAY) {
-        goto ERR;
-     }
-     /* r2 - r0 - r4 */
-     if ((res = mp_sub(&w2, &w0, &w2)) != MP_OKAY) {
-        goto ERR;
-     }
-     if ((res = mp_sub(&w2, &w4, &w2)) != MP_OKAY) {
-        goto ERR;
-     }
-     /* r1 - r2 */
-     if ((res = mp_sub(&w1, &w2, &w1)) != MP_OKAY) {
-        goto ERR;
-     }
-     /* r3 - r2 */
-     if ((res = mp_sub(&w3, &w2, &w3)) != MP_OKAY) {
-        goto ERR;
-     }
-     /* r1 - 8r0 */
-     if ((res = mp_mul_2d(&w0, 3, &tmp1)) != MP_OKAY) {
-        goto ERR;
-     }
-     if ((res = mp_sub(&w1, &tmp1, &w1)) != MP_OKAY) {
-        goto ERR;
-     }
-     /* r3 - 8r4 */
-     if ((res = mp_mul_2d(&w4, 3, &tmp1)) != MP_OKAY) {
-        goto ERR;
-     }
-     if ((res = mp_sub(&w3, &tmp1, &w3)) != MP_OKAY) {
-        goto ERR;
-     }
-     /* 3r2 - r1 - r3 */
-     if ((res = mp_mul_d(&w2, 3, &w2)) != MP_OKAY) {
-        goto ERR;
-     }
-     if ((res = mp_sub(&w2, &w1, &w2)) != MP_OKAY) {
-        goto ERR;
-     }
-     if ((res = mp_sub(&w2, &w3, &w2)) != MP_OKAY) {
-        goto ERR;
-     }
-     /* r1 - r2 */
-     if ((res = mp_sub(&w1, &w2, &w1)) != MP_OKAY) {
-        goto ERR;
-     }
-     /* r3 - r2 */
-     if ((res = mp_sub(&w3, &w2, &w3)) != MP_OKAY) {
-        goto ERR;
-     }
-     /* r1/3 */
-     if ((res = mp_div_3(&w1, &w1, NULL)) != MP_OKAY) {
-        goto ERR;
-     }
-     /* r3/3 */
-     if ((res = mp_div_3(&w3, &w3, NULL)) != MP_OKAY) {
-        goto ERR;
-     }
-     
-     /* at this point shift W[n] by B*n */
-     if ((res = mp_lshd(&w1, 1*B)) != MP_OKAY) {
-        goto ERR;
-     }
-     if ((res = mp_lshd(&w2, 2*B)) != MP_OKAY) {
-        goto ERR;
-     }
-     if ((res = mp_lshd(&w3, 3*B)) != MP_OKAY) {
-        goto ERR;
-     }
-     if ((res = mp_lshd(&w4, 4*B)) != MP_OKAY) {
-        goto ERR;
-     }     
-     
-     if ((res = mp_add(&w0, &w1, b)) != MP_OKAY) {
-        goto ERR;
-     }
-     if ((res = mp_add(&w2, &w3, &tmp1)) != MP_OKAY) {
-        goto ERR;
-     }
-     if ((res = mp_add(&w4, &tmp1, &tmp1)) != MP_OKAY) {
-        goto ERR;
-     }
-     if ((res = mp_add(&tmp1, b, b)) != MP_OKAY) {
-        goto ERR;
-     }     
-     
-ERR:
-     mp_clear_multi(&w0, &w1, &w2, &w3, &w4, &a0, &a1, &a2, &tmp1, NULL);
-     return res;
-}     
-     
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is library that provides for multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library is designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+#include <tommath.h>
+
+/* squaring using Toom-Cook 3-way algorithm: the result is written to b; returns MP_OKAY or the first failing helper's code */
+int 
+mp_toom_sqr(mp_int *a, mp_int *b)
+{
+    mp_int w0, w1, w2, w3, w4, tmp1, a0, a1, a2;
+    int res, B;
+        
+    /* init temps */
+    if ((res = mp_init_multi(&w0, &w1, &w2, &w3, &w4, &a0, &a1, &a2, &tmp1, NULL)) != MP_OKAY) {
+       return res;
+    }
+
+    /* B */
+    B = a->used / 3;
+    /* a is cut into three pieces; B is the piece size in digits */
+    /* a = a2 * B^2 + a1 * B + a0 */
+    if ((res = mp_mod_2d(a, DIGIT_BIT * B, &a0)) != MP_OKAY) {
+       goto ERR;
+    }
+
+    if ((res = mp_copy(a, &a1)) != MP_OKAY) {
+       goto ERR;
+    }
+    mp_rshd(&a1, B);
+    mp_mod_2d(&a1, DIGIT_BIT * B, &a1);   /* result code is not checked here */
+
+    if ((res = mp_copy(a, &a2)) != MP_OKAY) {
+       goto ERR;
+    }
+    mp_rshd(&a2, B*2);
+        
+    /* w0 = a0*a0 */
+    if ((res = mp_sqr(&a0, &w0)) != MP_OKAY) {
+       goto ERR;
+    }
+    
+    /* w4 = a2 * a2 */
+    if ((res = mp_sqr(&a2, &w4)) != MP_OKAY) {
+       goto ERR;
+    }
+    
+    /* w1 = (a2 + 2(a1 + 2a0))**2 */
+    if ((res = mp_mul_2(&a0, &tmp1)) != MP_OKAY) {
+       goto ERR;
+    }
+    if ((res = mp_add(&tmp1, &a1, &tmp1)) != MP_OKAY) {
+       goto ERR;
+    }
+    if ((res = mp_mul_2(&tmp1, &tmp1)) != MP_OKAY) {
+       goto ERR;
+    }
+    if ((res = mp_add(&tmp1, &a2, &tmp1)) != MP_OKAY) {
+       goto ERR;
+    }
+    
+    if ((res = mp_sqr(&tmp1, &w1)) != MP_OKAY) {
+       goto ERR;
+    }
+    
+    /* w3 = (a0 + 2(a1 + 2a2))**2 */
+    if ((res = mp_mul_2(&a2, &tmp1)) != MP_OKAY) {
+       goto ERR;
+    }
+    if ((res = mp_add(&tmp1, &a1, &tmp1)) != MP_OKAY) {
+       goto ERR;
+    }
+    if ((res = mp_mul_2(&tmp1, &tmp1)) != MP_OKAY) {
+       goto ERR;
+    }
+    if ((res = mp_add(&tmp1, &a0, &tmp1)) != MP_OKAY) {
+       goto ERR;
+    }
+    
+    if ((res = mp_sqr(&tmp1, &w3)) != MP_OKAY) {
+       goto ERR;
+    }
+    
+
+    /* w2 = (a2 + a1 + a0)**2 */
+    if ((res = mp_add(&a2, &a1, &tmp1)) != MP_OKAY) {
+       goto ERR;
+    }
+    if ((res = mp_add(&tmp1, &a0, &tmp1)) != MP_OKAY) {
+       goto ERR;
+    }
+    if ((res = mp_sqr(&tmp1, &w2)) != MP_OKAY) {
+       goto ERR;
+    }
+    /* w0..w4 now hold the five squares; the steps below solve for the result in place, overwriting w1, w2 and w3 */
+    /* now solve the matrix 
+    
+       0  0  0  0  1
+       1  2  4  8  16
+       1  1  1  1  1
+       16 8  4  2  1
+       1  0  0  0  0
+       
+       using 12 subtractions, 4 shifts, 2 small divisions and 1 small multiplication.
+     */
+     
+     /* r1 - r4 */
+     if ((res = mp_sub(&w1, &w4, &w1)) != MP_OKAY) {
+        goto ERR;
+     }
+     /* r3 - r0 */
+     if ((res = mp_sub(&w3, &w0, &w3)) != MP_OKAY) {
+        goto ERR;
+     }
+     /* r1/2 */
+     if ((res = mp_div_2(&w1, &w1)) != MP_OKAY) {
+        goto ERR;
+     }
+     /* r3/2 */
+     if ((res = mp_div_2(&w3, &w3)) != MP_OKAY) {
+        goto ERR;
+     }
+     /* r2 - r0 - r4 */
+     if ((res = mp_sub(&w2, &w0, &w2)) != MP_OKAY) {
+        goto ERR;
+     }
+     if ((res = mp_sub(&w2, &w4, &w2)) != MP_OKAY) {
+        goto ERR;
+     }
+     /* r1 - r2 */
+     if ((res = mp_sub(&w1, &w2, &w1)) != MP_OKAY) {
+        goto ERR;
+     }
+     /* r3 - r2 */
+     if ((res = mp_sub(&w3, &w2, &w3)) != MP_OKAY) {
+        goto ERR;
+     }
+     /* r1 - 8r0 */
+     if ((res = mp_mul_2d(&w0, 3, &tmp1)) != MP_OKAY) {
+        goto ERR;
+     }
+     if ((res = mp_sub(&w1, &tmp1, &w1)) != MP_OKAY) {
+        goto ERR;
+     }
+     /* r3 - 8r4 */
+     if ((res = mp_mul_2d(&w4, 3, &tmp1)) != MP_OKAY) {
+        goto ERR;
+     }
+     if ((res = mp_sub(&w3, &tmp1, &w3)) != MP_OKAY) {
+        goto ERR;
+     }
+     /* 3r2 - r1 - r3 */
+     if ((res = mp_mul_d(&w2, 3, &w2)) != MP_OKAY) {
+        goto ERR;
+     }
+     if ((res = mp_sub(&w2, &w1, &w2)) != MP_OKAY) {
+        goto ERR;
+     }
+     if ((res = mp_sub(&w2, &w3, &w2)) != MP_OKAY) {
+        goto ERR;
+     }
+     /* r1 - r2 */
+     if ((res = mp_sub(&w1, &w2, &w1)) != MP_OKAY) {
+        goto ERR;
+     }
+     /* r3 - r2 */
+     if ((res = mp_sub(&w3, &w2, &w3)) != MP_OKAY) {
+        goto ERR;
+     }
+     /* r1/3 */
+     if ((res = mp_div_3(&w1, &w1, NULL)) != MP_OKAY) {
+        goto ERR;
+     }
+     /* r3/3 */
+     if ((res = mp_div_3(&w3, &w3, NULL)) != MP_OKAY) {
+        goto ERR;
+     }
+     
+     /* at this point shift W[n] by B*n */
+     if ((res = mp_lshd(&w1, 1*B)) != MP_OKAY) {
+        goto ERR;
+     }
+     if ((res = mp_lshd(&w2, 2*B)) != MP_OKAY) {
+        goto ERR;
+     }
+     if ((res = mp_lshd(&w3, 3*B)) != MP_OKAY) {
+        goto ERR;
+     }
+     if ((res = mp_lshd(&w4, 4*B)) != MP_OKAY) {
+        goto ERR;
+     }     
+     /* b = w0 + w1 + w2 + w3 + w4, with tmp1 carrying the partial sum w2 + w3 + w4 */
+     if ((res = mp_add(&w0, &w1, b)) != MP_OKAY) {
+        goto ERR;
+     }
+     if ((res = mp_add(&w2, &w3, &tmp1)) != MP_OKAY) {
+        goto ERR;
+     }
+     if ((res = mp_add(&w4, &tmp1, &tmp1)) != MP_OKAY) {
+        goto ERR;
+     }
+     if ((res = mp_add(&tmp1, b, b)) != MP_OKAY) {
+        goto ERR;
+     }     
+     /* reached on success and on any checked failure; res holds the status to return */
+ERR:
+     mp_clear_multi(&w0, &w1, &w2, &w3, &w4, &a0, &a1, &a2, &tmp1, NULL);
+     return res;
+}     
+     
 
 /* End: bn_mp_toom_sqr.c */
 
+/* Start: bn_mp_toradix.c */
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is library that provides for multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library is designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+#include <tommath.h>
+
+/* stores a bignum as an ASCII string in a given radix (2..64); returns MP_VAL for a radix outside that range; str has no length argument, so the caller must size the buffer */
+int
+mp_toradix (mp_int * a, char *str, int radix)
+{
+  int     res, digs;
+  mp_int  t;
+  mp_digit d;
+  char   *_s = str;
+
+  if (radix < 2 || radix > 64) {
+    return MP_VAL;
+  }
+  
+  /* quick out if it's zero */
+  if (mp_iszero(a) == 1) {
+     *str++ = '0';
+     *str = '\0';
+     return MP_OKAY;
+  }
+  /* the conversion works on the copy t, which is divided down to zero below */
+  if ((res = mp_init_copy (&t, a)) != MP_OKAY) {
+    return res;
+  }
+
+  /* if it is negative output a '-' and step _s past it so the sign is not reversed */
+  if (t.sign == MP_NEG) {
+    ++_s;
+    *str++ = '-';
+    t.sign = MP_ZPOS;
+  }
+  /* each pass divides t by radix and emits the character for d (presumably the remainder), so digits come out lowest first */
+  digs = 0;
+  while (mp_iszero (&t) == 0) {
+    if ((res = mp_div_d (&t, (mp_digit) radix, &t, &d)) != MP_OKAY) {
+      mp_clear (&t);
+      return res;
+    }
+    *str++ = mp_s_rmap[d];
+    ++digs;
+  }
+
+  /* reverse the digits of the string.  In this case _s points
+   * to the first digit [excluding the sign] of the number
+   */
+  bn_reverse ((unsigned char *)_s, digs);
+  
+  /* append a NUL so the string is properly terminated */
+  *str++ = '\0';
+  
+  
+  mp_clear (&t);
+  return MP_OKAY;
+}
+
+
+/* End: bn_mp_toradix.c */
+
 /* Start: bn_mp_unsigned_bin_size.c */
 /* LibTomMath, multiple-precision integer library -- Tom St Denis
  *
@@ -6053,6 +6452,216 @@ mp_zero (mp_int * a)
 
 /* End: bn_mp_zero.c */
 
+/* Start: bn_mult.c */
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is library that provides for multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library is designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+ #include <tommath.h>
+
+/* this file provides a nxn=>2n multiplier based on the
+ * fact that xy = ((x+y)^2 - (x-y)^2)/4
+ * so by having a table of floor(z^2/4) for z = -510..510
+ * we can compute (x+y)^2/4 and (x-y)^2/4 via table lookup, etc..
+ */
+
+#ifdef SLOW_MULT
+
+/* table of floor(x^2/4) for x = -510..510 (index = x + 510) */
+#if defined(MP_8BIT) || defined(MP_16BIT)
+static const unsigned long sqr[] = {
+65025, 64770, 64516, 64262, 64009, 63756, 63504, 63252, 63001, 62750, 62500, 62250,
+62001, 61752, 61504, 61256, 61009, 60762, 60516, 60270, 60025, 59780, 59536, 59292,
+59049, 58806, 58564, 58322, 58081, 57840, 57600, 57360, 57121, 56882, 56644, 56406,
+56169, 55932, 55696, 55460, 55225, 54990, 54756, 54522, 54289, 54056, 53824, 53592,
+53361, 53130, 52900, 52670, 52441, 52212, 51984, 51756, 51529, 51302, 51076, 50850,
+50625, 50400, 50176, 49952, 49729, 49506, 49284, 49062, 48841, 48620, 48400, 48180,
+47961, 47742, 47524, 47306, 47089, 46872, 46656, 46440, 46225, 46010, 45796, 45582,
+45369, 45156, 44944, 44732, 44521, 44310, 44100, 43890, 43681, 43472, 43264, 43056,
+42849, 42642, 42436, 42230, 42025, 41820, 41616, 41412, 41209, 41006, 40804, 40602,
+40401, 40200, 40000, 39800, 39601, 39402, 39204, 39006, 38809, 38612, 38416, 38220,
+38025, 37830, 37636, 37442, 37249, 37056, 36864, 36672, 36481, 36290, 36100, 35910,
+35721, 35532, 35344, 35156, 34969, 34782, 34596, 34410, 34225, 34040, 33856, 33672,
+33489, 33306, 33124, 32942, 32761, 32580, 32400, 32220, 32041, 31862, 31684, 31506,
+31329, 31152, 30976, 30800, 30625, 30450, 30276, 30102, 29929, 29756, 29584, 29412,
+29241, 29070, 28900, 28730, 28561, 28392, 28224, 28056, 27889, 27722, 27556, 27390,
+27225, 27060, 26896, 26732, 26569, 26406, 26244, 26082, 25921, 25760, 25600, 25440,
+25281, 25122, 24964, 24806, 24649, 24492, 24336, 24180, 24025, 23870, 23716, 23562,
+23409, 23256, 23104, 22952, 22801, 22650, 22500, 22350, 22201, 22052, 21904, 21756,
+21609, 21462, 21316, 21170, 21025, 20880, 20736, 20592, 20449, 20306, 20164, 20022,
+19881, 19740, 19600, 19460, 19321, 19182, 19044, 18906, 18769, 18632, 18496, 18360,
+18225, 18090, 17956, 17822, 17689, 17556, 17424, 17292, 17161, 17030, 16900, 16770,
+16641, 16512, 16384, 16256, 16129, 16002, 15876, 15750, 15625, 15500, 15376, 15252,
+15129, 15006, 14884, 14762, 14641, 14520, 14400, 14280, 14161, 14042, 13924, 13806,
+13689, 13572, 13456, 13340, 13225, 13110, 12996, 12882, 12769, 12656, 12544, 12432,
+12321, 12210, 12100, 11990, 11881, 11772, 11664, 11556, 11449, 11342, 11236, 11130,
+11025, 10920, 10816, 10712, 10609, 10506, 10404, 10302, 10201, 10100, 10000,  9900,
+ 9801,  9702,  9604,  9506,  9409,  9312,  9216,  9120,  9025,  8930,  8836,  8742,
+ 8649,  8556,  8464,  8372,  8281,  8190,  8100,  8010,  7921,  7832,  7744,  7656,
+ 7569,  7482,  7396,  7310,  7225,  7140,  7056,  6972,  6889,  6806,  6724,  6642,
+ 6561,  6480,  6400,  6320,  6241,  6162,  6084,  6006,  5929,  5852,  5776,  5700,
+ 5625,  5550,  5476,  5402,  5329,  5256,  5184,  5112,  5041,  4970,  4900,  4830,
+ 4761,  4692,  4624,  4556,  4489,  4422,  4356,  4290,  4225,  4160,  4096,  4032,
+ 3969,  3906,  3844,  3782,  3721,  3660,  3600,  3540,  3481,  3422,  3364,  3306,
+ 3249,  3192,  3136,  3080,  3025,  2970,  2916,  2862,  2809,  2756,  2704,  2652,
+ 2601,  2550,  2500,  2450,  2401,  2352,  2304,  2256,  2209,  2162,  2116,  2070,
+ 2025,  1980,  1936,  1892,  1849,  1806,  1764,  1722,  1681,  1640,  1600,  1560,
+ 1521,  1482,  1444,  1406,  1369,  1332,  1296,  1260,  1225,  1190,  1156,  1122,
+ 1089,  1056,  1024,   992,   961,   930,   900,   870,   841,   812,   784,   756,
+  729,   702,   676,   650,   625,   600,   576,   552,   529,   506,   484,   462,
+  441,   420,   400,   380,   361,   342,   324,   306,   289,   272,   256,   240,
+  225,   210,   196,   182,   169,   156,   144,   132,   121,   110,   100,    90,
+   81,    72,    64,    56,    49,    42,    36,    30,    25,    20,    16,    12,
+    9,     6,     4,     2,     1,     0,     0,     0,     1,     2,     4,     6,
+    9,    12,    16,    20,    25,    30,    36,    42,    49,    56,    64,    72,
+   81,    90,   100,   110,   121,   132,   144,   156,   169,   182,   196,   210,
+  225,   240,   256,   272,   289,   306,   324,   342,   361,   380,   400,   420,
+  441,   462,   484,   506,   529,   552,   576,   600,   625,   650,   676,   702,
+  729,   756,   784,   812,   841,   870,   900,   930,   961,   992,  1024,  1056,
+ 1089,  1122,  1156,  1190,  1225,  1260,  1296,  1332,  1369,  1406,  1444,  1482,
+ 1521,  1560,  1600,  1640,  1681,  1722,  1764,  1806,  1849,  1892,  1936,  1980,
+ 2025,  2070,  2116,  2162,  2209,  2256,  2304,  2352,  2401,  2450,  2500,  2550,
+ 2601,  2652,  2704,  2756,  2809,  2862,  2916,  2970,  3025,  3080,  3136,  3192,
+ 3249,  3306,  3364,  3422,  3481,  3540,  3600,  3660,  3721,  3782,  3844,  3906,
+ 3969,  4032,  4096,  4160,  4225,  4290,  4356,  4422,  4489,  4556,  4624,  4692,
+ 4761,  4830,  4900,  4970,  5041,  5112,  5184,  5256,  5329,  5402,  5476,  5550,
+ 5625,  5700,  5776,  5852,  5929,  6006,  6084,  6162,  6241,  6320,  6400,  6480,
+ 6561,  6642,  6724,  6806,  6889,  6972,  7056,  7140,  7225,  7310,  7396,  7482,
+ 7569,  7656,  7744,  7832,  7921,  8010,  8100,  8190,  8281,  8372,  8464,  8556,
+ 8649,  8742,  8836,  8930,  9025,  9120,  9216,  9312,  9409,  9506,  9604,  9702,
+ 9801,  9900, 10000, 10100, 10201, 10302, 10404, 10506, 10609, 10712, 10816, 10920,
+11025, 11130, 11236, 11342, 11449, 11556, 11664, 11772, 11881, 11990, 12100, 12210,
+12321, 12432, 12544, 12656, 12769, 12882, 12996, 13110, 13225, 13340, 13456, 13572,
+13689, 13806, 13924, 14042, 14161, 14280, 14400, 14520, 14641, 14762, 14884, 15006,
+15129, 15252, 15376, 15500, 15625, 15750, 15876, 16002, 16129, 16256, 16384, 16512,
+16641, 16770, 16900, 17030, 17161, 17292, 17424, 17556, 17689, 17822, 17956, 18090,
+18225, 18360, 18496, 18632, 18769, 18906, 19044, 19182, 19321, 19460, 19600, 19740,
+19881, 20022, 20164, 20306, 20449, 20592, 20736, 20880, 21025, 21170, 21316, 21462,
+21609, 21756, 21904, 22052, 22201, 22350, 22500, 22650, 22801, 22952, 23104, 23256,
+23409, 23562, 23716, 23870, 24025, 24180, 24336, 24492, 24649, 24806, 24964, 25122,
+25281, 25440, 25600, 25760, 25921, 26082, 26244, 26406, 26569, 26732, 26896, 27060,
+27225, 27390, 27556, 27722, 27889, 28056, 28224, 28392, 28561, 28730, 28900, 29070,
+29241, 29412, 29584, 29756, 29929, 30102, 30276, 30450, 30625, 30800, 30976, 31152,
+31329, 31506, 31684, 31862, 32041, 32220, 32400, 32580, 32761, 32942, 33124, 33306,
+33489, 33672, 33856, 34040, 34225, 34410, 34596, 34782, 34969, 35156, 35344, 35532,
+35721, 35910, 36100, 36290, 36481, 36672, 36864, 37056, 37249, 37442, 37636, 37830,
+38025, 38220, 38416, 38612, 38809, 39006, 39204, 39402, 39601, 39800, 40000, 40200,
+40401, 40602, 40804, 41006, 41209, 41412, 41616, 41820, 42025, 42230, 42436, 42642,
+42849, 43056, 43264, 43472, 43681, 43890, 44100, 44310, 44521, 44732, 44944, 45156,
+45369, 45582, 45796, 46010, 46225, 46440, 46656, 46872, 47089, 47306, 47524, 47742,
+47961, 48180, 48400, 48620, 48841, 49062, 49284, 49506, 49729, 49952, 50176, 50400,
+50625, 50850, 51076, 51302, 51529, 51756, 51984, 52212, 52441, 52670, 52900, 53130,
+53361, 53592, 53824, 54056, 54289, 54522, 54756, 54990, 55225, 55460, 55696, 55932,
+56169, 56406, 56644, 56882, 57121, 57360, 57600, 57840, 58081, 58322, 58564, 58806,
+59049, 59292, 59536, 59780, 60025, 60270, 60516, 60762, 61009, 61256, 61504, 61752,
+62001, 62250, 62500, 62750, 63001, 63252, 63504, 63756, 64009, 64262, 64516, 64770,
+65025};
+#endif
+
+#if defined(MP_8BIT)
+/*
+   4 add/sub
+   2 table lookups
+   -
+   6 operations
+
+   versus
+
+    8 shifts
+    8 ands
+    8 jump/zero
+    8 adds
+   --
+   32 operations
+*/
+mp_word s_mp_mult(mp_digit a, mp_digit b)
+{
+   int A, B;
+   /* use ints since a+b may not fit in an mp_digit; A is biased by 510, the index of x = 0 in sqr[] */
+   A = a; B = b;
+   A += 510;
+   return (mp_word)(sqr[A+B] - sqr[A-B]);
+}
+#elif defined(MP_16BIT)
+/* split a and b into low/high bytes and add the four byte products (each from sqr[]) at shifts 0, 8, 8 and 16.  Cost:
+   17 add/sub
+    4 shifts
+    8 table lookups
+    2 ands
+   --
+   31 operations
+
+   A double/multiply would require
+
+   16 shifts
+   16 ands
+   16 jump/zero
+   16 adds
+   --
+   64 operations
+ */
+mp_word s_mp_mult(mp_digit a, mp_digit b)
+{
+  mp_digit a1, a2, b1, b2;
+  a1 = a&255;       a2 = a>>8;
+  b1 = (b&255)+510; b2 = (b>>8)+510;
+  return (mp_word)(
+          (sqr[b1+a1] - sqr[b1-a1]) +
+          ((sqr[b1+a2] + sqr[b2+a1] - (sqr[b1-a2] + sqr[b2-a1]))<<8) +
+          ((sqr[b2+a2] - sqr[b2-a2])<<16));
+}
+#elif defined(MP_28BIT)
+/* use a 2-ary sliding window
+
+   29 shifts
+   14 additions
+   13 ands
+   18 table lookups
+   --
+   74 operations
+
+   versus 4*28 == 112 via the other method
+*/
+mp_word s_mp_mult(mp_digit a, mp_digit b)
+{
+   mp_digit wnd[4];
+   mp_word  res;
+
+   /* make window: wnd[k] = k*a for k = 0..3 */
+   wnd[0] = 0;
+   wnd[1] = a;
+   wnd[2] = a<<1;
+   wnd[3] = (a<<1) + a;
+
+   /* go over the 28 bits of b two at a time, most significant pair first */
+#define RND(i)   res = (res << 2) + ((mp_word)wnd[(b>>(2*i))&3]);
+   res = wnd[b>>26];
+   RND(12); RND(11); RND(10); RND( 9);
+   RND( 8); RND( 7); RND( 6); RND( 5);
+   RND( 4); RND( 3); RND( 2); RND( 1); RND(0);
+   return res;
+}
+#else
+mp_word s_mp_mult(mp_digit a, mp_digit b)
+{
+   return ((mp_word)a)*((mp_word)b);   /* widen both operands first so the product is formed in mp_word */
+}
+#endif
+
+#endif /* SLOW_MULT */
+
+/* End: bn_mult.c */
+
 /* Start: bn_prime_tab.c */
 /* LibTomMath, multiple-precision integer library -- Tom St Denis
  *
@@ -6112,232 +6721,6 @@ const mp_digit __prime_tab[] = {
 
 /* End: bn_prime_tab.c */
 
-/* Start: bn_radix.c */
-/* LibTomMath, multiple-precision integer library -- Tom St Denis
- *
- * LibTomMath is library that provides for multiple-precision
- * integer arithmetic as well as number theoretic functionality.
- *
- * The library is designed directly after the MPI library by
- * Michael Fromberger but has been written from scratch with
- * additional optimizations in place.
- *
- * The library is free for all purposes without any express
- * guarantee it works.
- *
- * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
- */
-#include <tommath.h>
-
-/* chars used in radix conversions */
-static const char *s_rmap = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz+/";
-
-/* read a string [ASCII] in a given radix */
-int
-mp_read_radix (mp_int * a, char *str, int radix)
-{
-  int     y, res, neg;
-  char    ch;
-
-  if (radix < 2 || radix > 64) {
-    return MP_VAL;
-  }
-
-  if (*str == '-') {
-    ++str;
-    neg = MP_NEG;
-  } else {
-    neg = MP_ZPOS;
-  }
-
-  mp_zero (a);
-  while (*str) {
-    ch = (char) ((radix < 36) ? toupper (*str) : *str);
-    for (y = 0; y < 64; y++) {
-      if (ch == s_rmap[y]) {
-    break;
-      }
-    }
-
-    if (y < radix) {
-      if ((res = mp_mul_d (a, (mp_digit) radix, a)) != MP_OKAY) {
-    return res;
-      }
-      if ((res = mp_add_d (a, (mp_digit) y, a)) != MP_OKAY) {
-    return res;
-      }
-    } else {
-      break;
-    }
-    ++str;
-  }
-  a->sign = neg;
-  return MP_OKAY;
-}
-
-/* stores a bignum as a ASCII string in a given radix (2..64) */
-int
-mp_toradix (mp_int * a, char *str, int radix)
-{
-  int     res, digs;
-  mp_int  t;
-  mp_digit d;
-  char   *_s = str;
-
-  if (radix < 2 || radix > 64) {
-    return MP_VAL;
-  }
-  
-  /* quick out if its zero */
-  if (mp_iszero(a) == 1) {
-     *str++ = '0';
-     *str = '\0';
-     return MP_OKAY;
-  }
-  
-
-  if ((res = mp_init_copy (&t, a)) != MP_OKAY) {
-    return res;
-  }
-
-  if (t.sign == MP_NEG) {
-    ++_s;
-    *str++ = '-';
-    t.sign = MP_ZPOS;
-  }
-
-  digs = 0;
-  while (mp_iszero (&t) == 0) {
-    if ((res = mp_div_d (&t, (mp_digit) radix, &t, &d)) != MP_OKAY) {
-      mp_clear (&t);
-      return res;
-    }
-    *str++ = s_rmap[d];
-    ++digs;
-  }
-  bn_reverse ((unsigned char *)_s, digs);
-  *str++ = '\0';
-  mp_clear (&t);
-  return MP_OKAY;
-}
-
-/* returns size of ASCII reprensentation */
-int
-mp_radix_size (mp_int * a, int radix)
-{
-  int     res, digs;
-  mp_int  t;
-  mp_digit d;
-
-  /* special case for binary */
-  if (radix == 2) {
-    return mp_count_bits (a) + (a->sign == MP_NEG ? 1 : 0) + 1;
-  }
-
-  if (radix < 2 || radix > 64) {
-    return 0;
-  }
-
-  if ((res = mp_init_copy (&t, a)) != MP_OKAY) {
-    return 0;
-  }
-
-  digs = 0;
-  if (t.sign == MP_NEG) {
-    ++digs;
-    t.sign = MP_ZPOS;
-  }
-
-  while (mp_iszero (&t) == 0) {
-    if ((res = mp_div_d (&t, (mp_digit) radix, &t, &d)) != MP_OKAY) {
-      mp_clear (&t);
-      return 0;
-    }
-    ++digs;
-  }
-  mp_clear (&t);
-  return digs + 1;
-}
-
-/* read a bigint from a file stream in ASCII */
-int mp_fread(mp_int *a, int radix, FILE *stream)
-{
-   int err, ch, neg, y;
-   
-   /* clear a */
-   mp_zero(a);
-   
-   /* if first digit is - then set negative */
-   ch = fgetc(stream);
-   if (ch == '-') {
-      neg = MP_NEG;
-      ch = fgetc(stream);
-   } else {
-      neg = MP_ZPOS;
-   }
-   
-   for (;;) {
-      /* find y in the radix map */
-      for (y = 0; y < radix; y++) {
-          if (s_rmap[y] == ch) {
-             break;
-          }
-      }
-      if (y == radix) {
-         break;
-      }
-      
-      /* shift up and add */
-      if ((err = mp_mul_d(a, radix, a)) != MP_OKAY) {
-         return err;
-      }
-      if ((err = mp_add_d(a, y, a)) != MP_OKAY) {
-         return err;
-      }
-      
-      ch = fgetc(stream);
-   }
-   if (mp_cmp_d(a, 0) != MP_EQ) {
-      a->sign = neg;
-   }
-   
-   return MP_OKAY;
-}
-
-int mp_fwrite(mp_int *a, int radix, FILE *stream)
-{
-   char *buf;
-   int err, len, x;
-   
-   len = mp_radix_size(a, radix);
-   if (len == 0) {
-      return MP_VAL;
-   }
-   
-   buf = malloc(len);
-   if (buf == NULL) {
-      return MP_MEM;
-   }
-   
-   if ((err = mp_toradix(a, buf, radix)) != MP_OKAY) {
-      free(buf);
-      return err;
-   }
-   
-   for (x = 0; x < len; x++) {
-       if (fputc(buf[x], stream) == EOF) {
-          free(buf);
-          return MP_VAL;
-       }
-   }
-   
-   free(buf);
-   return MP_OKAY;
-}
-
-
-/* End: bn_radix.c */
-
 /* Start: bn_reverse.c */
 /* LibTomMath, multiple-precision integer library -- Tom St Denis
  *
@@ -6483,222 +6866,236 @@ s_mp_add (mp_int * a, mp_int * b, mp_int * c)
 /* End: bn_s_mp_add.c */
 
 /* Start: bn_s_mp_exptmod.c */
-/* LibTomMath, multiple-precision integer library -- Tom St Denis
- *
- * LibTomMath is library that provides for multiple-precision
- * integer arithmetic as well as number theoretic functionality.
- *
- * The library is designed directly after the MPI library by
- * Michael Fromberger but has been written from scratch with
- * additional optimizations in place.
- *
- * The library is free for all purposes without any express
- * guarantee it works.
- *
- * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
- */
-#include <tommath.h>
-
-int
-s_mp_exptmod (mp_int * G, mp_int * X, mp_int * P, mp_int * Y)
-{
-  mp_int  M[256], res, mu;
-  mp_digit buf;
-  int     err, bitbuf, bitcpy, bitcnt, mode, digidx, x, y, winsize;
-
-  /* find window size */
-  x = mp_count_bits (X);
-  if (x <= 7) {
-    winsize = 2;
-  } else if (x <= 36) {
-    winsize = 3;
-  } else if (x <= 140) {
-    winsize = 4;
-  } else if (x <= 450) {
-    winsize = 5;
-  } else if (x <= 1303) {
-    winsize = 6;
-  } else if (x <= 3529) {
-    winsize = 7;
-  } else {
-    winsize = 8;
-  }
-
-#ifdef MP_LOW_MEM
-    if (winsize > 5) {
-       winsize = 5;
-    }
-#endif
-
-  /* init M array */
-  for (x = 0; x < (1 << winsize); x++) {
-    if ((err = mp_init_size (&M[x], 1)) != MP_OKAY) {
-      for (y = 0; y < x; y++) {
-        mp_clear (&M[y]);
-      }
-      return err;
-    }
-  }
-
-  /* create mu, used for Barrett reduction */
-  if ((err = mp_init (&mu)) != MP_OKAY) {
-    goto __M;
-  }
-  if ((err = mp_reduce_setup (&mu, P)) != MP_OKAY) {
-    goto __MU;
-  }
-
-  /* create M table
-   *
-   * The M table contains powers of the base, 
-   * e.g. M[x] = G**x mod P
-   *
-   * The first half of the table is not 
-   * computed though accept for M[0] and M[1]
-   */
-  if ((err = mp_mod (G, P, &M[1])) != MP_OKAY) {
-    goto __MU;
-  }
-
-  /* compute the value at M[1<<(winsize-1)] by squaring 
-   * M[1] (winsize-1) times 
-   */
-  if ((err = mp_copy (&M[1], &M[1 << (winsize - 1)])) != MP_OKAY) {
-    goto __MU;
-  }
-
-  for (x = 0; x < (winsize - 1); x++) {
-    if ((err = mp_sqr (&M[1 << (winsize - 1)], 
-                       &M[1 << (winsize - 1)])) != MP_OKAY) {
-      goto __MU;
-    }
-    if ((err = mp_reduce (&M[1 << (winsize - 1)], P, &mu)) != MP_OKAY) {
-      goto __MU;
-    }
-  }
-
-  /* create upper table */
-  for (x = (1 << (winsize - 1)) + 1; x < (1 << winsize); x++) {
-    if ((err = mp_mul (&M[x - 1], &M[1], &M[x])) != MP_OKAY) {
-      goto __MU;
-    }
-    if ((err = mp_reduce (&M[x], P, &mu)) != MP_OKAY) {
-      goto __MU;
-    }
-  }
-
-  /* setup result */
-  if ((err = mp_init (&res)) != MP_OKAY) {
-    goto __MU;
-  }
-  mp_set (&res, 1);
-
-  /* set initial mode and bit cnt */
-  mode   = 0;
-  bitcnt = 1;
-  buf    = 0;
-  digidx = X->used - 1;
-  bitcpy = 0;
-  bitbuf = 0;
-
-  for (;;) {
-    /* grab next digit as required */
-    if (--bitcnt == 0) {
-      if (digidx == -1) {
-        break;
-      }
-      buf = X->dp[digidx--];
-      bitcnt = (int) DIGIT_BIT;
-    }
-
-    /* grab the next msb from the exponent */
-    y = (buf >> (mp_digit)(DIGIT_BIT - 1)) & 1;
-    buf <<= (mp_digit)1;
-
-    /* if the bit is zero and mode == 0 then we ignore it
-     * These represent the leading zero bits before the first 1 bit
-     * in the exponent.  Technically this opt is not required but it
-     * does lower the # of trivial squaring/reductions used
-     */
-    if (mode == 0 && y == 0)
-      continue;
-
-    /* if the bit is zero and mode == 1 then we square */
-    if (mode == 1 && y == 0) {
-      if ((err = mp_sqr (&res, &res)) != MP_OKAY) {
-        goto __RES;
-      }
-      if ((err = mp_reduce (&res, P, &mu)) != MP_OKAY) {
-        goto __RES;
-      }
-      continue;
-    }
-
-    /* else we add it to the window */
-    bitbuf |= (y << (winsize - ++bitcpy));
-    mode = 2;
-
-    if (bitcpy == winsize) {
-      /* ok window is filled so square as required and multiply  */
-      /* square first */
-      for (x = 0; x < winsize; x++) {
-        if ((err = mp_sqr (&res, &res)) != MP_OKAY) {
-          goto __RES;
-        }
-        if ((err = mp_reduce (&res, P, &mu)) != MP_OKAY) {
-          goto __RES;
-        }
-      }
-
-      /* then multiply */
-      if ((err = mp_mul (&res, &M[bitbuf], &res)) != MP_OKAY) {
-        goto __MU;
-      }
-      if ((err = mp_reduce (&res, P, &mu)) != MP_OKAY) {
-        goto __MU;
-      }
-
-      /* empty window and reset */
-      bitcpy = 0;
-      bitbuf = 0;
-      mode = 1;
-    }
-  }
-
-  /* if bits remain then square/multiply */
-  if (mode == 2 && bitcpy > 0) {
-    /* square then multiply if the bit is set */
-    for (x = 0; x < bitcpy; x++) {
-      if ((err = mp_sqr (&res, &res)) != MP_OKAY) {
-        goto __RES;
-      }
-      if ((err = mp_reduce (&res, P, &mu)) != MP_OKAY) {
-        goto __RES;
-      }
-
-      bitbuf <<= 1;
-      if ((bitbuf & (1 << winsize)) != 0) {
-        /* then multiply */
-        if ((err = mp_mul (&res, &M[1], &res)) != MP_OKAY) {
-          goto __RES;
-        }
-        if ((err = mp_reduce (&res, P, &mu)) != MP_OKAY) {
-          goto __RES;
-        }
-      }
-    }
-  }
-
-  mp_exch (&res, Y);
-  err = MP_OKAY;
-__RES:mp_clear (&res);
-__MU:mp_clear (&mu);
-__M:
-  for (x = 0; x < (1 << winsize); x++) {
-    mp_clear (&M[x]);
-  }
-  return err;
-}
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is library that provides for multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library is designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+#include <tommath.h>
+
+#ifdef MP_LOW_MEM
+   #define TAB_SIZE 32
+#else
+   #define TAB_SIZE 256
+#endif
+
+/* computes Y = G**X mod P using a sliding window and Barrett reduction; returns MP_OKAY or the first error hit */
+int s_mp_exptmod (mp_int * G, mp_int * X, mp_int * P, mp_int * Y)
+{
+  mp_int  M[TAB_SIZE], res, mu;
+  mp_digit buf;
+  int     err, bitbuf, bitcpy, bitcnt, mode, digidx, x, y, winsize;
+
+  /* find window size from the bit length of the exponent */
+  x = mp_count_bits (X);
+  if (x <= 7) {
+    winsize = 2;
+  } else if (x <= 36) {
+    winsize = 3;
+  } else if (x <= 140) {
+    winsize = 4;
+  } else if (x <= 450) {
+    winsize = 5;
+  } else if (x <= 1303) {
+    winsize = 6;
+  } else if (x <= 3529) {
+    winsize = 7;
+  } else {
+    winsize = 8;
+  }
+
+#ifdef MP_LOW_MEM
+    if (winsize > 5) {
+       winsize = 5;
+    }
+#endif
+
+  /* init M array */
+  /* init first cell */
+  if ((err = mp_init(&M[1])) != MP_OKAY) {
+     return err;
+  }
+
+  /* now init the second half of the array */
+  for (x = 1<<(winsize-1); x < (1 << winsize); x++) {
+    if ((err = mp_init(&M[x])) != MP_OKAY) {
+      for (y = 1<<(winsize-1); y < x; y++) {
+        mp_clear (&M[y]);
+      }
+      mp_clear(&M[1]);
+      return err;
+    }
+  }
+
+  /* create mu, used for Barrett reduction */
+  if ((err = mp_init (&mu)) != MP_OKAY) {
+    goto __M;
+  }
+  if ((err = mp_reduce_setup (&mu, P)) != MP_OKAY) {
+    goto __MU;
+  }
+
+  /* create M table
+   *
+   * The M table contains powers of the base,
+   * e.g. M[x] = G**x mod P
+   *
+   * The first half of the table is neither initialized
+   * nor computed, except for M[1]
+   */
+  if ((err = mp_mod (G, P, &M[1])) != MP_OKAY) {
+    goto __MU;
+  }
+
+  /* compute the value at M[1<<(winsize-1)] by squaring
+   * M[1] (winsize-1) times
+   */
+  if ((err = mp_copy (&M[1], &M[1 << (winsize - 1)])) != MP_OKAY) {
+    goto __MU;
+  }
+
+  for (x = 0; x < (winsize - 1); x++) {
+    if ((err = mp_sqr (&M[1 << (winsize - 1)],
+                       &M[1 << (winsize - 1)])) != MP_OKAY) {
+      goto __MU;
+    }
+    if ((err = mp_reduce (&M[1 << (winsize - 1)], P, &mu)) != MP_OKAY) {
+      goto __MU;
+    }
+  }
+
+  /* create upper table: each entry is the previous one times M[1] */
+  for (x = (1 << (winsize - 1)) + 1; x < (1 << winsize); x++) {
+    if ((err = mp_mul (&M[x - 1], &M[1], &M[x])) != MP_OKAY) {
+      goto __MU;
+    }
+    if ((err = mp_reduce (&M[x], P, &mu)) != MP_OKAY) {
+      goto __MU;
+    }
+  }
+
+  /* setup result; from here on every failure must leave through __RES */
+  if ((err = mp_init (&res)) != MP_OKAY) {
+    goto __MU;
+  }
+  mp_set (&res, 1);
+
+  /* set initial mode and bit cnt */
+  mode   = 0;
+  bitcnt = 1;
+  buf    = 0;
+  digidx = X->used - 1;
+  bitcpy = 0;
+  bitbuf = 0;
+
+  for (;;) {
+    /* grab next digit as required */
+    if (--bitcnt == 0) {
+      if (digidx == -1) {
+        break;
+      }
+      buf = X->dp[digidx--];
+      bitcnt = (int) DIGIT_BIT;
+    }
+
+    /* grab the next msb from the exponent */
+    y = (buf >> (mp_digit)(DIGIT_BIT - 1)) & 1;
+    buf <<= (mp_digit)1;
+
+    /* if the bit is zero and mode == 0 then we ignore it
+     * These represent the leading zero bits before the first 1 bit
+     * in the exponent.  Technically this opt is not required but it
+     * does lower the # of trivial squaring/reductions used
+     */
+    if (mode == 0 && y == 0)
+      continue;
+
+    /* if the bit is zero and mode == 1 then we square */
+    if (mode == 1 && y == 0) {
+      if ((err = mp_sqr (&res, &res)) != MP_OKAY) {
+        goto __RES;
+      }
+      if ((err = mp_reduce (&res, P, &mu)) != MP_OKAY) {
+        goto __RES;
+      }
+      continue;
+    }
+
+    /* else we add it to the window */
+    bitbuf |= (y << (winsize - ++bitcpy));
+    mode = 2;
+
+    if (bitcpy == winsize) {
+      /* ok window is filled so square as required and multiply  */
+      /* square first */
+      for (x = 0; x < winsize; x++) {
+        if ((err = mp_sqr (&res, &res)) != MP_OKAY) {
+          goto __RES;
+        }
+        if ((err = mp_reduce (&res, P, &mu)) != MP_OKAY) {
+          goto __RES;
+        }
+      }
+
+      /* then multiply; res is live here, so a failure must also clear it (__RES, not __MU) */
+      if ((err = mp_mul (&res, &M[bitbuf], &res)) != MP_OKAY) {
+        goto __RES;
+      }
+      if ((err = mp_reduce (&res, P, &mu)) != MP_OKAY) {
+        goto __RES;
+      }
+
+      /* empty window and reset */
+      bitcpy = 0;
+      bitbuf = 0;
+      mode = 1;
+    }
+  }
+
+  /* if bits remain then square/multiply */
+  if (mode == 2 && bitcpy > 0) {
+    /* square then multiply if the bit is set */
+    for (x = 0; x < bitcpy; x++) {
+      if ((err = mp_sqr (&res, &res)) != MP_OKAY) {
+        goto __RES;
+      }
+      if ((err = mp_reduce (&res, P, &mu)) != MP_OKAY) {
+        goto __RES;
+      }
+
+      bitbuf <<= 1;
+      if ((bitbuf & (1 << winsize)) != 0) {
+        /* then multiply */
+        if ((err = mp_mul (&res, &M[1], &res)) != MP_OKAY) {
+          goto __RES;
+        }
+        if ((err = mp_reduce (&res, P, &mu)) != MP_OKAY) {
+          goto __RES;
+        }
+      }
+    }
+  }
+
+  mp_exch (&res, Y);
+  err = MP_OKAY;
+__RES:mp_clear (&res);
+__MU:mp_clear (&mu);
+__M:
+  mp_clear(&M[1]);
+  for (x = 1<<(winsize-1); x < (1 << winsize); x++) {
+    mp_clear (&M[x]);
+  }
+  return err;
+}
 
 /* End: bn_s_mp_exptmod.c */
 
@@ -6767,7 +7164,7 @@ s_mp_mul_digs (mp_int * a, mp_int * b, mp_int * c, int digs)
     for (iy = 0; iy < pb; iy++) {
       /* compute the column as a mp_word */
       r = ((mp_word) *tmpt) + 
-          ((mp_word) tmpx) * ((mp_word) * tmpy++) + 
+          MULT(tmpx, *tmpy++) +
           ((mp_word) u);
 
       /* the new column is the lower part of the result */
@@ -6849,7 +7246,7 @@ s_mp_mul_high_digs (mp_int * a, mp_int * b, mp_int * c, int digs)
 
     for (iy = digs - ix; iy < pb; iy++) {
       /* calculate the double precision result */
-      r = ((mp_word) * tmpt) + ((mp_word) tmpx) * ((mp_word) * tmpy++) + ((mp_word) u);
+      r = ((mp_word) * tmpt) + MULT(tmpx, *tmpy++) + ((mp_word) u);
 
       /* get the lower part */
       *tmpt++ = (mp_digit) (r & ((mp_word) MP_MASK));
@@ -6903,7 +7300,7 @@ s_mp_sqr (mp_int * a, mp_int * b)
     /* first calculate the digit at 2*ix */
     /* calculate double precision result */
     r = ((mp_word) t.dp[2*ix]) + 
-        ((mp_word) a->dp[ix]) * ((mp_word) a->dp[ix]);
+        MULT(a->dp[ix], a->dp[ix]);
 
     /* store lower part in result */
     t.dp[2*ix] = (mp_digit) (r & ((mp_word) MP_MASK));
@@ -6919,7 +7316,7 @@ s_mp_sqr (mp_int * a, mp_int * b)
     
     for (iy = ix + 1; iy < pa; iy++) {
       /* first calculate the product */
-      r = ((mp_word) tmpx) * ((mp_word) a->dp[iy]);
+      r = MULT(tmpx, a->dp[iy]);
 
       /* now calculate the double precision result, note we use
        * addition instead of *2 since it's easier to optimize
diff --git a/mycrypt.h b/mycrypt.h
index b673d96..d99cb17 100644
--- a/mycrypt.h
+++ b/mycrypt.h
@@ -16,8 +16,8 @@ extern "C" {
 #endif
 
 /* version */
-#define CRYPT   0x0086
-#define SCRYPT  "0.86"
+#define CRYPT   0x0088
+#define SCRYPT  "0.88"
 
 /* max size of either a cipher/hash block or symmetric key [largest of the two] */
 #define MAXBLOCKSIZE           128
diff --git a/mycrypt_cipher.h b/mycrypt_cipher.h
index f7a3419..3071bde 100644
--- a/mycrypt_cipher.h
+++ b/mycrypt_cipher.h
@@ -5,21 +5,21 @@
  */
 #ifdef BLOWFISH
 struct blowfish_key {
-   unsigned long S[4][256];
-   unsigned long K[18];
+   ulong32 S[4][256];
+   ulong32 K[18];
 };
 #endif
 
 #ifdef RC5
 struct rc5_key {
    int rounds;
-   unsigned long K[50];
+   ulong32 K[50];
 };
 #endif
 
 #ifdef RC6
 struct rc6_key {
-   unsigned long K[44];
+   ulong32 K[44];
 };
 #endif
 
@@ -32,7 +32,7 @@ struct saferp_key {
 
 #ifdef RIJNDAEL
 struct rijndael_key {
-   unsigned long eK[64], dK[64];
+   ulong32 eK[64], dK[64];
    int Nr;
 };
 #endif
@@ -46,11 +46,11 @@ struct xtea_key {
 #ifdef TWOFISH
 #ifndef TWOFISH_SMALL
    struct twofish_key {
-      unsigned long S[4][256], K[40];
+      ulong32 S[4][256], K[40];
    };
 #else
    struct twofish_key {
-      unsigned long K[40];
+      ulong32 K[40];
       unsigned char S[32], start;
    };
 #endif
@@ -75,23 +75,23 @@ struct rc2_key { unsigned xkey[64]; };
 
 #ifdef DES
 struct des_key {
-    unsigned long ek[32], dk[32];
+    ulong32 ek[32], dk[32];
 };
 
 struct des3_key {
-    unsigned long ek[3][32], dk[3][32];
+    ulong32 ek[3][32], dk[3][32];
 };
 #endif
 
 #ifdef CAST5
 struct cast5_key {
-    unsigned long K[32], keylen;
+    ulong32 K[32], keylen;
 };
 #endif
 
 #ifdef NOEKEON
 struct noekeon_key {
-    unsigned long K[4], dK[4];
+    ulong32 K[4], dK[4];
 };
 #endif
 
diff --git a/mycrypt_custom.h b/mycrypt_custom.h
index 2429329..b91bde3 100644
--- a/mycrypt_custom.h
+++ b/mycrypt_custom.h
@@ -15,9 +15,8 @@
 #define XFREE free
 #define XCLOCK clock
 #define XCLOCKS_PER_SEC CLOCKS_PER_SEC
-#define TWOFISH_TABLES
-//#define SMALL_CODE
 #define LTC_TEST
+#define SMALL_CODE
 #define BLOWFISH
 #define RC2
 #define RC5
@@ -25,6 +24,7 @@
 #define SAFERP
 #define SAFER
 #define RIJNDAEL
+#define SERPENT
 #define XTEA
 #define TWOFISH
 #define DES
diff --git a/mycrypt_macros.h b/mycrypt_macros.h
index 43c90c7..ba8a9e8 100644
--- a/mycrypt_macros.h
+++ b/mycrypt_macros.h
@@ -7,6 +7,11 @@
    typedef unsigned long long ulong64;
 #endif
 
+/* this is the "32-bit at least" data type 
+ * Re-define it to suit your platform but it must be at least 32-bits 
+ */
+typedef unsigned long ulong32;
+
 extern char *crypt_error;
 
 /* ---- HELPER MACROS ---- */
diff --git a/noekeon.c b/noekeon.c
index 8d6f0c8..ac1cafb 100644
--- a/noekeon.c
+++ b/noekeon.c
@@ -15,7 +15,7 @@ const struct _cipher_descriptor noekeon_desc =
     &noekeon_keysize
 };
 
-static const unsigned long RC[] = {
+static const ulong32 RC[] = {
    0x00000080UL, 0x0000001bUL, 0x00000036UL, 0x0000006cUL,
    0x000000d8UL, 0x000000abUL, 0x0000004dUL, 0x0000009aUL,
    0x0000002fUL, 0x0000005eUL, 0x000000bcUL, 0x00000063UL,
@@ -54,7 +54,7 @@ static const unsigned long RC[] = {
     
 int noekeon_setup(const unsigned char *key, int keylen, int num_rounds, symmetric_key *skey)
 {
-   unsigned long temp;
+   ulong32 temp;
    
    _ARGCHK(key != NULL);
    _ARGCHK(skey != NULL);
@@ -88,7 +88,7 @@ static void _noekeon_ecb_encrypt(const unsigned char *pt, unsigned char *ct, sym
 void noekeon_ecb_encrypt(const unsigned char *pt, unsigned char *ct, symmetric_key *key)
 #endif
 {
-   unsigned long a,b,c,d,temp;
+   ulong32 a,b,c,d,temp;
    int r;
 
    _ARGCHK(key != NULL);
@@ -105,11 +105,9 @@ void noekeon_ecb_encrypt(const unsigned char *pt, unsigned char *ct, symmetric_k
        GAMMA(a,b,c,d); \
        PI2(a,b,c,d);
 
-   for (r = 0; r < 16; r += 4) {
+   for (r = 0; r < 16; r += 2) {
        ROUND(0);
        ROUND(1);
-       ROUND(2);
-       ROUND(3);
    }
 
 #undef ROUND
@@ -125,7 +123,7 @@ void noekeon_ecb_encrypt(const unsigned char *pt, unsigned char *ct, symmetric_k
 void noekeon_ecb_encrypt(const unsigned char *pt, unsigned char *ct, symmetric_key *key)
 {
    _noekeon_ecb_encrypt(pt, ct, key);
-   burn_stack(sizeof(unsigned long) * 5 + sizeof(int));
+   burn_stack(sizeof(ulong32) * 5 + sizeof(int));
 }
 #endif
 
@@ -135,7 +133,7 @@ static void _noekeon_ecb_decrypt(const unsigned char *ct, unsigned char *pt, sym
 void noekeon_ecb_decrypt(const unsigned char *ct, unsigned char *pt, symmetric_key *key)
 #endif
 {
-   unsigned long a,b,c,d, temp;
+   ulong32 a,b,c,d, temp;
    int r;
 
    _ARGCHK(key != NULL);
@@ -170,7 +168,7 @@ void noekeon_ecb_decrypt(const unsigned char *ct, unsigned char *pt, symmetric_k
 void noekeon_ecb_decrypt(const unsigned char *ct, unsigned char *pt, symmetric_key *key)
 {
    _noekeon_ecb_decrypt(ct, pt, key);
-   burn_stack(sizeof(unsigned long) * 5 + sizeof(int));
+   burn_stack(sizeof(ulong32) * 5 + sizeof(int));
 }
 #endif
 
diff --git a/rc5.c b/rc5.c
index 51ebba5..0572f18 100644
--- a/rc5.c
+++ b/rc5.c
@@ -14,16 +14,27 @@ const struct _cipher_descriptor rc5_desc =
     &rc5_keysize
 };
 
+static const ulong32 stab[50] = {
+0xb7e15163UL, 0x5618cb1cUL, 0xf45044d5UL, 0x9287be8eUL, 0x30bf3847UL, 0xcef6b200UL, 0x6d2e2bb9UL, 0x0b65a572UL,
+0xa99d1f2bUL, 0x47d498e4UL, 0xe60c129dUL, 0x84438c56UL, 0x227b060fUL, 0xc0b27fc8UL, 0x5ee9f981UL, 0xfd21733aUL,
+0x9b58ecf3UL, 0x399066acUL, 0xd7c7e065UL, 0x75ff5a1eUL, 0x1436d3d7UL, 0xb26e4d90UL, 0x50a5c749UL, 0xeedd4102UL,
+0x8d14babbUL, 0x2b4c3474UL, 0xc983ae2dUL, 0x67bb27e6UL, 0x05f2a19fUL, 0xa42a1b58UL, 0x42619511UL, 0xe0990ecaUL,
+0x7ed08883UL, 0x1d08023cUL, 0xbb3f7bf5UL, 0x5976f5aeUL, 0xf7ae6f67UL, 0x95e5e920UL, 0x341d62d9UL, 0xd254dc92UL,
+0x708c564bUL, 0x0ec3d004UL, 0xacfb49bdUL, 0x4b32c376UL, 0xe96a3d2fUL, 0x87a1b6e8UL, 0x25d930a1UL, 0xc410aa5aUL,
+0x62482413UL, 0x007f9dccUL
+};
+
 #ifdef CLEAN_STACK
 static int _rc5_setup(const unsigned char *key, int keylen, int num_rounds, symmetric_key *skey)
 #else
 int rc5_setup(const unsigned char *key, int keylen, int num_rounds, symmetric_key *skey)
 #endif
 {
-    unsigned long L[64], S[50], A, B, i, j, v, s, t, l;
+    ulong32 L[64], *S, A, B, i, j, v, s, t, l;
 
     _ARGCHK(skey != NULL);
     _ARGCHK(key != NULL);
+    
 
     /* test parameters */
     if (num_rounds == 0) { 
@@ -38,10 +49,13 @@ int rc5_setup(const unsigned char *key, int keylen, int num_rounds, symmetric_ke
     if (keylen < 8 || keylen > 128) {
        return CRYPT_INVALID_KEYSIZE;
     }
+    
+    skey->rc5.rounds = num_rounds;
+    S = skey->rc5.K;
 
     /* copy the key into the L array */
-    for (A = i = j = 0; i < (unsigned long)keylen; ) { 
-        A = (A << 8) | ((unsigned long)(key[i++] & 255));
+    for (A = i = j = 0; i < (ulong32)keylen; ) { 
+        A = (A << 8) | ((ulong32)(key[i++] & 255));
         if ((i & 3) == 0) {
            L[j++] = BSWAP(A);
            A = 0;
@@ -49,14 +63,13 @@ int rc5_setup(const unsigned char *key, int keylen, int num_rounds, symmetric_ke
     }
 
     if ((keylen & 3) != 0) { 
-       A <<= (unsigned long)((8 * (4 - (keylen&3)))); 
+       A <<= (ulong32)((8 * (4 - (keylen&3)))); 
        L[j++] = BSWAP(A);
     }
 
     /* setup the S array */
-    t = (unsigned long)(2 * (num_rounds + 1));
-    S[0] = 0xB7E15163UL;
-    for (i = 1; i < t; i++) S[i] = S[i - 1] + 0x9E3779B9UL;
+    t = (ulong32)(2 * (num_rounds + 1));
+    memcpy(S, stab, t * sizeof(stab[0]));
 
     /* mix buffer */
     s = 3 * MAX(t, j);
@@ -64,15 +77,9 @@ int rc5_setup(const unsigned char *key, int keylen, int num_rounds, symmetric_ke
     for (A = B = i = j = v = 0; v < s; v++) { 
         A = S[i] = ROL(S[i] + A + B, 3);
         B = L[j] = ROL(L[j] + A + B, (A+B));
-        i = (i + 1) % t;
-        j = (j + 1) % l;
+        if (++i == t) { i = 0; }
+        if (++j == l) { j = 0; }
     }
-    
-    /* copy to key */
-    for (i = 0; i < t; i++) {
-        skey->rc5.K[i] = S[i];
-    }
-    skey->rc5.rounds = num_rounds;
     return CRYPT_OK;
 }
 
@@ -81,7 +88,7 @@ int rc5_setup(const unsigned char *key, int keylen, int num_rounds, symmetric_ke
 {
    int x;
    x = _rc5_setup(key, keylen, num_rounds, skey);
-   burn_stack(sizeof(unsigned long) * 122 + sizeof(int));
+   burn_stack(sizeof(ulong32) * 122 + sizeof(int));
    return x;
 }
 #endif
@@ -92,7 +99,7 @@ static void _rc5_ecb_encrypt(const unsigned char *pt, unsigned char *ct, symmetr
 void rc5_ecb_encrypt(const unsigned char *pt, unsigned char *ct, symmetric_key *key)
 #endif
 {
-   unsigned long A, B, *K;
+   ulong32 A, B, *K;
    int r;
    _ARGCHK(key != NULL);
    _ARGCHK(pt != NULL);
@@ -127,7 +134,7 @@ void rc5_ecb_encrypt(const unsigned char *pt, unsigned char *ct, symmetric_key *
 void rc5_ecb_encrypt(const unsigned char *pt, unsigned char *ct, symmetric_key *key)
 {
    _rc5_ecb_encrypt(pt, ct, key);
-   burn_stack(sizeof(unsigned long) * 2 + sizeof(int));
+   burn_stack(sizeof(ulong32) * 2 + sizeof(int));
 }
 #endif
 
@@ -137,7 +144,7 @@ static void _rc5_ecb_decrypt(const unsigned char *ct, unsigned char *pt, symmetr
 void rc5_ecb_decrypt(const unsigned char *ct, unsigned char *pt, symmetric_key *key)
 #endif
 {
-   unsigned long A, B, *K;
+   ulong32 A, B, *K;
    int r;
    _ARGCHK(key != NULL);
    _ARGCHK(pt != NULL);
@@ -173,7 +180,7 @@ void rc5_ecb_decrypt(const unsigned char *ct, unsigned char *pt, symmetric_key *
 void rc5_ecb_decrypt(const unsigned char *ct, unsigned char *pt, symmetric_key *key)
 {
    _rc5_ecb_decrypt(ct, pt, key);
-   burn_stack(sizeof(unsigned long) * 2 + sizeof(int));
+   burn_stack(sizeof(ulong32) * 2 + sizeof(int));
 }
 #endif
 
diff --git a/rc6.c b/rc6.c
index 720bdf3..c39a439 100644
--- a/rc6.c
+++ b/rc6.c
@@ -14,13 +14,21 @@ const struct _cipher_descriptor rc6_desc =
     &rc6_keysize
 };
 
+static const ulong32 stab[44] = {
+0xb7e15163UL, 0x5618cb1cUL, 0xf45044d5UL, 0x9287be8eUL, 0x30bf3847UL, 0xcef6b200UL, 0x6d2e2bb9UL, 0x0b65a572UL,
+0xa99d1f2bUL, 0x47d498e4UL, 0xe60c129dUL, 0x84438c56UL, 0x227b060fUL, 0xc0b27fc8UL, 0x5ee9f981UL, 0xfd21733aUL,
+0x9b58ecf3UL, 0x399066acUL, 0xd7c7e065UL, 0x75ff5a1eUL, 0x1436d3d7UL, 0xb26e4d90UL, 0x50a5c749UL, 0xeedd4102UL,
+0x8d14babbUL, 0x2b4c3474UL, 0xc983ae2dUL, 0x67bb27e6UL, 0x05f2a19fUL, 0xa42a1b58UL, 0x42619511UL, 0xe0990ecaUL,
+0x7ed08883UL, 0x1d08023cUL, 0xbb3f7bf5UL, 0x5976f5aeUL, 0xf7ae6f67UL, 0x95e5e920UL, 0x341d62d9UL, 0xd254dc92UL,
+0x708c564bUL, 0x0ec3d004UL, 0xacfb49bdUL, 0x4b32c376UL };
+
 #ifdef CLEAN_STACK
 static int _rc6_setup(const unsigned char *key, int keylen, int num_rounds, symmetric_key *skey)
 #else
 int rc6_setup(const unsigned char *key, int keylen, int num_rounds, symmetric_key *skey)
 #endif
 {
-    unsigned long L[64], S[50], A, B, i, j, v, s, t, l;
+    ulong32 L[64], S[50], A, B, i, j, v, s, l;
 
     _ARGCHK(key != NULL);
     _ARGCHK(skey != NULL);
@@ -36,8 +44,8 @@ int rc6_setup(const unsigned char *key, int keylen, int num_rounds, symmetric_ke
     }
 
     /* copy the key into the L array */
-    for (A = i = j = 0; i < (unsigned long)keylen; ) { 
-        A = (A << 8) | ((unsigned long)(key[i++] & 255));
+    for (A = i = j = 0; i < (ulong32)keylen; ) { 
+        A = (A << 8) | ((ulong32)(key[i++] & 255));
         if (!(i & 3)) {
            L[j++] = BSWAP(A);
            A = 0;
@@ -51,23 +59,20 @@ int rc6_setup(const unsigned char *key, int keylen, int num_rounds, symmetric_ke
     }
 
     /* setup the S array */
-    t = 44;                                     /* fixed at 20 rounds */
-    S[0] = 0xB7E15163UL;
-    for (i = 1; i < t; i++) 
-        S[i] = S[i - 1] + 0x9E3779B9UL;
+    memcpy(S, stab, 44 * sizeof(stab[0]));
 
     /* mix buffer */
-    s = 3 * MAX(t, j);
+    s = 3 * MAX(44, j);
     l = j;
     for (A = B = i = j = v = 0; v < s; v++) { 
         A = S[i] = ROL(S[i] + A + B, 3);
         B = L[j] = ROL(L[j] + A + B, (A+B));
-        i = (i + 1) % t;
-        j = (j + 1) % l;
+        if (++i == 44) { i = 0; }
+        if (++j == l)  { j = 0; }
     }
     
     /* copy to key */
-    for (i = 0; i < t; i++) { 
+    for (i = 0; i < 44; i++) { 
         skey->rc6.K[i] = S[i];
     }
     return CRYPT_OK;
@@ -78,7 +83,7 @@ int rc6_setup(const unsigned char *key, int keylen, int num_rounds, symmetric_ke
 {
    int x;
    x = _rc6_setup(key, keylen, num_rounds, skey);
-   burn_stack(sizeof(unsigned long) * 122);
+   burn_stack(sizeof(ulong32) * 122);
    return x;
 }
 #endif
@@ -89,7 +94,7 @@ static void _rc6_ecb_encrypt(const unsigned char *pt, unsigned char *ct, symmetr
 void rc6_ecb_encrypt(const unsigned char *pt, unsigned char *ct, symmetric_key *key)
 #endif
 {
-   unsigned long a,b,c,d,t,u, *K;
+   ulong32 a,b,c,d,t,u, *K;
    int r;
    
    _ARGCHK(key != NULL);
@@ -125,7 +130,7 @@ void rc6_ecb_encrypt(const unsigned char *pt, unsigned char *ct, symmetric_key *
 void rc6_ecb_encrypt(const unsigned char *pt, unsigned char *ct, symmetric_key *key)
 {
    _rc6_ecb_encrypt(pt, ct, key);
-   burn_stack(sizeof(unsigned long) * 6 + sizeof(int));
+   burn_stack(sizeof(ulong32) * 6 + sizeof(int));
 }
 #endif
 
@@ -135,7 +140,7 @@ static void _rc6_ecb_decrypt(const unsigned char *ct, unsigned char *pt, symmetr
 void rc6_ecb_decrypt(const unsigned char *ct, unsigned char *pt, symmetric_key *key)
 #endif
 {
-   unsigned long a,b,c,d,t,u, *K;
+   ulong32 a,b,c,d,t,u, *K;
    int r;
 
    _ARGCHK(key != NULL);
@@ -172,7 +177,7 @@ void rc6_ecb_decrypt(const unsigned char *ct, unsigned char *pt, symmetric_key *
 void rc6_ecb_decrypt(const unsigned char *ct, unsigned char *pt, symmetric_key *key)
 {
    _rc6_ecb_decrypt(ct, pt, key);
-   burn_stack(sizeof(unsigned long) * 6 + sizeof(int));
+   burn_stack(sizeof(ulong32) * 6 + sizeof(int));
 }
 #endif
 
diff --git a/safer+.c b/safer+.c
index d8b1af3..c70a3a2 100644
--- a/safer+.c
+++ b/safer+.c
@@ -190,7 +190,7 @@ static const unsigned char safer_bias[33][16] = {
 
 int saferp_setup(const unsigned char *key, int keylen, int num_rounds, symmetric_key *skey)
 {
-   unsigned x, y;
+   unsigned x, y, z;
    unsigned char t[33];
    static const int rounds[3] = { 8, 12, 16 };
 
@@ -231,8 +231,10 @@ int saferp_setup(const unsigned char *key, int keylen, int num_rounds, symmetric
            }
 
            /* select and add */
+           z = x;
            for (y = 0; y < 16; y++) {
-               skey->saferp.K[x][y] = (t[(x+y)%17] + safer_bias[x-1][y]) & 255;
+               skey->saferp.K[x][y] = (t[z] + safer_bias[x-1][y]) & 255;
+               if (++z == 17) { z = 0; }
            }
        }
        skey->saferp.rounds = 8;
@@ -256,8 +258,10 @@ int saferp_setup(const unsigned char *key, int keylen, int num_rounds, symmetric
            }
 
            /* select and add */
+           z = x;
            for (y = 0; y < 16; y++) { 
-               skey->saferp.K[x][y] = (t[(x+y)%25] + safer_bias[x-1][y]) & 255;
+               skey->saferp.K[x][y] = (t[z] + safer_bias[x-1][y]) & 255;
+               if (++z == 25) { z = 0; }
            }
        }
        skey->saferp.rounds = 12;
@@ -281,8 +285,10 @@ int saferp_setup(const unsigned char *key, int keylen, int num_rounds, symmetric
            }
            
            /* select and add */
+           z = x;
            for (y = 0; y < 16; y++) {
-               skey->saferp.K[x][y] = (t[(x+y)%33] + safer_bias[x-1][y]) & 255;
+               skey->saferp.K[x][y] = (t[z] + safer_bias[x-1][y]) & 255;
+               if (++z == 33) { z = 0; }
            }
        }
        skey->saferp.rounds = 16;
diff --git a/safer.c b/safer.c
index 87c7db9..2eb5f2a 100644
--- a/safer.c
+++ b/safer.c
@@ -91,7 +91,7 @@ static void Safer_Expand_Userkey(const unsigned char *userkey_1,
                                  int strengthened,
                                  safer_key_t key)
 #endif
-{   unsigned int i, j;
+{   unsigned int i, j, k;
     unsigned char ka[SAFER_BLOCK_LEN + 1];
     unsigned char kb[SAFER_BLOCK_LEN + 1];
 
@@ -100,6 +100,7 @@ static void Safer_Expand_Userkey(const unsigned char *userkey_1,
     *key++ = (unsigned char)nof_rounds;
     ka[SAFER_BLOCK_LEN] = (unsigned char)0;
     kb[SAFER_BLOCK_LEN] = (unsigned char)0;
+    k = 0;
     for (j = 0; j < SAFER_BLOCK_LEN; j++) {
         ka[j] = ROL8(userkey_1[j], 5);
         ka[SAFER_BLOCK_LEN] ^= ka[j];
@@ -111,18 +112,28 @@ static void Safer_Expand_Userkey(const unsigned char *userkey_1,
             ka[j] = ROL8(ka[j], 6);
             kb[j] = ROL8(kb[j], 6);
         }
+        if (strengthened) {
+           k = 2 * i - 1;
+           while (k >= (SAFER_BLOCK_LEN + 1)) { k -= SAFER_BLOCK_LEN + 1; }
+        }
         for (j = 0; j < SAFER_BLOCK_LEN; j++) {
             if (strengthened) {
-                *key++ = (ka[(j + 2 * i - 1) % (SAFER_BLOCK_LEN + 1)]
+                *key++ = (ka[k]
                                 + safer_ebox[(int)safer_ebox[(int)((18 * i + j + 1)&0xFF)]]) & 0xFF;
+                if (++k == (SAFER_BLOCK_LEN + 1)) { k = 0; }
             } else {
                 *key++ = (ka[j] + safer_ebox[(int)safer_ebox[(int)((18 * i + j + 1)&0xFF)]]) & 0xFF;
             }
         }
+        if (strengthened) {
+           k = 2 * i;
+           while (k >= (SAFER_BLOCK_LEN + 1)) { k -= SAFER_BLOCK_LEN + 1; }
+        }
         for (j = 0; j < SAFER_BLOCK_LEN; j++) {
             if (strengthened) {
-                *key++ = (kb[(j + 2 * i) % (SAFER_BLOCK_LEN + 1)]
+                *key++ = (kb[k]
                                 + safer_ebox[(int)safer_ebox[(int)((18 * i + j + 10)&0xFF)]]) & 0xFF;
+                if (++k == (SAFER_BLOCK_LEN + 1)) { k = 0; }
             } else {
                 *key++ = (kb[j] + safer_ebox[(int)safer_ebox[(int)((18 * i + j + 10)&0xFF)]]) & 0xFF;
             }
diff --git a/sha1.c b/sha1.c
index 58975ca..2489f34 100644
--- a/sha1.c
+++ b/sha1.c
@@ -25,7 +25,7 @@ static void _sha1_compress(hash_state *md)
 static void sha1_compress(hash_state *md)
 #endif
 {
-    unsigned long a,b,c,d,e,W[80],i,j;
+    unsigned long a,b,c,d,e,W[80],i;
 
     _ARGCHK(md != NULL);
 
@@ -43,52 +43,57 @@ static void sha1_compress(hash_state *md)
 
     /* expand it */
     for (i = 16; i < 80; i++) {
-        j = W[i-3] ^ W[i-8] ^ W[i-14] ^ W[i-16]; 
-        W[i] = ROL(j, 1);
+        W[i] = ROL(W[i-3] ^ W[i-8] ^ W[i-14] ^ W[i-16], 1); 
     }
 
 
     /* compress */
     /* round one */
-    for (i = 0;  i < 20; i++)  { 
-        j = (ROL(a, 5) + F0(b,c,d) + e + W[i] + 0x5a827999UL); 
-        e = d; 
-        d = c; 
-        c = ROL(b, 30); 
-        b = a; 
-        a = j; 
+    #define FF0(a,b,c,d,e,i) e = (ROL(a, 5) + F0(b,c,d) + e + W[i] + 0x5a827999UL); b = ROL(b, 30);
+    #define FF1(a,b,c,d,e,i) e = (ROL(a, 5) + F1(b,c,d) + e + W[i] + 0x6ed9eba1UL); b = ROL(b, 30);
+    #define FF2(a,b,c,d,e,i) e = (ROL(a, 5) + F2(b,c,d) + e + W[i] + 0x8f1bbcdcUL); b = ROL(b, 30);
+    #define FF3(a,b,c,d,e,i) e = (ROL(a, 5) + F3(b,c,d) + e + W[i] + 0xca62c1d6UL); b = ROL(b, 30);
+ 
+    for (i = 0; i < 20; ) {
+       FF0(a,b,c,d,e,i++);
+       FF0(e,a,b,c,d,i++);
+       FF0(d,e,a,b,c,i++);
+       FF0(c,d,e,a,b,i++);
+       FF0(b,c,d,e,a,i++);
     }
 
     /* round two */
-    for (i = 20; i < 40; i++)  { 
-        j = (ROL(a, 5) + F1(b,c,d) + e + W[i] + 0x6ed9eba1UL); 
-        e = d; 
-        d = c; 
-        c = ROL(b, 30); 
-        b = a; 
-        a = j; 
+    for (i = 20; i < 40; )  { 
+       FF1(a,b,c,d,e,i++);
+       FF1(e,a,b,c,d,i++);
+       FF1(d,e,a,b,c,i++);
+       FF1(c,d,e,a,b,i++);
+       FF1(b,c,d,e,a,i++);
     }
 
     /* round three */
-    for (i = 40; i < 60; i++)  { 
-        j = (ROL(a, 5) + F2(b,c,d) + e + W[i] + 0x8f1bbcdcUL); 
-        e = d; 
-        d = c; 
-        c = ROL(b, 30); 
-        b = a; 
-        a = j; 
+    for (i = 40; i < 60; )  { 
+       FF2(a,b,c,d,e,i++);
+       FF2(e,a,b,c,d,i++);
+       FF2(d,e,a,b,c,i++);
+       FF2(c,d,e,a,b,i++);
+       FF2(b,c,d,e,a,i++);
     }
 
     /* round four */
-    for (i = 60; i < 80; i++)  { 
-        j = (ROL(a, 5) + F3(b,c,d) + e + W[i] + 0xca62c1d6UL);
-        e = d;
-        d = c;
-        c = ROL(b, 30);
-        b = a;
-        a = j;
+    for (i = 60; i < 80; )  { 
+       FF3(a,b,c,d,e,i++);
+       FF3(e,a,b,c,d,i++);
+       FF3(d,e,a,b,c,i++);
+       FF3(c,d,e,a,b,i++);
+       FF3(b,c,d,e,a,i++);
     }
 
+    #undef FF0
+    #undef FF1
+    #undef FF2
+    #undef FF3
+
     /* store */
     md->sha1.state[0] = md->sha1.state[0] + a;
     md->sha1.state[1] = md->sha1.state[1] + b;
diff --git a/sha256.c b/sha256.c
index f14e72f..2402b0c 100644
--- a/sha256.c
+++ b/sha256.c
@@ -54,8 +54,9 @@ static void sha256_compress(hash_state * md)
     _ARGCHK(md != NULL);
 
     /* copy state into S */
-    for (i = 0; i < 8; i++)
+    for (i = 0; i < 8; i++) {
         S[i] = md->sha256.state[i];
+    }
 
     /* copy the state into 512-bits into W[0..15] */
     for (i = 0; i < 16; i++) {
@@ -68,6 +69,7 @@ static void sha256_compress(hash_state * md)
     }        
 
     /* Compress */
+#ifdef SMALL_CODE   
     for (i = 0; i < 64; i++) {
         t0 = S[7] + Sigma1(S[4]) + Ch(S[4], S[5], S[6]) + K[i] + W[i];
         t1 = Sigma0(S[0]) + Maj(S[0], S[1], S[2]);
@@ -80,6 +82,81 @@ static void sha256_compress(hash_state * md)
         S[1] = S[0];
         S[0] = t0 + t1;
     }
+#else 
+#define RND(a,b,c,d,e,f,g,h,i,ki)                    \
+     t0 = h + Sigma1(e) + Ch(e, f, g) + ki + W[i];   \
+     t1 = Sigma0(a) + Maj(a, b, c);                  \
+     d += t0;                                        \
+     h  = t0 + t1;
+
+    RND(S[0],S[1],S[2],S[3],S[4],S[5],S[6],S[7],0,0x428a2f98);
+    RND(S[7],S[0],S[1],S[2],S[3],S[4],S[5],S[6],1,0x71374491);
+    RND(S[6],S[7],S[0],S[1],S[2],S[3],S[4],S[5],2,0xb5c0fbcf);
+    RND(S[5],S[6],S[7],S[0],S[1],S[2],S[3],S[4],3,0xe9b5dba5);
+    RND(S[4],S[5],S[6],S[7],S[0],S[1],S[2],S[3],4,0x3956c25b);
+    RND(S[3],S[4],S[5],S[6],S[7],S[0],S[1],S[2],5,0x59f111f1);
+    RND(S[2],S[3],S[4],S[5],S[6],S[7],S[0],S[1],6,0x923f82a4);
+    RND(S[1],S[2],S[3],S[4],S[5],S[6],S[7],S[0],7,0xab1c5ed5);
+    RND(S[0],S[1],S[2],S[3],S[4],S[5],S[6],S[7],8,0xd807aa98);
+    RND(S[7],S[0],S[1],S[2],S[3],S[4],S[5],S[6],9,0x12835b01);
+    RND(S[6],S[7],S[0],S[1],S[2],S[3],S[4],S[5],10,0x243185be);
+    RND(S[5],S[6],S[7],S[0],S[1],S[2],S[3],S[4],11,0x550c7dc3);
+    RND(S[4],S[5],S[6],S[7],S[0],S[1],S[2],S[3],12,0x72be5d74);
+    RND(S[3],S[4],S[5],S[6],S[7],S[0],S[1],S[2],13,0x80deb1fe);
+    RND(S[2],S[3],S[4],S[5],S[6],S[7],S[0],S[1],14,0x9bdc06a7);
+    RND(S[1],S[2],S[3],S[4],S[5],S[6],S[7],S[0],15,0xc19bf174);
+    RND(S[0],S[1],S[2],S[3],S[4],S[5],S[6],S[7],16,0xe49b69c1);
+    RND(S[7],S[0],S[1],S[2],S[3],S[4],S[5],S[6],17,0xefbe4786);
+    RND(S[6],S[7],S[0],S[1],S[2],S[3],S[4],S[5],18,0x0fc19dc6);
+    RND(S[5],S[6],S[7],S[0],S[1],S[2],S[3],S[4],19,0x240ca1cc);
+    RND(S[4],S[5],S[6],S[7],S[0],S[1],S[2],S[3],20,0x2de92c6f);
+    RND(S[3],S[4],S[5],S[6],S[7],S[0],S[1],S[2],21,0x4a7484aa);
+    RND(S[2],S[3],S[4],S[5],S[6],S[7],S[0],S[1],22,0x5cb0a9dc);
+    RND(S[1],S[2],S[3],S[4],S[5],S[6],S[7],S[0],23,0x76f988da);
+    RND(S[0],S[1],S[2],S[3],S[4],S[5],S[6],S[7],24,0x983e5152);
+    RND(S[7],S[0],S[1],S[2],S[3],S[4],S[5],S[6],25,0xa831c66d);
+    RND(S[6],S[7],S[0],S[1],S[2],S[3],S[4],S[5],26,0xb00327c8);
+    RND(S[5],S[6],S[7],S[0],S[1],S[2],S[3],S[4],27,0xbf597fc7);
+    RND(S[4],S[5],S[6],S[7],S[0],S[1],S[2],S[3],28,0xc6e00bf3);
+    RND(S[3],S[4],S[5],S[6],S[7],S[0],S[1],S[2],29,0xd5a79147);
+    RND(S[2],S[3],S[4],S[5],S[6],S[7],S[0],S[1],30,0x06ca6351);
+    RND(S[1],S[2],S[3],S[4],S[5],S[6],S[7],S[0],31,0x14292967);
+    RND(S[0],S[1],S[2],S[3],S[4],S[5],S[6],S[7],32,0x27b70a85);
+    RND(S[7],S[0],S[1],S[2],S[3],S[4],S[5],S[6],33,0x2e1b2138);
+    RND(S[6],S[7],S[0],S[1],S[2],S[3],S[4],S[5],34,0x4d2c6dfc);
+    RND(S[5],S[6],S[7],S[0],S[1],S[2],S[3],S[4],35,0x53380d13);
+    RND(S[4],S[5],S[6],S[7],S[0],S[1],S[2],S[3],36,0x650a7354);
+    RND(S[3],S[4],S[5],S[6],S[7],S[0],S[1],S[2],37,0x766a0abb);
+    RND(S[2],S[3],S[4],S[5],S[6],S[7],S[0],S[1],38,0x81c2c92e);
+    RND(S[1],S[2],S[3],S[4],S[5],S[6],S[7],S[0],39,0x92722c85);
+    RND(S[0],S[1],S[2],S[3],S[4],S[5],S[6],S[7],40,0xa2bfe8a1);
+    RND(S[7],S[0],S[1],S[2],S[3],S[4],S[5],S[6],41,0xa81a664b);
+    RND(S[6],S[7],S[0],S[1],S[2],S[3],S[4],S[5],42,0xc24b8b70);
+    RND(S[5],S[6],S[7],S[0],S[1],S[2],S[3],S[4],43,0xc76c51a3);
+    RND(S[4],S[5],S[6],S[7],S[0],S[1],S[2],S[3],44,0xd192e819);
+    RND(S[3],S[4],S[5],S[6],S[7],S[0],S[1],S[2],45,0xd6990624);
+    RND(S[2],S[3],S[4],S[5],S[6],S[7],S[0],S[1],46,0xf40e3585);
+    RND(S[1],S[2],S[3],S[4],S[5],S[6],S[7],S[0],47,0x106aa070);
+    RND(S[0],S[1],S[2],S[3],S[4],S[5],S[6],S[7],48,0x19a4c116);
+    RND(S[7],S[0],S[1],S[2],S[3],S[4],S[5],S[6],49,0x1e376c08);
+    RND(S[6],S[7],S[0],S[1],S[2],S[3],S[4],S[5],50,0x2748774c);
+    RND(S[5],S[6],S[7],S[0],S[1],S[2],S[3],S[4],51,0x34b0bcb5);
+    RND(S[4],S[5],S[6],S[7],S[0],S[1],S[2],S[3],52,0x391c0cb3);
+    RND(S[3],S[4],S[5],S[6],S[7],S[0],S[1],S[2],53,0x4ed8aa4a);
+    RND(S[2],S[3],S[4],S[5],S[6],S[7],S[0],S[1],54,0x5b9cca4f);
+    RND(S[1],S[2],S[3],S[4],S[5],S[6],S[7],S[0],55,0x682e6ff3);
+    RND(S[0],S[1],S[2],S[3],S[4],S[5],S[6],S[7],56,0x748f82ee);
+    RND(S[7],S[0],S[1],S[2],S[3],S[4],S[5],S[6],57,0x78a5636f);
+    RND(S[6],S[7],S[0],S[1],S[2],S[3],S[4],S[5],58,0x84c87814);
+    RND(S[5],S[6],S[7],S[0],S[1],S[2],S[3],S[4],59,0x8cc70208);
+    RND(S[4],S[5],S[6],S[7],S[0],S[1],S[2],S[3],60,0x90befffa);
+    RND(S[3],S[4],S[5],S[6],S[7],S[0],S[1],S[2],61,0xa4506ceb);
+    RND(S[2],S[3],S[4],S[5],S[6],S[7],S[0],S[1],62,0xbef9a3f7);
+    RND(S[1],S[2],S[3],S[4],S[5],S[6],S[7],S[0],63,0xc67178f2);
+
+#undef RND     
+    
+#endif     
 
     /* feedback */
     for (i = 0; i < 8; i++) {
diff --git a/sha512.c b/sha512.c
index c52e393..b63efdb 100644
--- a/sha512.c
+++ b/sha512.c
@@ -95,6 +95,7 @@ static void sha512_compress(hash_state * md)
     }        
 
     /* Compress */
+#ifdef SMALL_CODE
     for (i = 0; i < 80; i++) {
         t0 = S[7] + Sigma1(S[4]) + Ch(S[4], S[5], S[6]) + K[i] + W[i];
         t1 = Sigma0(S[0]) + Maj(S[0], S[1], S[2]);
@@ -107,6 +108,25 @@ static void sha512_compress(hash_state * md)
         S[1] = S[0];
         S[0] = t0 + t1;
     }
+#else
+#define RND(a,b,c,d,e,f,g,h,i)                    \
+     t0 = h + Sigma1(e) + Ch(e, f, g) + K[i] + W[i];   \
+     t1 = Sigma0(a) + Maj(a, b, c);                  \
+     d += t0;                                        \
+     h  = t0 + t1;
+
+     for (i = 0; i < 80; i += 8) {
+         RND(S[0],S[1],S[2],S[3],S[4],S[5],S[6],S[7],i+0);
+         RND(S[7],S[0],S[1],S[2],S[3],S[4],S[5],S[6],i+1);
+         RND(S[6],S[7],S[0],S[1],S[2],S[3],S[4],S[5],i+2);
+         RND(S[5],S[6],S[7],S[0],S[1],S[2],S[3],S[4],i+3);
+         RND(S[4],S[5],S[6],S[7],S[0],S[1],S[2],S[3],i+4);
+         RND(S[3],S[4],S[5],S[6],S[7],S[0],S[1],S[2],i+5);
+         RND(S[2],S[3],S[4],S[5],S[6],S[7],S[0],S[1],i+6);
+         RND(S[1],S[2],S[3],S[4],S[5],S[6],S[7],S[0],i+7);
+     }
+#endif     
+
 
     /* feedback */
     for (i = 0; i < 8; i++) {
diff --git a/tommath.h b/tommath.h
index 8e43f6c..7c3158f 100644
--- a/tommath.h
+++ b/tommath.h
@@ -85,6 +85,7 @@ extern "C" {
    #define DIGIT_BIT          31
 #else
    #define DIGIT_BIT          28
+   #define MP_28BIT
 #endif   
 #endif
 
@@ -120,11 +121,21 @@ extern int KARATSUBA_MUL_CUTOFF,
            TOOM_SQR_CUTOFF;
 
 /* various build options */
-#define MP_PREC                 64      /* default digits of precision (must be power of two) */
+#define MP_PREC                 64     /* default digits of precision (must be power of two) */
 
 /* define this to use lower memory usage routines (exptmods mostly) */
 /* #define MP_LOW_MEM */
 
+/* define this if your CPU has no multiply instruction (MULT then calls s_mp_mult) */
+/* #define SLOW_MULT */
+
+#ifdef SLOW_MULT
+   #define MULT(x, y) s_mp_mult((x), (y))
+   mp_word s_mp_mult(mp_digit, mp_digit);
+#else
+   #define MULT(x, y) (((mp_word)(x)) * ((mp_word)(y)))
+#endif
+
 /* size of comba arrays, should be at least 2 * 2**(BITS_PER_WORD - BITS_PER_DIGIT*2) */
 #define MP_WARRAY               (1 << (sizeof(mp_word) * CHAR_BIT - 2 * DIGIT_BIT + 1))
 
@@ -166,7 +177,7 @@ int mp_init_size(mp_int *a, int size);
 /* ---> Basic Manipulations <--- */
 
 #define mp_iszero(a) (((a)->used == 0) ? 1 : 0)
-#define mp_iseven(a) (((a)->used == 0 || (((a)->dp[0] & 1) == 0)) ? 1 : 0)
+#define mp_iseven(a) (((a)->used > 0 && (((a)->dp[0] & 1) == 0)) ? 1 : 0)
 #define mp_isodd(a)  (((a)->used > 0 && (((a)->dp[0] & 1) == 1)) ? 1 : 0)
 
 /* set to zero */
@@ -213,6 +224,9 @@ int mp_mod_2d(mp_int *a, int b, mp_int *c);
 /* computes a = 2**b */
 int mp_2expt(mp_int *a, int b);
 
+/* Counts the number of lsbs which are zero before the first one bit */
+int mp_cnt_lsb(mp_int *a);
+
 /* makes a pseudo-random int of a given size */
 int mp_rand(mp_int *a, int digits);
 
@@ -451,6 +465,8 @@ int mp_exptmod_fast(mp_int *G, mp_int *X, mp_int *P, mp_int *Y, int mode);
 int s_mp_exptmod (mp_int * G, mp_int * X, mp_int * P, mp_int * Y);
 void bn_reverse(unsigned char *s, int len);
 
+extern const char *mp_s_rmap;
+
 #ifdef __cplusplus
    }
 #endif
diff --git a/twofish.c b/twofish.c
index eb2ea93..2e1e937 100644
--- a/twofish.c
+++ b/twofish.c
@@ -101,7 +101,7 @@ static const unsigned char SBOX[2][256] = {
  0x86, 0x56, 0x55, 0x09, 0xbe, 0x91}
 };
 
-static const unsigned long mds_tab[4][256] = {
+static const ulong32 mds_tab[4][256] = {
 {
 0x00000000UL, 0xefef5b01UL, 0xb7b7b602UL, 0x5858ed03UL, 0x07070504UL, 0xe8e85e05UL, 0xb0b0b306UL, 0x5f5fe807UL, 
 0x0e0e0a08UL, 0xe1e15109UL, 0xb9b9bc0aUL, 0x5656e70bUL, 0x09090f0cUL, 0xe6e6540dUL, 0xbebeb90eUL, 0x5151e20fUL, 
@@ -239,7 +239,7 @@ static const unsigned long mds_tab[4][256] = {
 0xc6baf8c6UL, 0x9d55f99dUL, 0x700dfa70UL, 0x2be2fb2bUL, 0xc3bdfcc3UL, 0x9852fd98UL, 0x750afe75UL, 0x2ee5ff2eUL
 }};
 
-#define sbox(i, x) ((unsigned long)SBOX[i][(x)&255])
+#define sbox(i, x) ((ulong32)SBOX[i][(x)&255])
 
 #else
 
@@ -261,9 +261,9 @@ static const unsigned char qbox[2][4][16] = {
 
 /* computes S_i[x] */
 #ifdef CLEAN_STACK
-static unsigned long _sbox(int i, unsigned long x)
+static ulong32 _sbox(int i, ulong32 x)
 #else
-static unsigned long sbox(int i, unsigned long x)
+static ulong32 sbox(int i, ulong32 x)
 #endif
 {
    unsigned char a0,b0,a1,b1,a2,b2,a3,b3,a4,b4,y;
@@ -296,13 +296,13 @@ static unsigned long sbox(int i, unsigned long x)
    y = (b4 << 4) + a4;
 
    /* return result */
-   return (unsigned long)y;
+   return (ulong32)y;
 }
 
 #ifdef CLEAN_STACK
-static unsigned long sbox(int i, unsigned long x)
+static ulong32 sbox(int i, ulong32 x)
 {
-   unsigned long y;
+   ulong32 y;
    y = _sbox(i, x);
    burn_stack(sizeof(unsigned char) * 11);
    return y;
@@ -312,22 +312,22 @@ static unsigned long sbox(int i, unsigned long x)
 #endif /* TWOFISH_TABLES */
 
 /* computes ab mod p */
-static unsigned long gf_mult(unsigned long a, unsigned long b, unsigned long p)
+static ulong32 gf_mult(ulong32 a, ulong32 b, ulong32 p)
 {
-   unsigned long result = 0, B[2], P[2];
+   ulong32 result = 0, B[2], P[2];
    
    P[1] = p;
    B[1] = b;
    P[0] = B[0] = 0;  
    
    /* unrolled branchless GF multiplier */
-   result ^= B[a&1]; a >>= 1;  B[1] <<= 1; B[1] ^= P[B[1]>>8];
-   result ^= B[a&1]; a >>= 1;  B[1] <<= 1; B[1] ^= P[B[1]>>8];
-   result ^= B[a&1]; a >>= 1;  B[1] <<= 1; B[1] ^= P[B[1]>>8];
-   result ^= B[a&1]; a >>= 1;  B[1] <<= 1; B[1] ^= P[B[1]>>8];
-   result ^= B[a&1]; a >>= 1;  B[1] <<= 1; B[1] ^= P[B[1]>>8];
-   result ^= B[a&1]; a >>= 1;  B[1] <<= 1; B[1] ^= P[B[1]>>8];
-   result ^= B[a&1]; a >>= 1;  B[1] <<= 1; B[1] ^= P[B[1]>>8];
+   result ^= B[a&1]; a >>= 1;  B[1] = P[B[1]>>7] ^ (B[1] << 1); 
+   result ^= B[a&1]; a >>= 1;  B[1] = P[B[1]>>7] ^ (B[1] << 1); 
+   result ^= B[a&1]; a >>= 1;  B[1] = P[B[1]>>7] ^ (B[1] << 1); 
+   result ^= B[a&1]; a >>= 1;  B[1] = P[B[1]>>7] ^ (B[1] << 1); 
+   result ^= B[a&1]; a >>= 1;  B[1] = P[B[1]>>7] ^ (B[1] << 1); 
+   result ^= B[a&1]; a >>= 1;  B[1] = P[B[1]>>7] ^ (B[1] << 1); 
+   result ^= B[a&1]; a >>= 1;  B[1] = P[B[1]>>7] ^ (B[1] << 1); 
    result ^= B[a&1]; 
 
    return result;
@@ -335,9 +335,9 @@ static unsigned long gf_mult(unsigned long a, unsigned long b, unsigned long p)
 
 /* computes [y0 y1 y2 y3] = MDS . [x0] */
 #ifndef TWOFISH_TABLES
-static unsigned long mds_column_mult(unsigned char in, int col)
+static ulong32 mds_column_mult(unsigned char in, int col)
 {
-   unsigned long x01, x5B, xEF;
+   ulong32 x01, x5B, xEF;
 
    x01 = in;
    x5B = gf_mult(in, 0x5B, MDS_POLY);
@@ -379,7 +379,7 @@ static unsigned long mds_column_mult(unsigned char in, int col)
 static void mds_mult(const unsigned char *in, unsigned char *out)
 {
   int x;
-  unsigned long tmp;
+  ulong32 tmp;
   for (tmp = x = 0; x < 4; x++) {
       tmp ^= mds_column_mult(in[x], x);
   }
@@ -407,20 +407,20 @@ static void h_func(const unsigned char *in, unsigned char *out, unsigned char *M
 
   switch (k) {
      case 4:
-            y[0] = (unsigned char)(sbox(1, (unsigned long)y[0]) ^ M[4 * (6 + offset) + 0]);
-            y[1] = (unsigned char)(sbox(0, (unsigned long)y[1]) ^ M[4 * (6 + offset) + 1]);
-            y[2] = (unsigned char)(sbox(0, (unsigned long)y[2]) ^ M[4 * (6 + offset) + 2]);
-            y[3] = (unsigned char)(sbox(1, (unsigned long)y[3]) ^ M[4 * (6 + offset) + 3]);
+            y[0] = (unsigned char)(sbox(1, (ulong32)y[0]) ^ M[4 * (6 + offset) + 0]);
+            y[1] = (unsigned char)(sbox(0, (ulong32)y[1]) ^ M[4 * (6 + offset) + 1]);
+            y[2] = (unsigned char)(sbox(0, (ulong32)y[2]) ^ M[4 * (6 + offset) + 2]);
+            y[3] = (unsigned char)(sbox(1, (ulong32)y[3]) ^ M[4 * (6 + offset) + 3]);
      case 3:
-            y[0] = (unsigned char)(sbox(1, (unsigned long)y[0]) ^ M[4 * (4 + offset) + 0]);
-            y[1] = (unsigned char)(sbox(1, (unsigned long)y[1]) ^ M[4 * (4 + offset) + 1]);
-            y[2] = (unsigned char)(sbox(0, (unsigned long)y[2]) ^ M[4 * (4 + offset) + 2]);
-            y[3] = (unsigned char)(sbox(0, (unsigned long)y[3]) ^ M[4 * (4 + offset) + 3]);
+            y[0] = (unsigned char)(sbox(1, (ulong32)y[0]) ^ M[4 * (4 + offset) + 0]);
+            y[1] = (unsigned char)(sbox(1, (ulong32)y[1]) ^ M[4 * (4 + offset) + 1]);
+            y[2] = (unsigned char)(sbox(0, (ulong32)y[2]) ^ M[4 * (4 + offset) + 2]);
+            y[3] = (unsigned char)(sbox(0, (ulong32)y[3]) ^ M[4 * (4 + offset) + 3]);
      case 2:
-            y[0] = (unsigned char)(sbox(1, sbox(0, sbox(0, (unsigned long)y[0]) ^ M[4 * (2 + offset) + 0]) ^ M[4 * (0 + offset) + 0]));
-            y[1] = (unsigned char)(sbox(0, sbox(0, sbox(1, (unsigned long)y[1]) ^ M[4 * (2 + offset) + 1]) ^ M[4 * (0 + offset) + 1]));
-            y[2] = (unsigned char)(sbox(1, sbox(1, sbox(0, (unsigned long)y[2]) ^ M[4 * (2 + offset) + 2]) ^ M[4 * (0 + offset) + 2]));
-            y[3] = (unsigned char)(sbox(0, sbox(1, sbox(1, (unsigned long)y[3]) ^ M[4 * (2 + offset) + 3]) ^ M[4 * (0 + offset) + 3]));
+            y[0] = (unsigned char)(sbox(1, sbox(0, sbox(0, (ulong32)y[0]) ^ M[4 * (2 + offset) + 0]) ^ M[4 * (0 + offset) + 0]));
+            y[1] = (unsigned char)(sbox(0, sbox(0, sbox(1, (ulong32)y[1]) ^ M[4 * (2 + offset) + 1]) ^ M[4 * (0 + offset) + 1]));
+            y[2] = (unsigned char)(sbox(1, sbox(1, sbox(0, (ulong32)y[2]) ^ M[4 * (2 + offset) + 2]) ^ M[4 * (0 + offset) + 2]));
+            y[3] = (unsigned char)(sbox(0, sbox(1, sbox(1, (ulong32)y[3]) ^ M[4 * (2 + offset) + 3]) ^ M[4 * (0 + offset) + 3]));
   }
   mds_mult(y, out);
 }
@@ -442,13 +442,13 @@ static void h_func(const unsigned char *in, unsigned char *out, unsigned char *M
 #else
 
 #ifdef CLEAN_STACK
-static unsigned long _g_func(unsigned long x, symmetric_key *key)
+static ulong32 _g_func(ulong32 x, symmetric_key *key)
 #else
-static unsigned long g_func(unsigned long x, symmetric_key *key)
+static ulong32 g_func(ulong32 x, symmetric_key *key)
 #endif
 {
    unsigned char g, i, y, z;
-   unsigned long res;
+   ulong32 res;
 
    res = 0;
    for (y = 0; y < 4; y++) {
@@ -475,11 +475,11 @@ static unsigned long g_func(unsigned long x, symmetric_key *key)
 #define g1_func(x, key) g_func(ROL(x, 8), key)
 
 #ifdef CLEAN_STACK
-static unsigned long g_func(unsigned long x, symmetric_key *key)
+static ulong32 g_func(ulong32 x, symmetric_key *key)
 {
-    unsigned long y;
+    ulong32 y;
     y = _g_func(x, key);
-    burn_stack(sizeof(unsigned char) * 4 + sizeof(unsigned long));
+    burn_stack(sizeof(unsigned char) * 4 + sizeof(ulong32));
     return y;
 }
 #endif /* CLEAN_STACK */
@@ -493,13 +493,13 @@ int twofish_setup(const unsigned char *key, int keylen, int num_rounds, symmetri
 #endif
 {
 #ifndef TWOFISH_SMALL
-   unsigned long g;
+   ulong32 g;
    int z, i;
    unsigned char S[4*4];
 #endif
    int k, x, y, start;
    unsigned char tmp[4], tmp2[4], M[8*4];
-   unsigned long A, B;
+   ulong32 A, B;
 
    _ARGCHK(key != NULL);
    _ARGCHK(skey != NULL);
@@ -591,7 +591,7 @@ int twofish_setup(const unsigned char *key, int keylen, int num_rounds, symmetri
 {
    int x;
    x = _twofish_setup(key, keylen, num_rounds, skey);
-   burn_stack(sizeof(int) * 7 + sizeof(unsigned char) * 56 + sizeof(unsigned long) * 2);
+   burn_stack(sizeof(int) * 7 + sizeof(unsigned char) * 56 + sizeof(ulong32) * 2);
    return x;
 }
 #endif
@@ -602,10 +602,10 @@ static void _twofish_ecb_encrypt(const unsigned char *pt, unsigned char *ct, sym
 void twofish_ecb_encrypt(const unsigned char *pt, unsigned char *ct, symmetric_key *key)
 #endif
 {
-    unsigned long a,b,c,d,ta,tb,tc,td,t1,t2, *k;
+    ulong32 a,b,c,d,ta,tb,tc,td,t1,t2, *k;
     int r;
 #if !defined(TWOFISH_SMALL) && !defined(__GNUC__)
-    unsigned long *S1, *S2, *S3, *S4;
+    ulong32 *S1, *S2, *S3, *S4;
 #endif    
 
     _ARGCHK(pt != NULL);
@@ -656,7 +656,7 @@ void twofish_ecb_encrypt(const unsigned char *pt, unsigned char *ct, symmetric_k
 void twofish_ecb_encrypt(const unsigned char *pt, unsigned char *ct, symmetric_key *key)
 {
    _twofish_ecb_encrypt(pt, ct, key);
-   burn_stack(sizeof(unsigned long) * 10 + sizeof(int));
+   burn_stack(sizeof(ulong32) * 10 + sizeof(int));
 }
 #endif
 
@@ -666,10 +666,10 @@ static void _twofish_ecb_decrypt(const unsigned char *ct, unsigned char *pt, sym
 void twofish_ecb_decrypt(const unsigned char *ct, unsigned char *pt, symmetric_key *key)
 #endif
 {
-    unsigned long a,b,c,d,ta,tb,tc,td,t1,t2, *k;
+    ulong32 a,b,c,d,ta,tb,tc,td,t1,t2, *k;
     int r;
 #if !defined(TWOFISH_SMALL) && !defined(__GNUC__)
-    unsigned long *S1, *S2, *S3, *S4;
+    ulong32 *S1, *S2, *S3, *S4;
 #endif    
 
     _ARGCHK(pt != NULL);
@@ -723,7 +723,7 @@ void twofish_ecb_decrypt(const unsigned char *ct, unsigned char *pt, symmetric_k
 void twofish_ecb_decrypt(const unsigned char *ct, unsigned char *pt, symmetric_key *key)
 {
    _twofish_ecb_decrypt(ct, pt, key);
-   burn_stack(sizeof(unsigned long) * 10 + sizeof(int));
+   burn_stack(sizeof(ulong32) * 10 + sizeof(int));
 }
 #endif