added libtommath-0.11

2003-02-28 16:07:58 +00:00 · 2003-02-28 16:07:58 +00:00 · 33c5019985
commit 33c5019985
parent fb93a30a25
9 changed files with 885 additions and 828 deletions
--- a/b.bat
+++ b/b.bat
@ -1,3 +1,2 @@
-nasm -f coff timer.asm
-gcc -Wall -W -O3 -fomit-frame-pointer -funroll-loops -DTIMER_X86 demo.c bn.c timer.o -o ltmdemo
-rem gcc -I./mtest/ -DU_MPI -Wall -W -O3 -fomit-frame-pointer -funroll-loops -DTIMER_X86 demo.c mtest/mpi.c timer.o -o mpidemo
+nasm -f elf timer.asm
+gcc -Wall -W -O3 -fomit-frame-pointer -funroll-loops -DTIMER_X86 demo.c bn.c timer.o -o ltmdemo
--- a/bn.c
+++ b/bn.c
@ -99,7 +99,8 @@ void dump_timings(void)
   memset(&functime, 0, sizeof(functime));
   total = 0;
   for (x = 0; x < _itims; x++) {
-       total += timings[x].tot;
+       if (strcmp(timings[x].func, "_verify")) 
+          total += timings[x].tot;
       
       /* try to find this entry */
       for (y = 0; functime[y].func != NULL; y++) {
@ -1053,7 +1054,7 @@ static int fast_s_mp_mul_digs(mp_int *a, mp_int *b, mp_int *c, int digs)
   c->dp[digs-1]   = (mp_digit)(W[digs-1] & ((mp_word)MP_MASK));
   
   /* clear unused */
-   for (ix = c->used; ix < olduse; ix++) {
+   for (; ix < olduse; ix++) {
      c->dp[ix] = 0;
   }
  
@ -1194,13 +1195,13 @@ static int fast_s_mp_mul_high_digs(mp_int *a, mp_int *b, mp_int *c, int digs)
   c->used = newused;
   
   /* now convert the array W downto what we need */
-   for (ix = digs+1; ix < (pa+pb+1); ix++) {
+   for (ix = digs+1; ix < newused; ix++) {
       W[ix]       += (W[ix-1] >> ((mp_word)DIGIT_BIT));
       c->dp[ix-1] = (mp_digit)(W[ix-1] & ((mp_word)MP_MASK));
   }
   c->dp[(pa+pb+1)-1] = (mp_digit)(W[(pa+pb+1)-1] & ((mp_word)MP_MASK));
   
-   for (ix = c->used; ix < oldused; ix++) {
+   for (; ix < oldused; ix++) {
      c->dp[ix] = 0;
   }
   mp_clamp(c);
@ -1339,17 +1340,17 @@ static int fast_s_mp_sqr(mp_int *a, mp_int *b)
   b->used = newused;
   
   /* now compute digits */
-   for (ix = 1; ix < (pa+pa+1); ix++) {
+   for (ix = 1; ix < newused; ix++) {
       /* double/add next digit */
       W[ix]       += W[ix] + W2[ix];

       W[ix]       = W[ix] + (W[ix-1] >> ((mp_word)DIGIT_BIT));
       b->dp[ix-1] = (mp_digit)(W[ix-1] & ((mp_word)MP_MASK));
   }
-   b->dp[(pa+pa+1)-1] = (mp_digit)(W[(pa+pa+1)-1] & ((mp_word)MP_MASK));
+   b->dp[(newused)-1] = (mp_digit)(W[(newused)-1] & ((mp_word)MP_MASK));
   
   /* clear high */
-   for (ix = b->used; ix < olduse; ix++) {
+   for (; ix < olduse; ix++) {
       b->dp[ix] = 0;
   }
   
@ -1580,9 +1581,7 @@ static int mp_karatsuba_mul(mp_int *a, mp_int *b, mp_int *c)
   }
   
   mp_clamp(&x0);
-   mp_clamp(&x1);
   mp_clamp(&y0);
-   mp_clamp(&y1);
   
   /* now calc the products x0y0 and x1y1 */
   if (mp_mul(&x0, &y0, &x0y0) != MP_OKAY) goto X1Y1;             /* x0y0 = x0*y0 */
@ -1679,15 +1678,14 @@ static int mp_karatsuba_sqr(mp_int *a, mp_int *b)
   x1.used = a->used - B;
   
   mp_clamp(&x0);
-   mp_clamp(&x1);
   
   /* now calc the products x0*x0 and x1*x1 */
-   if (mp_sqr(&x0, &x0x0) != MP_OKAY) goto X1X1;                /* x0x0 = x0*x0 */
-   if (mp_sqr(&x1, &x1x1) != MP_OKAY) goto X1X1;                /* x1x1 = x1*x1 */
+   if (mp_sqr(&x0, &x0x0) != MP_OKAY) goto X1X1;                  /* x0x0 = x0*x0 */
+   if (mp_sqr(&x1, &x1x1) != MP_OKAY) goto X1X1;                  /* x1x1 = x1*x1 */

   /* now calc x1-x0 and y1-y0 */
   if (mp_sub(&x1, &x0, &t1) != MP_OKAY) goto X1X1;               /* t1 = x1 - x0 */
-   if (mp_sqr(&t1, &t1) != MP_OKAY) goto X1X1;                  /* t1 = (x1 - x0) * (y1 - y0) */
+   if (mp_sqr(&t1, &t1) != MP_OKAY) goto X1X1;                    /* t1 = (x1 - x0) * (y1 - y0) */

   /* add x0y0 */
   if (mp_add(&x0x0, &x1x1, &t2) != MP_OKAY) goto X1X1;           /* t2 = x0y0 + x1y1 */
@ -2760,8 +2758,7 @@ int mp_reduce_setup(mp_int *a, mp_int *b)
   VERIFY(a);
   VERIFY(b);
   
-   mp_set(a, 1);
-   if ((res = mp_lshd(a, b->used * 2)) != MP_OKAY) {
+   if ((res = mp_2expt(a, b->used * 2 * DIGIT_BIT)) != MP_OKAY) {
      DECFUNC();
      return res;
   }
@ -2876,7 +2873,6 @@ __T:  mp_clear(&t);
   return res;
 }   

-
 /* computes xR^-1 == x (mod N) via Montgomery Reduction (comba) */
 static int fast_mp_montgomery_reduce(mp_int *a, mp_int *m, mp_digit mp)
 {
@ -2884,29 +2880,53 @@ static int fast_mp_montgomery_reduce(mp_int *a, mp_int *m, mp_digit mp)
   mp_digit ui;
   mp_word  W[512];
   
+   REGFUNC("fast_mp_montgomery_reduce");
+   VERIFY(a);
+   VERIFY(m);
+   
   /* get old used count */
   olduse = a->used;
   
   /* grow a as required */
-   if (a->alloc < m->used*2+1) {
-      if ((res = mp_grow(a, m->used*2+1)) != MP_OKAY) {
+   if (a->alloc < m->used+1) {
+      if ((res = mp_grow(a, m->used+1)) != MP_OKAY) {
+         DECFUNC();
         return res;
      }
   }
   
-   /* copy and clear */
+   /* copy the digits of a */
   for (ix = 0; ix < a->used; ix++) {
       W[ix] = a->dp[ix];
   }
+   
+   /* zero the high words */
   for (; ix < m->used * 2 + 1; ix++) {
       W[ix] = 0;
   }
-   
+     
   for (ix = 0; ix < m->used; ix++) {
-       /* ui = ai * m' mod b */
+       /* ui = ai * m' mod b 
+        *
+        * We avoid a double precision multiplication (which isn't required)
+        * by casting the value down to a mp_digit.  Note this requires that W[ix-1] have
+        * the carry cleared (see after the inner loop)
+        */
       ui = (((mp_digit)(W[ix] & MP_MASK)) * mp) & MP_MASK;
       
-       /* a = a + ui * m * b^i */
+       /* a = a + ui * m * b^i 
+        *
+        * This is computed in place and on the fly.  The multiplication 
+        * by b^i is handled by offsetting which columns the results 
+        * are added to.
+        *
+        * Note the comba method normally doesn't handle carries in the inner loop
+        * In this case we fix the carry from the previous column since the Montgomery
+        * reduction requires digits of the result (so far) [see above] to work.  This is 
+        * handled by fixing up one carry after the inner loop.  The carry fixups are done
+        * in order so after these loops the first m->used words of W[] have the carries
+        * fixed
+        */       
       { 
          register int      iy;
          register mp_digit *tmpx;
@ -2916,32 +2936,36 @@ static int fast_mp_montgomery_reduce(mp_int *a, mp_int *m, mp_digit mp)
          tmpx = m->dp;
          _W   = W + ix;
          
+          /* inner loop */
          for (iy = 0; iy < m->used; iy++) {
              *_W++        += ((mp_word)ui) * ((mp_word)*tmpx++);
          }
-          
-          /* now fix carry for W[ix+1] */
-          W[ix+1] += W[ix] >> ((mp_word)DIGIT_BIT);
-          W[ix]   &= ((mp_word)MP_MASK);
       }
+
+       /* now fix carry for next digit, W[ix+1] */
+       W[ix+1] += W[ix] >> ((mp_word)DIGIT_BIT);
   }
   
   /* nox fix rest of carries */
-   for (; ix <= m->used * 2 + 1; ix++) {
+   for (++ix; ix <= m->used * 2 + 1; ix++) {
       W[ix]   += (W[ix-1] >> ((mp_word)DIGIT_BIT));
-       W[ix-1] &= ((mp_word)MP_MASK);
   }
   
-   /* copy out */
-
-   /* A = A/b^n */
+   /* copy out, A = A/b^n 
+    *
+    * The result is A/b^n but instead of converting from an array of mp_word
+    * to mp_digit then calling mp_rshd we just copy them in the right
+    * order 
+    */
   for (ix = 0; ix < m->used + 1; ix++) { 
-       a->dp[ix] = W[ix+m->used];
+       a->dp[ix] = W[ix+m->used] & ((mp_word)MP_MASK);
   }
   
+   /* set the max used */
   a->used = m->used + 1;

-   /* zero oldused digits */  
+   /* zero oldused digits, if the input a was larger than 
+    * m->used+1 we'll have to clear the digits */  
   for (; ix < olduse; ix++) {
       a->dp[ix] = 0;
   }
@ -2951,10 +2975,12 @@ static int fast_mp_montgomery_reduce(mp_int *a, mp_int *m, mp_digit mp)
   /* if A >= m then A = A - m */
   if (mp_cmp_mag(a, m) != MP_LT) {
      if ((res = s_mp_sub(a, m, a)) != MP_OKAY) {
+         DECFUNC();
         return res;
      }
-   }
+   }   
   
+   DECFUNC();
   return MP_OKAY;
 }

@ -3036,7 +3062,7 @@ int mp_montgomery_reduce(mp_int *a, mp_int *m, mp_digit mp)
 */
 static int mp_exptmod_fast(mp_int *G, mp_int *X, mp_int *P, mp_int *Y)
 {
-   mp_int M[64], res;
+   mp_int M[256], res;
   mp_digit buf, mp;
   int err, bitbuf, bitcpy, bitcnt, mode, digidx, x, y, winsize;
   
@ -3048,12 +3074,14 @@ static int mp_exptmod_fast(mp_int *G, mp_int *X, mp_int *P, mp_int *Y)
   
   /* find window size */
   x = mp_count_bits(X);
-        if (x <= 18)    { winsize = 2; }
-   else if (x <= 84)    { winsize = 3; }
-   else if (x <= 300)   { winsize = 4; }
-   else if (x <= 930)   { winsize = 5; }
-   else                 { winsize = 6; }
-   
+        if (x <= 7)     { winsize = 2; }
+   else if (x <= 36)    { winsize = 3; }
+   else if (x <= 140)   { winsize = 4; }
+   else if (x <= 450)   { winsize = 5; }
+   else if (x <= 1303)  { winsize = 6; }
+   else if (x <= 3529)  { winsize = 7; }
+   else                 { winsize = 8; }
+
   /* init G array */
   for (x = 0; x < (1<<winsize); x++) {
      if ((err = mp_init_size(&M[x], 1)) != MP_OKAY) {
@ -3072,15 +3100,14 @@ static int mp_exptmod_fast(mp_int *G, mp_int *X, mp_int *P, mp_int *Y)
   
   /* setup result */
   if ((err = mp_init(&res)) != MP_OKAY) {
-      goto __M;
+      goto __RES;
   }

   /* now we need R mod m */
-   mp_set(&res, 1);           
-   if ((err = mp_lshd(&res, P->used)) != MP_OKAY) {
+   if ((err = mp_2expt(&res, P->used * DIGIT_BIT)) != MP_OKAY) {
      goto __RES;
   }
-   
+      
   /* res = R mod m */
   if ((err = mp_mod(&res, P, &res)) != MP_OKAY) {
      goto __RES;
@ -3092,7 +3119,6 @@ static int mp_exptmod_fast(mp_int *G, mp_int *X, mp_int *P, mp_int *Y)
    *
    * The first half of the table is not computed though accept for M[0] and M[1]
    */
-   mp_set(&M[0], 1);
   if ((err = mp_mod(G, P, &M[1])) != MP_OKAY) {
      goto __RES;
   }
@ -3101,7 +3127,7 @@ static int mp_exptmod_fast(mp_int *G, mp_int *X, mp_int *P, mp_int *Y)
   if ((err = mp_mulmod(&M[1], &res, P, &M[1])) != MP_OKAY) {
      goto __RES;
   }
-   
+      
   /* compute the value at M[1<<(winsize-1)] by squaring M[1] (winsize-1) times */
   if ((err = mp_copy(&M[1], &M[1<<(winsize-1)])) != MP_OKAY) {
      goto __RES;
@ -3236,10 +3262,9 @@ __M  :
   return err;
 }

-
 int mp_exptmod(mp_int *G, mp_int *X, mp_int *P, mp_int *Y)
 {
-   mp_int M[64], res, mu;
+   mp_int M[256], res, mu;
   mp_digit buf;
   int err, bitbuf, bitcpy, bitcnt, mode, digidx, x, y, winsize;
   
@ -3258,11 +3283,13 @@ int mp_exptmod(mp_int *G, mp_int *X, mp_int *P, mp_int *Y)

   /* find window size */
   x = mp_count_bits(X);
-        if (x <= 18)    { winsize = 2; }
-   else if (x <= 84)    { winsize = 3; }
-   else if (x <= 300)   { winsize = 4; }
-   else if (x <= 930)   { winsize = 5; }
-   else                 { winsize = 6; }
+        if (x <= 7)     { winsize = 2; }
+   else if (x <= 36)    { winsize = 3; }
+   else if (x <= 140)   { winsize = 4; }
+   else if (x <= 450)   { winsize = 5; }
+   else if (x <= 1303)  { winsize = 6; }
+   else if (x <= 3529)  { winsize = 7; }
+   else                 { winsize = 8; }
   
   /* init G array */
   for (x = 0; x < (1<<winsize); x++) {
@ -3289,7 +3316,6 @@ int mp_exptmod(mp_int *G, mp_int *X, mp_int *P, mp_int *Y)
    *
    * The first half of the table is not computed though accept for M[0] and M[1]
    */
-   mp_set(&M[0], 1);
   if ((err = mp_mod(G, P, &M[1])) != MP_OKAY) {
      goto __MU;
   }
@ -3430,6 +3456,22 @@ __M  :
   return err;
 }

+/* computes a = 2^b */
+int mp_2expt(mp_int *a, int b)
+{
+   int res;
+   
+   mp_zero(a);
+   if ((res = mp_grow(a, b/DIGIT_BIT + 1)) != MP_OKAY) {
+      return res;
+   }
+   a->used = b/DIGIT_BIT + 1;
+   a->dp[b/DIGIT_BIT] = 1 << (b % DIGIT_BIT);
+   
+   return MP_OKAY;
+}   
+   
+
 /* find the n'th root of an integer 
 *
 * Result found such that (c)^b <= a and (c+1)^b > a 
--- a/bn.h
+++ b/bn.h
@ -158,6 +158,9 @@ int mp_mul_2(mp_int *a, mp_int *b);
 /* c = a mod 2^d */
 int mp_mod_2d(mp_int *a, int b, mp_int *c);

+/* computes a = 2^b */
+int mp_2expt(mp_int *a, int b);
+
 /* ---> Basic arithmetic <--- */

 /* b = -a */
--- a/bn.pdf
+++ b/bn.pdf
--- a/bn.tex
+++ b/bn.tex
--- a/changes.txt
+++ b/changes.txt
@ -1,70 +1,76 @@
-Jan 9th, 2003
-v0.10  -- Pekka Riikonen suggested fixes to the radix conversion code.  
-       -- Added baseline montgomery and comba montgomery reductions, sped up exptmods
-          [to a point, see bn.h for MONTGOMERY_EXPT_CUTOFF]
-       
-Jan 6th, 2003
-v0.09  -- Updated the manual to reflect recent changes.  :-)
-       -- Added Jacobi function (mp_jacobi) to supplement the number theory side of the lib
-       -- Added a Mersenne prime finder demo in ./etc/mersenne.c
-
-Jan 2nd, 2003
-v0.08  -- Sped up the multipliers by moving the inner loop variables into a smaller scope
-       -- Corrected a bunch of small "warnings"
-       -- Added more comments
-       -- Made "mtest" be able to use /dev/random, /dev/urandom or stdin for RNG data
-       -- Corrected some bugs where error messages were potentially ignored
-       -- add etc/pprime.c program which makes numbers which are provably prime.
-       
-Jan 1st, 2003
-v0.07  -- Removed alot of heap operations from core functions to speed them up
-       -- Added a root finding function [and mp_sqrt macro like from MPI]
-       -- Added more to manual 
-
-Dec 31st, 2002
-v0.06  -- Sped up the s_mp_add, s_mp_sub which inturn sped up mp_invmod, mp_exptmod, etc...
-       -- Cleaned up the header a bit more
-       
-Dec 30th, 2002
-v0.05  -- Builds with MSVC out of the box
-       -- Fixed a bug in mp_invmod w.r.t. even moduli
-       -- Made mp_toradix and mp_read_radix use char instead of unsigned char arrays
-       -- Fixed up exptmod to use fewer multiplications
-       -- Fixed up mp_init_size to use only one heap operation
-          -- Note there is a slight "off-by-one" bug in the library somewhere
-             without the padding (see the source for comment) the library 
-             crashes in libtomcrypt.  Anyways a reasonable workaround is to pad the
-             numbers which will always correct it since as the numbers grow the padding
-             will still be beyond the end of the number
-       -- Added more to the manual
-       
-Dec 29th, 2002
-v0.04  -- Fixed a memory leak in mp_to_unsigned_bin
-       -- optimized invmod code
-       -- Fixed bug in mp_div
-       -- use exchange instead of copy for results
-       -- added a bit more to the manual
-       
-Dec 27th, 2002
-v0.03  -- Sped up s_mp_mul_high_digs by not computing the carries of the lower digits
-       -- Fixed a bug where mp_set_int wouldn't zero the value first and set the used member.
-       -- fixed a bug in s_mp_mul_high_digs where the limit placed on the result digits was not calculated properly
-       -- fixed bugs in add/sub/mul/sqr_mod functions where if the modulus and dest were the same it wouldn't work
-       -- fixed a bug in mp_mod and mp_mod_d concerning negative inputs
-       -- mp_mul_d didn't preserve sign
-       -- Many many many many fixes
-       -- Works in LibTomCrypt now :-)
-       -- Added iterations to the timing demos... more accurate.
-       -- Tom needs a job.       
-
-Dec 26th, 2002
-v0.02  -- Fixed a few "slips" in the manual.  This is "LibTomMath" afterall :-)
-       -- Added mp_cmp_mag, mp_neg, mp_abs and mp_radix_size that were missing.
-       -- Sped up the fast [comba] multipliers more [yahoo!]
-
-Dec 25th,2002
-v0.01  -- Initial release.  Gimme a break.
-       -- Todo list, 
-           add details to manual [e.g. algorithms]
-           more comments in code
-           example programs
+Jan 15th, 2003
+v0.11  -- More subtle fixes
+       -- Moved to gentoo linux [hurrah!] so made *nix specific fixes to the make process
+       -- Sped up the montgomery reduction code quite a bit
+       -- fixed up demo so when building timing for the x86 it assumes ELF format now
+       
+Jan 9th, 2003
+v0.10  -- Pekka Riikonen suggested fixes to the radix conversion code.  
+       -- Added baseline montgomery and comba montgomery reductions, sped up exptmods
+          [to a point, see bn.h for MONTGOMERY_EXPT_CUTOFF]
+       
+Jan 6th, 2003
+v0.09  -- Updated the manual to reflect recent changes.  :-)
+       -- Added Jacobi function (mp_jacobi) to supplement the number theory side of the lib
+       -- Added a Mersenne prime finder demo in ./etc/mersenne.c
+
+Jan 2nd, 2003
+v0.08  -- Sped up the multipliers by moving the inner loop variables into a smaller scope
+       -- Corrected a bunch of small "warnings"
+       -- Added more comments
+       -- Made "mtest" be able to use /dev/random, /dev/urandom or stdin for RNG data
+       -- Corrected some bugs where error messages were potentially ignored
+       -- add etc/pprime.c program which makes numbers which are provably prime.
+       
+Jan 1st, 2003
+v0.07  -- Removed alot of heap operations from core functions to speed them up
+       -- Added a root finding function [and mp_sqrt macro like from MPI]
+       -- Added more to manual 
+
+Dec 31st, 2002
+v0.06  -- Sped up the s_mp_add, s_mp_sub which inturn sped up mp_invmod, mp_exptmod, etc...
+       -- Cleaned up the header a bit more
+       
+Dec 30th, 2002
+v0.05  -- Builds with MSVC out of the box
+       -- Fixed a bug in mp_invmod w.r.t. even moduli
+       -- Made mp_toradix and mp_read_radix use char instead of unsigned char arrays
+       -- Fixed up exptmod to use fewer multiplications
+       -- Fixed up mp_init_size to use only one heap operation
+          -- Note there is a slight "off-by-one" bug in the library somewhere
+             without the padding (see the source for comment) the library 
+             crashes in libtomcrypt.  Anyways a reasonable workaround is to pad the
+             numbers which will always correct it since as the numbers grow the padding
+             will still be beyond the end of the number
+       -- Added more to the manual
+       
+Dec 29th, 2002
+v0.04  -- Fixed a memory leak in mp_to_unsigned_bin
+       -- optimized invmod code
+       -- Fixed bug in mp_div
+       -- use exchange instead of copy for results
+       -- added a bit more to the manual
+       
+Dec 27th, 2002
+v0.03  -- Sped up s_mp_mul_high_digs by not computing the carries of the lower digits
+       -- Fixed a bug where mp_set_int wouldn't zero the value first and set the used member.
+       -- fixed a bug in s_mp_mul_high_digs where the limit placed on the result digits was not calculated properly
+       -- fixed bugs in add/sub/mul/sqr_mod functions where if the modulus and dest were the same it wouldn't work
+       -- fixed a bug in mp_mod and mp_mod_d concerning negative inputs
+       -- mp_mul_d didn't preserve sign
+       -- Many many many many fixes
+       -- Works in LibTomCrypt now :-)
+       -- Added iterations to the timing demos... more accurate.
+       -- Tom needs a job.       
+
+Dec 26th, 2002
+v0.02  -- Fixed a few "slips" in the manual.  This is "LibTomMath" afterall :-)
+       -- Added mp_cmp_mag, mp_neg, mp_abs and mp_radix_size that were missing.
+       -- Sped up the fast [comba] multipliers more [yahoo!]
+
+Dec 25th,2002
+v0.01  -- Initial release.  Gimme a break.
+       -- Todo list, 
+           add details to manual [e.g. algorithms]
+           more comments in code
+           example programs
--- a/demo.c
+++ b/demo.c
@ -19,8 +19,10 @@

 #ifdef TIMER_X86
 #define TIMER
-extern ulong64 rdtsc(void);
-extern void reset(void);
+extern ulong64 _rdtsc(void);
+extern void _reset(void);
+ulong64 rdtsc(void) { return _rdtsc(); }   /* forwards to _rdtsc in timer.asm: the TSC minus the value saved by the last reset */
+void reset(void) { _reset(); }             /* forwards to _reset in timer.asm: saves the current TSC as the new baseline */
 #endif

 #ifdef TIMER
@ -85,7 +87,6 @@ int main(void)
   mp_int a, b, c, d, e, f;
   unsigned long expt_n, add_n, sub_n, mul_n, div_n, sqr_n, mul2d_n, div2d_n, gcd_n, lcm_n, inv_n;
   int rr;
-   mp_digit tom;
   
 #ifdef TIMER
   int n;
@ -99,42 +100,33 @@ int main(void)
   mp_init(&e);
   mp_init(&f);
   
-   mp_read_radix(&a, "59994534535345535344389423", 10);
-   mp_read_radix(&b, "49993453555234234565675534", 10);
-   mp_read_radix(&c, "62398923474472948723847281", 10);
-    
-   mp_mulmod(&a, &b, &c, &f);
-   
-   /* setup mont */
-   mp_montgomery_setup(&c, &tom);
-   mp_mul(&a, &b, &a);
-   mp_montgomery_reduce(&a, &c, tom);
-   mp_montgomery_reduce(&a, &c, tom);
-   mp_lshd(&a, c.used*2);
-   mp_mod(&a, &c, &a);
-   
-   mp_toradix(&a, cmd, 10);
-   printf("%s\n\n", cmd);
-   mp_toradix(&f, cmd, 10);
-   printf("%s\n", cmd);
-   
-/*   return 0; */
-   
-   
-   mp_read_radix(&a, "V//////////////////////////////////////////////////////////////////////////////////////", 64);
-   mp_reduce_setup(&b, &a);
-   printf("\n\n----\n\n");
-   mp_toradix(&b, buf, 10);
-   printf("b == %s\n\n\n", buf);
-
-   mp_read_radix(&b, "4982748972349724892742", 10);
-   mp_sub_d(&a, 1, &c);
-   mp_exptmod(&b, &c, &a, &d);
-   mp_toradix(&d, buf, 10);
-   printf("b^p-1 == %s\n", buf);
-   
+#ifdef DEBUG
+   mp_read_radix(&a, "347743159439876626079252796797422223177535447388206607607181663903045907591201940478223621722118173270898487582987137708656414344685816179420855160986340457973820182883508387588163122354089264395604796675278966117567294812714812796820596564876450716066283126720010859041484786529056457896367683122960411136319", 10);
+   mp_read_radix(&b, "347743159439876626079252796797422223177535447388206607607181663903045907591201940478223621722118173270898487582987137708656414344685816179420855160986340457973820182883508387588163122354089264395604796675278966117567294812714812796820596564876450716066283126720010859041484786529056457896367683122960411136318", 10);
+   mp_set(&c, 1);
+   reset_timings();
+   mp_exptmod(&c, &b, &a, &d);
+   mp_exptmod(&c, &b, &a, &d);
+   mp_exptmod(&c, &b, &a, &d);
+   mp_exptmod(&c, &b, &a, &d);
+   mp_exptmod(&c, &b, &a, &d);
+   mp_exptmod(&c, &b, &a, &d);
+   mp_exptmod(&c, &b, &a, &d);
+   mp_exptmod(&c, &b, &a, &d);
+   mp_exptmod(&c, &b, &a, &d);
+   mp_exptmod(&c, &b, &a, &d);
+   mp_exptmod(&c, &b, &a, &d);
+   mp_exptmod(&c, &b, &a, &d);
+   mp_exptmod(&c, &b, &a, &d);
+   mp_exptmod(&c, &b, &a, &d);
+   mp_exptmod(&c, &b, &a, &d);
+   mp_exptmod(&c, &b, &a, &d);
+   dump_timings();
+   return 0;
+#endif   
      
 #ifdef TIMER      
+goto expt;
      mp_read_radix(&a, "340282366920938463463374607431768211455", 10);
      mp_read_radix(&b, "340282366920938463463574607431768211455", 10);
      while (a.used * DIGIT_BIT < 8192) {
@ -182,7 +174,7 @@ int main(void)
      printf("Multiplying %d-bit took %llu cycles\n", mp_count_bits(&a), tt / ((ulong64)100000));
      mp_copy(&b, &a);
   }
-
+expt:
   {
      char *primes[] = {
         "17933601194860113372237070562165128350027320072176844226673287945873370751245439587792371960615073855669274087805055507977323024886880985062002853331424203",
@ -206,7 +198,7 @@ int main(void)
      mp_mod(&b, &c, &b);
      mp_set(&c, 3);
      reset();
-      for (rr = 0; rr < 35; rr++) {
+      for (rr = 0; rr < 100; rr++) {
          mp_exptmod(&c, &b, &a, &d);
      }
      tt = rdtsc();
@ -219,7 +211,7 @@ int main(void)
         draw(&d);
         exit(0);
      }
-      printf("Exponentiating %d-bit took %llu cycles\n", mp_count_bits(&a), tt / ((ulong64)35));
+      printf("Exponentiating %d-bit took %llu cycles\n", mp_count_bits(&a), tt / ((ulong64)100));
   }
   }   

--- a/12
+++ b/12
@ -1,13 +1,13 @@
 CC = gcc
-CFLAGS  +=  -Wall -W -Wshadow -ansi -O3 -fomit-frame-pointer -funroll-loops 
+CFLAGS  +=  -Wall -W -Wshadow -ansi -O3 -fomit-frame-pointer -funroll-loops

-VERSION=0.10
+VERSION=0.11

 default: test

 test: bn.o demo.o
 	$(CC) bn.o demo.o -o demo
-	cd mtest ; gcc $(CFLAGS) mtest.c -o mtest.exe -s
+	cd mtest ; gcc $(CFLAGS) mtest.c -o mtest -s

 # builds the x86 demo
 test86:
@ -22,9 +22,9 @@ docs:	docdvi
 	rm -f bn.log bn.aux bn.dvi
 	
 clean:
-	rm -f *.pdf *.o *.exe mtest/*.exe etc/*.exe bn.log bn.aux bn.dvi *.s 
+	rm -f *.pdf *.o *.exe demo mtest/mtest mtest/*.exe etc/*.exe bn.log bn.aux bn.dvi *.log *.s etc/pprime etc/mersenne

 zipup: clean docs
-	chdir .. ; rm -rf ltm* libtommath-$(VERSION) ; mkdir libtommath-$(VERSION) ; \
+	cd .. ; rm -rf ltm* libtommath-$(VERSION) ; mkdir libtommath-$(VERSION) ; \
 	cp -R ./libtommath/* ./libtommath-$(VERSION)/ ; tar -c libtommath-$(VERSION)/* > ltm-$(VERSION).tar ; \
-	bzip2 -9vv ltm-$(VERSION).tar ; zip -9 -r ltm-$(VERSION).zip libtommath-$(VERSION)/*
+	bzip2 -9vv ltm-$(VERSION).tar ; zip -9 -r ltm-$(VERSION).zip libtommath-$(VERSION)/*
--- a/timer.asm
+++ b/timer.asm
@ -1,34 +1,34 @@
-; Simple RDTSC reader for NASM
-;
-; build with "nasm -f ___ timer.asm" where ___ is coff or elf [or whatever]
-;
-; Most *nix installs use elf so it would be "nasm -f elf timer.asm"
-;
-; Tom St Denis
-[bits 32]
-[section .data]
-timer dd 0, 0
-[section .text]
-
-[global _gettsc]
-_gettsc:
-   rdtsc
-   ret
-
-[global _rdtsc]
-_rdtsc:
-   rdtsc
-   sub eax,[timer]
-   sbb edx,[timer+4]
-   ret
-
-[global _reset]
-_reset:
-   push eax
-   push edx
-   rdtsc
-   mov [timer],eax 
-   mov [timer+4],edx
-   pop edx
-   pop eax
-   ret
+; Simple RDTSC reader for NASM
+;
+; build with "nasm -f ___ timer.asm" where ___ is coff or elf [or whatever]
+;
+; Most *nix installs use elf so it would be "nasm -f elf timer.asm"
+;
+; Tom St Denis
+[bits 32]
+[section .data]
+timer dd 0, 0
+[section .text]
+
+[global _gettsc]
+_gettsc:
+   rdtsc
+   ret
+
+[global _rdtsc]
+_rdtsc:
+   rdtsc
+   sub eax,[timer]
+   sbb edx,[timer+4]
+   ret
+
+[global _reset]
+_reset:
+   push eax
+   push edx
+   rdtsc
+   mov [timer],eax 
+   mov [timer+4],edx
+   pop edx
+   pop eax
+   ret