added libtommath-0.17

2003-05-17 12:33:54 +00:00 · 2003-05-17 12:33:54 +00:00 · fd181cc841
commit fd181cc841
parent 14161e843e
87 changed files with 14780 additions and 6958 deletions
--- a/bn.pdf
+++ b/bn.pdf
--- a/bn.tex
+++ b/bn.tex
@ -1,7 +1,7 @@
-\documentclass[]{report}
+\documentclass[]{article}
 \begin{document}

-\title{LibTomMath v0.16 \\ A Free Multiple Precision Integer Library \\ http://math.libtomcrypt.org }
+\title{LibTomMath v0.17 \\ A Free Multiple Precision Integer Library \\ http://math.libtomcrypt.org }
 \author{Tom St Denis \\ tomstdenis@iahu.ca}
 \maketitle
 \newpage
--- a/bn_fast_mp_invmod.c
+++ b/bn_fast_mp_invmod.c
@ -27,41 +27,18 @@ fast_mp_invmod (mp_int * a, mp_int * b, mp_int * c)
  int     res, neg;

  /* init all our temps */
-  if ((res = mp_init (&x)) != MP_OKAY) {
-    goto __ERR;
-  }
-
-  if ((res = mp_init (&y)) != MP_OKAY) {
-    goto __X;
-  }
-
-  if ((res = mp_init (&u)) != MP_OKAY) {
-    goto __Y;
-  }
-
-  if ((res = mp_init (&v)) != MP_OKAY) {
-    goto __U;
-  }
-
-  if ((res = mp_init (&B)) != MP_OKAY) {
-    goto __V;
-  }
-
-  if ((res = mp_init (&D)) != MP_OKAY) {
-    goto __B;
+  if ((res = mp_init_multi(&x, &y, &u, &v, &B, &D, NULL)) != MP_OKAY) {
+     return res;
  }

  /* x == modulus, y == value to invert */
  if ((res = mp_copy (b, &x)) != MP_OKAY) {
-    goto __D;
-  }
-  if ((res = mp_copy (a, &y)) != MP_OKAY) {
-    goto __D;
+    goto __ERR;
  }

-  /* we need |y| */
-  if ((res = mp_abs (&y, &y)) != MP_OKAY) {
-    goto __D;
+  /* we need y = |a| */
+  if ((res = mp_abs (a, &y)) != MP_OKAY) {
+    goto __ERR;
  }

  /* 2. [modified] if x,y are both even then return an error! 
@ -70,15 +47,15 @@ fast_mp_invmod (mp_int * a, mp_int * b, mp_int * c)
   */
  if (mp_iseven (&x) == 1 && mp_iseven (&y) == 1) {
    res = MP_VAL;
-    goto __D;
+    goto __ERR;
  }

  /* 3. u=x, v=y, A=1, B=0, C=0,D=1 */
  if ((res = mp_copy (&x, &u)) != MP_OKAY) {
-    goto __D;
+    goto __ERR;
  }
  if ((res = mp_copy (&y, &v)) != MP_OKAY) {
-    goto __D;
+    goto __ERR;
  }
  mp_set (&D, 1);

@ -87,17 +64,17 @@ top:
  while (mp_iseven (&u) == 1) {
    /* 4.1 u = u/2 */
    if ((res = mp_div_2 (&u, &u)) != MP_OKAY) {
-      goto __D;
+      goto __ERR;
    }
    /* 4.2 if A or B is odd then */
    if (mp_iseven (&B) == 0) {
      if ((res = mp_sub (&B, &x, &B)) != MP_OKAY) {
-	goto __D;
+        goto __ERR;
      }
    }
    /* B = B/2 */
    if ((res = mp_div_2 (&B, &B)) != MP_OKAY) {
-      goto __D;
+      goto __ERR;
    }
  }

@ -105,18 +82,18 @@ top:
  while (mp_iseven (&v) == 1) {
    /* 5.1 v = v/2 */
    if ((res = mp_div_2 (&v, &v)) != MP_OKAY) {
-      goto __D;
+      goto __ERR;
    }
    /* 5.2 if C,D are even then */
    if (mp_iseven (&D) == 0) {
      /* D = (D-x)/2 */
      if ((res = mp_sub (&D, &x, &D)) != MP_OKAY) {
-	goto __D;
+        goto __ERR;
      }
    }
    /* D = D/2 */
    if ((res = mp_div_2 (&D, &D)) != MP_OKAY) {
-      goto __D;
+      goto __ERR;
    }
  }

@ -124,20 +101,20 @@ top:
  if (mp_cmp (&u, &v) != MP_LT) {
    /* u = u - v, B = B - D */
    if ((res = mp_sub (&u, &v, &u)) != MP_OKAY) {
-      goto __D;
+      goto __ERR;
    }

    if ((res = mp_sub (&B, &D, &B)) != MP_OKAY) {
-      goto __D;
+      goto __ERR;
    }
  } else {
    /* v - v - u, D = D - B */
    if ((res = mp_sub (&v, &u, &v)) != MP_OKAY) {
-      goto __D;
+      goto __ERR;
    }

    if ((res = mp_sub (&D, &B, &D)) != MP_OKAY) {
-      goto __D;
+      goto __ERR;
    }
  }

@ -151,26 +128,20 @@ top:
  /* if v != 1 then there is no inverse */
  if (mp_cmp_d (&v, 1) != MP_EQ) {
    res = MP_VAL;
-    goto __D;
+    goto __ERR;
  }

  /* b is now the inverse */
  neg = a->sign;
  while (D.sign == MP_NEG) {
    if ((res = mp_add (&D, b, &D)) != MP_OKAY) {
-      goto __D;
+      goto __ERR;
    }
  }
  mp_exch (&D, c);
  c->sign = neg;
  res = MP_OKAY;

-__D:mp_clear (&D);
-__B:mp_clear (&B);
-__V:mp_clear (&v);
-__U:mp_clear (&u);
-__Y:mp_clear (&y);
-__X:mp_clear (&x);
-__ERR:
+__ERR:mp_clear_multi (&x, &y, &u, &v, &B, &D, NULL);
  return res;
 }
--- a/bn_fast_mp_montgomery_reduce.c
+++ b/bn_fast_mp_montgomery_reduce.c
@ -26,7 +26,7 @@ int
 fast_mp_montgomery_reduce (mp_int * a, mp_int * m, mp_digit mp)
 {
  int     ix, res, olduse;
-  mp_word W[512];
+  mp_word W[MP_WARRAY];

  /* get old used count */
  olduse = a->used;
@ -92,7 +92,7 @@ fast_mp_montgomery_reduce (mp_int * a, mp_int * m, mp_digit mp)

      /* inner loop */
      for (iy = 0; iy < m->used; iy++) {
-	*_W++ += ((mp_word) ui) * ((mp_word) * tmpx++);
+    *_W++ += ((mp_word) ui) * ((mp_word) * tmpx++);
      }
    }

--- a/bn_fast_s_mp_mul_digs.c
+++ b/bn_fast_s_mp_mul_digs.c
@ -16,14 +16,16 @@

 /* Fast (comba) multiplier
 *
- * This is the fast column-array [comba] multiplier.  It is designed to compute
- * the columns of the product first then handle the carries afterwards.  This
- * has the effect of making the nested loops that compute the columns very
+ * This is the fast column-array [comba] multiplier.  It is 
+ * designed to compute the columns of the product first 
+ * then handle the carries afterwards.  This has the effect 
+ * of making the nested loops that compute the columns very
 * simple and schedulable on super-scalar processors.
 *
- * This has been modified to produce a variable number of digits of output so
- * if say only a half-product is required you don't have to compute the upper half
- * (a feature required for fast Barrett reduction).
+ * This has been modified to produce a variable number of 
+ * digits of output so if say only a half-product is required 
+ * you don't have to compute the upper half (a feature 
+ * required for fast Barrett reduction).
 *
 * Based on Algorithm 14.12 on pp.595 of HAC.
 *
@ -32,7 +34,7 @@ int
 fast_s_mp_mul_digs (mp_int * a, mp_int * b, mp_int * c, int digs)
 {
  int     olduse, res, pa, ix;
-  mp_word W[512];
+  mp_word W[MP_WARRAY];

  /* grow the destination as required */
  if (c->alloc < digs) {
@ -47,10 +49,9 @@ fast_s_mp_mul_digs (mp_int * a, mp_int * b, mp_int * c, int digs)
  /* calculate the columns */
  pa = a->used;
  for (ix = 0; ix < pa; ix++) {
-
-    /* this multiplier has been modified to allow you to control how many digits 
-     * of output are produced.  So at most we want to make upto "digs" digits
-     * of output.
+    /* this multiplier has been modified to allow you to 
+     * control how many digits of output are produced.  
+     * So at most we want to make up to "digs" digits of output.
     *
     * this adds products to distinct columns (at ix+iy) of W
     * note that each step through the loop is not dependent on
@ -73,14 +74,14 @@ fast_s_mp_mul_digs (mp_int * a, mp_int * b, mp_int * c, int digs)
       */
      _W = W + ix;

-      /* the number of digits is limited by their placement.  E.g. 
+      /* the number of digits is limited by their placement.  E.g.
         we avoid multiplying digits that will end up above the # of
         digits of precision requested
       */
      pb = MIN (b->used, digs - ix);

      for (iy = 0; iy < pb; iy++) {
-	*_W++ += ((mp_word) tmpx) * ((mp_word) * tmpy++);
+        *_W++ += ((mp_word) tmpx) * ((mp_word) * tmpy++);
      }
    }

@ -97,11 +98,12 @@ fast_s_mp_mul_digs (mp_int * a, mp_int * b, mp_int * c, int digs)
     * correct result we must take the extra bits from each column and
     * carry them down
     *
-     * Note that while this adds extra code to the multiplier it saves time
-     * since the carry propagation is removed from the above nested loop.
-     * This has the effect of reducing the work from N*(N+N*c)==N^2 + c*N^2 to
-     * N^2 + N*c where c is the cost of the shifting.  On very small numbers
-     * this is slower but on most cryptographic size numbers it is faster.
+     * Note that while this adds extra code to the multiplier it 
+     * saves time since the carry propagation is removed from the 
+     * above nested loop.  This has the effect of reducing the work 
+     * from N*(N+N*c)==N**2 + c*N**2 to N**2 + N*c where c is the 
+     * cost of the shifting.  On very small numbers this is slower 
+     * but on most cryptographic size numbers it is faster.
     */
    tmpc = c->dp;
    for (ix = 1; ix < digs; ix++) {
--- a/bn_fast_s_mp_mul_high_digs.c
+++ b/bn_fast_s_mp_mul_high_digs.c
@ -27,7 +27,7 @@ int
 fast_s_mp_mul_high_digs (mp_int * a, mp_int * b, mp_int * c, int digs)
 {
  int     oldused, newused, res, pa, pb, ix;
-  mp_word W[512];
+  mp_word W[MP_WARRAY];

  /* calculate size of product and allocate more space if required */
  newused = a->used + b->used + 1;
@ -55,15 +55,23 @@ fast_s_mp_mul_high_digs (mp_int * a, mp_int * b, mp_int * c, int digs)

      /* alias for right side */
      tmpy = b->dp + iy;
-
+     
      /* alias for the columns of output.  Offset to be equal to or above the 
       * smallest digit place requested 
       */
-      _W = &(W[digs]);
+      _W = W + digs;     
+      
+      /* skip cases below zero where ix > digs */
+      if (iy < 0) {
+         iy    = abs(iy);
+         tmpy += iy;
+         _W   += iy;
+         iy    = 0;
+      }

      /* compute column products for digits above the minimum */
      for (; iy < pb; iy++) {
-	*_W++ += ((mp_word) tmpx) * ((mp_word) * tmpy++);
+    *_W++ += ((mp_word) tmpx) * ((mp_word) * tmpy++);
      }
    }
  }
--- a/bn_fast_s_mp_sqr.c
+++ b/bn_fast_s_mp_sqr.c
@ -20,7 +20,7 @@
 * then the carries are computed.  This has the effect of making a very simple
 * inner loop that is executed the most
 *
- * W2 represents the outer products and W the inner.  
+ * W2 represents the outer products and W the inner.
 *
 * A further optimizations is made because the inner products are of the form
 * "A * B * 2".  The *2 part does not need to be computed until the end which is
@ -33,7 +33,7 @@ int
 fast_s_mp_sqr (mp_int * a, mp_int * b)
 {
  int     olduse, newused, res, ix, pa;
-  mp_word W2[512], W[512];
+  mp_word W2[MP_WARRAY], W[MP_WARRAY];

  /* calculate size of product and allocate as required */
  pa = a->used;
@ -44,9 +44,9 @@ fast_s_mp_sqr (mp_int * a, mp_int * b)
    }
  }

-  /* zero temp buffer (columns) 
+  /* zero temp buffer (columns)
   * Note that there are two buffers.  Since squaring requires
-   * a outter and inner product and the inner product requires 
+   * a outter and inner product and the inner product requires
   * computing a product and doubling it (a relatively expensive
   * op to perform n^2 times if you don't have to) the inner and
   * outer products are computed in different buffers.  This way
@ -60,7 +60,7 @@ fast_s_mp_sqr (mp_int * a, mp_int * b)
 * values in W2 are only written in even locations which means
 * we can collapse the array to 256 words [and fixup the memset above]
 * provided we also fix up the summations below.  Ideally
- * the fixup loop should be unrolled twice to handle the even/odd 
+ * the fixup loop should be unrolled twice to handle the even/odd
 * cases, and then a final step to handle odd cases [e.g. newused == odd]
 *
 * This will not only save ~8*256 = 2KB of stack but lower the number of
@ -71,10 +71,10 @@ fast_s_mp_sqr (mp_int * a, mp_int * b)
   * the multiplication by two is done afterwards in the N loop.
   */
  for (ix = 0; ix < pa; ix++) {
-    /* compute the outer product 
+    /* compute the outer product
     *
-     * Note that every outer product is computed 
-     * for a particular column only once which means that 
+     * Note that every outer product is computed
+     * for a particular column only once which means that
     * there is no need todo a double precision addition
     */
    W2[ix + ix] = ((mp_word) a->dp[ix]) * ((mp_word) a->dp[ix]);
@ -95,7 +95,7 @@ fast_s_mp_sqr (mp_int * a, mp_int * b)

      /* inner products */
      for (iy = ix + 1; iy < pa; iy++) {
-	*_W++ += ((mp_word) tmpx) * ((mp_word) * tmpy++);
+          *_W++ += ((mp_word) tmpx) * ((mp_word) * tmpy++);
      }
    }
  }
--- a/bn_mp_add.c
+++ b/bn_mp_add.c
@ -24,33 +24,25 @@ mp_add (mp_int * a, mp_int * b, mp_int * c)
  sa = a->sign;
  sb = b->sign;

-  /* handle four cases */
-  if (sa == MP_ZPOS && sb == MP_ZPOS) {
-    /* both positive */
+  /* handle two cases, not four */
+  if (sa == sb) {
+    /* both positive or both negative */
+    /* add their magnitudes, copy the sign */
+    c->sign = sa;
    res = s_mp_add (a, b, c);
-    c->sign = MP_ZPOS;
-  } else if (sa == MP_ZPOS && sb == MP_NEG) {
-    /* a + -b == a - b, but if b>a then we do it as -(b-a) */
-    if (mp_cmp_mag (a, b) == MP_LT) {
-      res = s_mp_sub (b, a, c);
-      c->sign = MP_NEG;
-    } else {
-      res = s_mp_sub (a, b, c);
-      c->sign = MP_ZPOS;
-    }
-  } else if (sa == MP_NEG && sb == MP_ZPOS) {
-    /* -a + b == b - a, but if a>b then we do it as -(a-b) */
-    if (mp_cmp_mag (a, b) == MP_GT) {
-      res = s_mp_sub (a, b, c);
-      c->sign = MP_NEG;
-    } else {
-      res = s_mp_sub (b, a, c);
-      c->sign = MP_ZPOS;
-    }
  } else {
-    /* -a + -b == -(a + b) */
-    res = s_mp_add (a, b, c);
-    c->sign = MP_NEG;
+    /* one positive, the other negative */
+    /* subtract the one with the lesser magnitude from */
+    /* the one of the greater magnitude.  The result gets */
+    /* the sign of the one with the greater magnitude. */
+    if (mp_cmp_mag (a, b) == MP_LT) {
+      c->sign = sb;
+      res = s_mp_sub (b, a, c);
+    } else {
+      c->sign = sa;
+      res = s_mp_sub (a, b, c);
+    }
  }
  return res;
 }
+
--- a/bn_mp_cmp.c
+++ b/bn_mp_cmp.c
@ -21,8 +21,17 @@ mp_cmp (mp_int * a, mp_int * b)
  /* compare based on sign */
  if (a->sign == MP_NEG && b->sign == MP_ZPOS) {
    return MP_LT;
-  } else if (a->sign == MP_ZPOS && b->sign == MP_NEG) {
+  } 
+  
+  if (a->sign == MP_ZPOS && b->sign == MP_NEG) {
    return MP_GT;
  }
-  return mp_cmp_mag (a, b);
+  
+  /* compare digits */
+  if (a->sign == MP_NEG) {
+     /* if negative compare opposite direction */
+     return mp_cmp_mag(b, a);
+  } else {
+     return mp_cmp_mag(a, b);
+  }
 }
--- a/bn_mp_cmp_mag.c
+++ b/bn_mp_cmp_mag.c
@ -23,7 +23,9 @@ mp_cmp_mag (mp_int * a, mp_int * b)
  /* compare based on # of non-zero digits */
  if (a->used > b->used) {
    return MP_GT;
-  } else if (a->used < b->used) {
+  } 
+  
+  if (a->used < b->used) {
    return MP_LT;
  }

@ -31,7 +33,9 @@ mp_cmp_mag (mp_int * a, mp_int * b)
  for (n = a->used - 1; n >= 0; n--) {
    if (a->dp[n] > b->dp[n]) {
      return MP_GT;
-    } else if (a->dp[n] < b->dp[n]) {
+    } 
+    
+    if (a->dp[n] < b->dp[n]) {
      return MP_LT;
    }
  }
--- a/bn_mp_copy.c
+++ b/bn_mp_copy.c
@ -31,13 +31,10 @@ mp_copy (mp_int * a, mp_int * b)
  }

  /* zero b and copy the parameters over */
-  b->used = a->used;
-  b->sign = a->sign;
-
  {
    register mp_digit *tmpa, *tmpb;

-    /* point aliases */
+    /* pointer aliases */
    tmpa = a->dp;
    tmpb = b->dp;

@ -47,9 +44,11 @@ mp_copy (mp_int * a, mp_int * b)
    }

    /* clear high digits */
-    for (; n < b->alloc; n++) {
+    for (; n < b->used; n++) {
      *tmpb++ = 0;
    }
  }
+  b->used = a->used;
+  b->sign = a->sign;
  return MP_OKAY;
 }
--- a/bn_mp_div.c
+++ b/bn_mp_div.c
@ -75,7 +75,7 @@ mp_div (mp_int * a, mp_int * b, mp_int * c, mp_int * d)

  /* normalize both x and y, ensure that y >= b/2, [b == 2^DIGIT_BIT] */
  norm = mp_count_bits(&y) % DIGIT_BIT;
-  if (norm < (DIGIT_BIT-1)) {
+  if (norm < (int)(DIGIT_BIT-1)) {
     norm = (DIGIT_BIT-1) - norm;
     if ((res = mp_mul_2d (&x, norm, &x)) != MP_OKAY) {
       goto __Y;
@ -86,13 +86,13 @@ mp_div (mp_int * a, mp_int * b, mp_int * c, mp_int * d)
  } else {
     norm = 0;
  }
-     
+
  /* note hac does 0 based, so if used==5 then its 0,1,2,3,4, e.g. use 4 */
  n = x.used - 1;
  t = y.used - 1;

  /* step 2. while (x >= y*b^n-t) do { q[n-t] += 1; x -= y*b^{n-t} } */
-  if ((res = mp_lshd (&y, n - t)) != MP_OKAY) {	/* y = y*b^{n-t} */
+  if ((res = mp_lshd (&y, n - t)) != MP_OKAY) { /* y = y*b^{n-t} */
    goto __Y;
  }

@ -113,14 +113,14 @@ mp_div (mp_int * a, mp_int * b, mp_int * c, mp_int * d)

    /* step 3.1 if xi == yt then set q{i-t-1} to b-1, otherwise set q{i-t-1} to (xi*b + x{i-1})/yt */
    if (x.dp[i] == y.dp[t]) {
-      q.dp[i - t - 1] = ((1UL << DIGIT_BIT) - 1UL);
+      q.dp[i - t - 1] = ((((mp_digit)1) << DIGIT_BIT) - 1);
    } else {
      mp_word tmp;
      tmp = ((mp_word) x.dp[i]) << ((mp_word) DIGIT_BIT);
      tmp |= ((mp_word) x.dp[i - 1]);
      tmp /= ((mp_word) y.dp[t]);
      if (tmp > (mp_word) MP_MASK)
-	tmp = MP_MASK;
+        tmp = MP_MASK;
      q.dp[i - t - 1] = (mp_digit) (tmp & (mp_word) (MP_MASK));
    }

@ -135,7 +135,7 @@ mp_div (mp_int * a, mp_int * b, mp_int * c, mp_int * d)
      t1.dp[1] = y.dp[t];
      t1.used = 2;
      if ((res = mp_mul_d (&t1, q.dp[i - t - 1], &t1)) != MP_OKAY) {
-	goto __Y;
+        goto __Y;
      }

      /* find right hand */
@ -143,7 +143,7 @@ mp_div (mp_int * a, mp_int * b, mp_int * c, mp_int * d)
      t2.dp[1] = (i - 1 < 0) ? 0 : x.dp[i - 1];
      t2.dp[2] = x.dp[i];
      t2.used = 3;
-    } while (mp_cmp (&t1, &t2) == MP_GT);
+    } while (mp_cmp_mag(&t1, &t2) == MP_GT);

    /* step 3.3 x = x - q{i-t-1} * y * b^{i-t-1} */
    if ((res = mp_mul_d (&y, q.dp[i - t - 1], &t1)) != MP_OKAY) {
@ -161,19 +161,19 @@ mp_div (mp_int * a, mp_int * b, mp_int * c, mp_int * d)
    /* step 3.4 if x < 0 then { x = x + y*b^{i-t-1}; q{i-t-1} -= 1; } */
    if (x.sign == MP_NEG) {
      if ((res = mp_copy (&y, &t1)) != MP_OKAY) {
-	goto __Y;
+        goto __Y;
      }
      if ((res = mp_lshd (&t1, i - t - 1)) != MP_OKAY) {
-	goto __Y;
+        goto __Y;
      }
      if ((res = mp_add (&x, &t1, &x)) != MP_OKAY) {
-	goto __Y;
+        goto __Y;
      }

      q.dp[i - t - 1] = (q.dp[i - t - 1] - 1UL) & MP_MASK;
    }
  }
-  
+
  /* now q is the quotient and x is the remainder [which we have to normalize] */
  /* get sign before writing to c */
  x.sign = a->sign;
--- a/bn_mp_div_2.c
+++ b/bn_mp_div_2.c
@ -34,19 +34,19 @@ mp_div_2 (mp_int * a, mp_int * b)

    /* source alias */
    tmpa = a->dp + b->used - 1;
-    
+
    /* dest alias */
    tmpb = b->dp + b->used - 1;
-    
+
    /* carry */
    r = 0;
    for (x = b->used - 1; x >= 0; x--) {
      /* get the carry for the next iteration */
      rr = *tmpa & 1;
-      
+
      /* shift the current digit, add in carry and store */
      *tmpb-- = (*tmpa-- >> 1) | (r << (DIGIT_BIT - 1));
-      
+
      /* forward carry to next iteration */
      r = rr;
    }
--- a/bn_mp_div_2d.c
+++ b/bn_mp_div_2d.c
@ -51,7 +51,7 @@ mp_div_2d (mp_int * a, int b, mp_int * c, mp_int * d)
  }

  /* shift by as many digits in the bit count */
-  if (b >= DIGIT_BIT) {
+  if (b >= (int)DIGIT_BIT) {
    mp_rshd (c, b / DIGIT_BIT);
  }

@ -59,13 +59,13 @@ mp_div_2d (mp_int * a, int b, mp_int * c, mp_int * d)
  D = (mp_digit) (b % DIGIT_BIT);
  if (D != 0) {
    register mp_digit *tmpc, mask;
-    
+
    /* mask */
-    mask = (1U << D) - 1U;
-    
+    mask = (((mp_digit)1) << D) - 1;
+
    /* alias */
    tmpc = c->dp + (c->used - 1);
-    
+
    /* carry */
    r = 0;
    for (x = c->used - 1; x >= 0; x--) {
--- a/bn_mp_dr_is_modulus.c
+++ b/bn_mp_dr_is_modulus.c
@ -0,0 +1,34 @@
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is library that provides for multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library is designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+#include <tommath.h>
+
+/* determines if a number is a valid DR modulus */
+int mp_dr_is_modulus(mp_int *a)
+{
+   int ix;
+
+   /* must be at least two digits */
+   if (a->used < 2) {
+      return 0;
+   }
+
+   for (ix = 1; ix < a->used; ix++) {
+       if (a->dp[ix] != MP_MASK) {
+          return 0;
+       }
+   }
+   return 1;
+}
+
--- a/bn_mp_dr_reduce.c
+++ b/bn_mp_dr_reduce.c
@ -16,7 +16,7 @@

 /* reduce "a" in place modulo "b" using the Diminished Radix algorithm.
 *
- * Based on algorithm from the paper 
+ * Based on algorithm from the paper
 *
 * "Generating Efficient Primes for Discrete Log Cryptosystems"
 *                 Chae Hoon Lim, Pil Loong Lee,
@ -40,15 +40,15 @@ mp_dr_reduce (mp_int * a, mp_int * b, mp_digit mp)
      return err;
    }
  }
- 
+
  /* alias for a->dp[i] */
  tmpi = a->dp + k + k - 1;

-  /* for (i = 2k - 1; i >= k; i = i - 1) 
+  /* for (i = 2k - 1; i >= k; i = i - 1)
   *
   * This is the main loop of the reduction.  Note that at the end
   * the words above position k are not zeroed as expected.  The end
-   * result is that the digits from 0 to k-1 are the residue.  So 
+   * result is that the digits from 0 to k-1 are the residue.  So
   * we have to clear those afterwards.
   */
  for (i = k + k - 1; i >= k; i = i - 1) {
@ -57,10 +57,10 @@ mp_dr_reduce (mp_int * a, mp_int * b, mp_digit mp)
    /* x[i] * mp */
    r = ((mp_word) *tmpi--) * ((mp_word) mp);

-    /* now add r to x[i-1:i-k] 
+    /* now add r to x[i-1:i-k]
     *
     * First add it to the first digit x[i-k] then form the carry
-     * then enter the main loop 
+     * then enter the main loop
     */
    j = i - k;

@ -74,14 +74,14 @@ mp_dr_reduce (mp_int * a, mp_int * b, mp_digit mp)
    mu = (r >> ((mp_word) DIGIT_BIT)) + (*tmpj >> DIGIT_BIT);

    /* clear carry from a->dp[j]  */
-    *tmpj++ &= MP_MASK; 
+    *tmpj++ &= MP_MASK;

-    /* now add rest of the digits 
-     * 
+    /* now add rest of the digits
+     *
     * Note this is basically a simple single digit addition to
     * a larger multiple digit number.  This is optimized somewhat
     * because the propagation of carries is not likely to move
-     * more than a few digits. 
+     * more than a few digits.
     *
     */
    for (++j; mu != 0 && j <= (i - 1); ++j) {
@ -99,16 +99,16 @@ mp_dr_reduce (mp_int * a, mp_int * b, mp_digit mp)
      *tmpj += mp;
      mu = *tmpj >> DIGIT_BIT;
      *tmpj++ &= MP_MASK;
-      
+
      /* now handle carries */
      for (++j; mu != 0 && j <= (i - 1); j++) {
-	*tmpj   += mu;
-	mu       = *tmpj >> DIGIT_BIT;
-	*tmpj++ &= MP_MASK;
+          *tmpj   += mu;
+          mu       = *tmpj >> DIGIT_BIT;
+          *tmpj++ &= MP_MASK;
      }
    }
  }
-  
+
  /* zero words above k */
  tmpi = a->dp + k;
  for (i = k; i < a->used; i++) {
@ -117,34 +117,13 @@ mp_dr_reduce (mp_int * a, mp_int * b, mp_digit mp)

  /* clamp, sub and return */
  mp_clamp (a);
-  
+
+  /* if a >= b [b == modulus] then subtract the modulus to fix up */
  if (mp_cmp_mag (a, b) != MP_LT) {
    return s_mp_sub (a, b, a);
  }
  return MP_OKAY;
 }

-/* determines if a number is a valid DR modulus */
-int mp_dr_is_modulus(mp_int *a)
-{
-   int ix;
-   
-   /* must be at least two digits */
-   if (a->used < 2) {
-      return 0;
-   }      
-   
-   for (ix = 1; ix < a->used; ix++) {
-       if (a->dp[ix] != MP_MASK) {
-          return 0;
-       }
-   }
-   return 1;
-}

-/* determines the setup value */
-void mp_dr_setup(mp_int *a, mp_digit *d)
-{
-   *d = (1 << DIGIT_BIT) - a->dp[0];
-}

--- a/bn_mp_dr_setup.c
+++ b/bn_mp_dr_setup.c
@ -0,0 +1,25 @@
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is library that provides for multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library is designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+#include <tommath.h>
+
+/* determines the setup value */
+void mp_dr_setup(mp_int *a, mp_digit *d)
+{
+   /* the casts are required if DIGIT_BIT is one less than
+    * the number of bits in a mp_digit [e.g. DIGIT_BIT==31]
+    */
+   *d = (mp_digit)((((mp_word)1) << ((mp_word)DIGIT_BIT)) - ((mp_word)a->dp[0]));
+}
+
--- a/bn_mp_expt_d.c
+++ b/bn_mp_expt_d.c
@ -35,11 +35,11 @@ mp_expt_d (mp_int * a, mp_digit b, mp_int * c)
      return res;
    }

-    /* if the bit is set multiply */    
-    if ((b & (mp_digit) (1 << (DIGIT_BIT - 1))) != 0) {
+    /* if the bit is set multiply */
+    if ((b & (mp_digit) (((mp_digit)1) << (DIGIT_BIT - 1))) != 0) {
      if ((res = mp_mul (c, &g, c)) != MP_OKAY) {
-	mp_clear (&g);
-	return res;
+         mp_clear (&g);
+         return res;
      }
    }

--- a/bn_mp_exptmod.c
+++ b/bn_mp_exptmod.c
@ -17,7 +17,7 @@
 static int f_mp_exptmod (mp_int * G, mp_int * X, mp_int * P, mp_int * Y);

 /* this is a shell function that calls either the normal or Montgomery
- * exptmod functions.  Originally the call to the montgomery code was 
+ * exptmod functions.  Originally the call to the montgomery code was
 * embedded in the normal function but that wasted alot of stack space
 * for nothing (since 99% of the time the Montgomery code would be called)
 */
@ -25,10 +25,46 @@ int
 mp_exptmod (mp_int * G, mp_int * X, mp_int * P, mp_int * Y)
 {
  int dr;
-  
+
+  /* modulus P must be positive */
+  if (P->sign == MP_NEG) {
+     return MP_VAL;
+  }
+
+  /* if exponent X is negative we have to recurse */
+  if (X->sign == MP_NEG) {
+     mp_int tmpG, tmpX;
+     int err;
+
+     /* first compute 1/G mod P */
+     if ((err = mp_init(&tmpG)) != MP_OKAY) {
+        return err;
+     }
+     if ((err = mp_invmod(G, P, &tmpG)) != MP_OKAY) {
+        mp_clear(&tmpG);
+        return err;
+     }
+
+     /* now get |X| */
+     if ((err = mp_init(&tmpX)) != MP_OKAY) {
+        mp_clear(&tmpG);
+        return err;
+     }
+     if ((err = mp_abs(X, &tmpX)) != MP_OKAY) {
+        mp_clear_multi(&tmpG, &tmpX, NULL);
+        return err;
+     }
+
+     /* and now compute (1/G)^|X| instead of G^X [X < 0] */
+     err = mp_exptmod(&tmpG, &tmpX, P, Y);
+     mp_clear_multi(&tmpG, &tmpX, NULL);
+     return err;
+  }
+
+
  dr = mp_dr_is_modulus(P);
  /* if the modulus is odd use the fast method */
-  if (((mp_isodd (P) == 1 && P->used < MONTGOMERY_EXPT_CUTOFF) || dr == 1) && P->used > 4) {
+  if ((mp_isodd (P) == 1 || dr == 1) && P->used > 4) {
    return mp_exptmod_fast (G, X, P, Y, dr);
  } else {
    return f_mp_exptmod (G, X, P, Y);
@ -60,11 +96,17 @@ f_mp_exptmod (mp_int * G, mp_int * X, mp_int * P, mp_int * Y)
    winsize = 8;
  }

+#ifdef MP_LOW_MEM
+    if (winsize > 5) {
+       winsize = 5;
+    }
+#endif
+
  /* init G array */
  for (x = 0; x < (1 << winsize); x++) {
    if ((err = mp_init_size (&M[x], 1)) != MP_OKAY) {
      for (y = 0; y < x; y++) {
-	mp_clear (&M[y]);
+        mp_clear (&M[y]);
      }
      return err;
    }
@ -78,7 +120,7 @@ f_mp_exptmod (mp_int * G, mp_int * X, mp_int * P, mp_int * Y)
    goto __MU;
  }

-  /* create M table 
+  /* create M table
   *
   * The M table contains powers of the input base, e.g. M[x] = G^x mod P
   *
@ -119,30 +161,29 @@ f_mp_exptmod (mp_int * G, mp_int * X, mp_int * P, mp_int * Y)
  mp_set (&res, 1);

  /* set initial mode and bit cnt */
-  mode = 0;
-  bitcnt = 0;
-  buf = 0;
+  mode   = 0;
+  bitcnt = 1;
+  buf    = 0;
  digidx = X->used - 1;
  bitcpy = bitbuf = 0;

-  bitcnt = 1;
  for (;;) {
    /* grab next digit as required */
    if (--bitcnt == 0) {
      if (digidx == -1) {
-	break;
+        break;
      }
      buf = X->dp[digidx--];
      bitcnt = (int) DIGIT_BIT;
    }

    /* grab the next msb from the exponent */
-    y = (buf >> (DIGIT_BIT - 1)) & 1;
-    buf <<= 1;
+    y = (buf >> (mp_digit)(DIGIT_BIT - 1)) & 1;
+    buf <<= (mp_digit)1;

-    /* if the bit is zero and mode == 0 then we ignore it 
+    /* if the bit is zero and mode == 0 then we ignore it
     * These represent the leading zero bits before the first 1 bit
-     * in the exponent.  Technically this opt is not required but it 
+     * in the exponent.  Technically this opt is not required but it
     * does lower the # of trivial squaring/reductions used
     */
    if (mode == 0 && y == 0)
@ -151,10 +192,10 @@ f_mp_exptmod (mp_int * G, mp_int * X, mp_int * P, mp_int * Y)
    /* if the bit is zero and mode == 1 then we square */
    if (mode == 1 && y == 0) {
      if ((err = mp_sqr (&res, &res)) != MP_OKAY) {
-	goto __RES;
+        goto __RES;
      }
      if ((err = mp_reduce (&res, P, &mu)) != MP_OKAY) {
-	goto __RES;
+        goto __RES;
      }
      continue;
    }
@ -167,20 +208,20 @@ f_mp_exptmod (mp_int * G, mp_int * X, mp_int * P, mp_int * Y)
      /* ok window is filled so square as required and multiply  */
      /* square first */
      for (x = 0; x < winsize; x++) {
-	if ((err = mp_sqr (&res, &res)) != MP_OKAY) {
-	  goto __RES;
-	}
-	if ((err = mp_reduce (&res, P, &mu)) != MP_OKAY) {
-	  goto __RES;
-	}
+        if ((err = mp_sqr (&res, &res)) != MP_OKAY) {
+          goto __RES;
+        }
+        if ((err = mp_reduce (&res, P, &mu)) != MP_OKAY) {
+          goto __RES;
+        }
      }

      /* then multiply */
      if ((err = mp_mul (&res, &M[bitbuf], &res)) != MP_OKAY) {
-	goto __MU;
+        goto __MU;
      }
      if ((err = mp_reduce (&res, P, &mu)) != MP_OKAY) {
-	goto __MU;
+        goto __MU;
      }

      /* empty window and reset */
@ -194,21 +235,21 @@ f_mp_exptmod (mp_int * G, mp_int * X, mp_int * P, mp_int * Y)
    /* square then multiply if the bit is set */
    for (x = 0; x < bitcpy; x++) {
      if ((err = mp_sqr (&res, &res)) != MP_OKAY) {
-	goto __RES;
+        goto __RES;
      }
      if ((err = mp_reduce (&res, P, &mu)) != MP_OKAY) {
-	goto __RES;
+        goto __RES;
      }

      bitbuf <<= 1;
      if ((bitbuf & (1 << winsize)) != 0) {
-	/* then multiply */
-	if ((err = mp_mul (&res, &M[1], &res)) != MP_OKAY) {
-	  goto __RES;
-	}
-	if ((err = mp_reduce (&res, P, &mu)) != MP_OKAY) {
-	  goto __RES;
-	}
+        /* then multiply */
+        if ((err = mp_mul (&res, &M[1], &res)) != MP_OKAY) {
+          goto __RES;
+        }
+        if ((err = mp_reduce (&res, P, &mu)) != MP_OKAY) {
+          goto __RES;
+        }
      }
    }
  }
--- a/bn_mp_exptmod_fast.c
+++ b/bn_mp_exptmod_fast.c
@ -19,7 +19,7 @@
 * Uses a left-to-right k-ary sliding window to compute the modular exponentiation.
 * The value of k changes based on the size of the exponent.
 *
- * Uses Montgomery or Diminished Radix reduction [whichever appropriate] 
+ * Uses Montgomery or Diminished Radix reduction [whichever appropriate]
 */
 int
 mp_exptmod_fast (mp_int * G, mp_int * X, mp_int * P, mp_int * Y, int redmode)
@ -28,7 +28,7 @@ mp_exptmod_fast (mp_int * G, mp_int * X, mp_int * P, mp_int * Y, int redmode)
  mp_digit buf, mp;
  int     err, bitbuf, bitcpy, bitcnt, mode, digidx, x, y, winsize;
  int     (*redux)(mp_int*,mp_int*,mp_digit);
-  
+
  /* find window size */
  x = mp_count_bits (X);
  if (x <= 7) {
@ -47,22 +47,37 @@ mp_exptmod_fast (mp_int * G, mp_int * X, mp_int * P, mp_int * Y, int redmode)
    winsize = 8;
  }

+#ifdef MP_LOW_MEM
+  if (winsize > 5) {
+     winsize = 5;
+  }
+#endif
+
+
  /* init G array */
  for (x = 0; x < (1 << winsize); x++) {
    if ((err = mp_init (&M[x])) != MP_OKAY) {
      for (y = 0; y < x; y++) {
-	mp_clear (&M[y]);
+        mp_clear (&M[y]);
      }
      return err;
    }
  }
-  
+
  if (redmode == 0) {
     /* now setup montgomery  */
     if ((err = mp_montgomery_setup (P, &mp)) != MP_OKAY) {
        goto __M;
     }
-     redux = mp_montgomery_reduce;
+     
+     /* automatically pick the comba one if available (saves quite a few calls/ifs) */
+     if ( ((P->used * 2 + 1) < MP_WARRAY) &&
+          P->used < (1 << ((CHAR_BIT * sizeof (mp_word)) - (2 * DIGIT_BIT)))) {
+        redux = fast_mp_montgomery_reduce;
+     } else {
+        /* use slower baseline method */
+        redux = mp_montgomery_reduce;
+     }
  } else {
     /* setup DR reduction */
     mp_dr_setup(P, &mp);
@ -97,7 +112,7 @@ mp_exptmod_fast (mp_int * G, mp_int * X, mp_int * P, mp_int * Y, int redmode)
        goto __RES;
     }
  }
-  
+
  /* compute the value at M[1<<(winsize-1)] by squaring M[1] (winsize-1) times */
  if ((err = mp_copy (&M[1], &M[1 << (winsize - 1)])) != MP_OKAY) {
    goto __RES;
@ -123,42 +138,42 @@ mp_exptmod_fast (mp_int * G, mp_int * X, mp_int * P, mp_int * Y, int redmode)
  }

  /* set initial mode and bit cnt */
-  mode = 0;
-  bitcnt = 0;
-  buf = 0;
+  mode   = 0;
+  bitcnt = 1;
+  buf    = 0;
  digidx = X->used - 1;
  bitcpy = bitbuf = 0;

-  bitcnt = 1;
  for (;;) {
    /* grab next digit as required */
    if (--bitcnt == 0) {
      if (digidx == -1) {
-	break;
+        break;
      }
      buf = X->dp[digidx--];
      bitcnt = (int) DIGIT_BIT;
    }

    /* grab the next msb from the exponent */
-    y = (buf >> (DIGIT_BIT - 1)) & 1;
-    buf <<= 1;
+    y = (mp_digit)(buf >> (DIGIT_BIT - 1)) & 1;
+    buf <<= (mp_digit)1;

    /* if the bit is zero and mode == 0 then we ignore it
     * These represent the leading zero bits before the first 1 bit
     * in the exponent.  Technically this opt is not required but it
     * does lower the # of trivial squaring/reductions used
     */
-    if (mode == 0 && y == 0)
+    if (mode == 0 && y == 0) {
      continue;
+    }

    /* if the bit is zero and mode == 1 then we square */
    if (mode == 1 && y == 0) {
      if ((err = mp_sqr (&res, &res)) != MP_OKAY) {
-	goto __RES;
+        goto __RES;
      }
      if ((err = redux (&res, P, mp)) != MP_OKAY) {
-	goto __RES;
+        goto __RES;
      }
      continue;
    }
@ -171,20 +186,20 @@ mp_exptmod_fast (mp_int * G, mp_int * X, mp_int * P, mp_int * Y, int redmode)
      /* ok window is filled so square as required and multiply  */
      /* square first */
      for (x = 0; x < winsize; x++) {
-	if ((err = mp_sqr (&res, &res)) != MP_OKAY) {
-	  goto __RES;
-	}
-	if ((err = redux (&res, P, mp)) != MP_OKAY) {
-	  goto __RES;
-	}
+        if ((err = mp_sqr (&res, &res)) != MP_OKAY) {
+          goto __RES;
+        }
+        if ((err = redux (&res, P, mp)) != MP_OKAY) {
+          goto __RES;
+        }
      }

      /* then multiply */
      if ((err = mp_mul (&res, &M[bitbuf], &res)) != MP_OKAY) {
-	goto __RES;
+        goto __RES;
      }
      if ((err = redux (&res, P, mp)) != MP_OKAY) {
-	goto __RES;
+        goto __RES;
      }

      /* empty window and reset */
@ -198,21 +213,21 @@ mp_exptmod_fast (mp_int * G, mp_int * X, mp_int * P, mp_int * Y, int redmode)
    /* square then multiply if the bit is set */
    for (x = 0; x < bitcpy; x++) {
      if ((err = mp_sqr (&res, &res)) != MP_OKAY) {
-	goto __RES;
+        goto __RES;
      }
      if ((err = redux (&res, P, mp)) != MP_OKAY) {
-	goto __RES;
+        goto __RES;
      }

      bitbuf <<= 1;
      if ((bitbuf & (1 << winsize)) != 0) {
-	/* then multiply */
-	if ((err = mp_mul (&res, &M[1], &res)) != MP_OKAY) {
-	  goto __RES;
-	}
-	if ((err = redux (&res, P, mp)) != MP_OKAY) {
-	  goto __RES;
-	}
+        /* then multiply */
+        if ((err = mp_mul (&res, &M[1], &res)) != MP_OKAY) {
+          goto __RES;
+        }
+        if ((err = redux (&res, P, mp)) != MP_OKAY) {
+          goto __RES;
+        }
      }
    }
  }
@ -222,7 +237,7 @@ mp_exptmod_fast (mp_int * G, mp_int * X, mp_int * P, mp_int * Y, int redmode)
     if ((err = mp_montgomery_reduce (&res, P, mp)) != MP_OKAY) {
       goto __RES;
     }
-  }     
+  }

  mp_exch (&res, Y);
  err = MP_OKAY;
--- a/bn_mp_gcd.c
+++ b/bn_mp_gcd.c
@ -82,18 +82,18 @@ mp_gcd (mp_int * a, mp_int * b, mp_int * c)
    /* B3 (and B4).  Halve t, if even */
    while (t.used != 0 && mp_iseven(&t) == 1) {
      if ((res = mp_div_2 (&t, &t)) != MP_OKAY) {
-	goto __T;
+        goto __T;
      }
    }

    /* B5.  if t>0 then u=t otherwise v=-t */
    if (t.used != 0 && t.sign != MP_NEG) {
      if ((res = mp_copy (&t, &u)) != MP_OKAY) {
-	goto __T;
+        goto __T;
      }
    } else {
      if ((res = mp_copy (&t, &v)) != MP_OKAY) {
-	goto __T;
+        goto __T;
      }
      v.sign = (v.sign == MP_ZPOS) ? MP_NEG : MP_ZPOS;
    }
@ -102,9 +102,9 @@ mp_gcd (mp_int * a, mp_int * b, mp_int * c)
    if ((res = mp_sub (&u, &v, &t)) != MP_OKAY) {
      goto __T;
    }
-  }
-  while (t.used != 0);
+  } while (mp_iszero(&t) == 0);

+  /* multiply by 2^k which we divided out at the beginning */ 
  if ((res = mp_mul_2d (&u, k, &u)) != MP_OKAY) {
    goto __T;
  }
--- a/bn_mp_grow.c
+++ b/bn_mp_grow.c
@ -18,12 +18,12 @@
 int
 mp_grow (mp_int * a, int size)
 {
-  int     i, n;
+  int     i;

  /* if the alloc size is smaller alloc more ram */
  if (a->alloc < size) {
    /* ensure there are always at least MP_PREC digits extra on top */
-    size += (MP_PREC * 2) - (size & (MP_PREC - 1));	
+    size += (MP_PREC * 2) - (size & (MP_PREC - 1));     

    a->dp = OPT_CAST realloc (a->dp, sizeof (mp_digit) * size);
    if (a->dp == NULL) {
@ -31,9 +31,9 @@ mp_grow (mp_int * a, int size)
    }

    /* zero excess digits */
-    n = a->alloc;
+    i        = a->alloc;
    a->alloc = size;
-    for (i = n; i < a->alloc; i++) {
+    for (; i < a->alloc; i++) {
      a->dp[i] = 0;
    }
  }
--- a/bn_mp_init.c
+++ b/bn_mp_init.c
@ -18,7 +18,6 @@
 int
 mp_init (mp_int * a)
 {
-
  /* allocate ram required and clear it */
  a->dp = OPT_CAST calloc (sizeof (mp_digit), MP_PREC);
  if (a->dp == NULL) {
--- a/bn_mp_invmod.c
+++ b/bn_mp_invmod.c
@ -29,63 +29,36 @@ mp_invmod (mp_int * a, mp_int * b, mp_int * c)
  if (mp_iseven (b) == 0) {
    return fast_mp_invmod (a, b, c);
  }
-
-  if ((res = mp_init (&x)) != MP_OKAY) {
-    goto __ERR;
-  }
-
-  if ((res = mp_init (&y)) != MP_OKAY) {
-    goto __X;
-  }
-
-  if ((res = mp_init (&u)) != MP_OKAY) {
-    goto __Y;
-  }
-
-  if ((res = mp_init (&v)) != MP_OKAY) {
-    goto __U;
-  }
-
-  if ((res = mp_init (&A)) != MP_OKAY) {
-    goto __V;
-  }
-
-  if ((res = mp_init (&B)) != MP_OKAY) {
-    goto __A;
-  }
-
-  if ((res = mp_init (&C)) != MP_OKAY) {
-    goto __B;
-  }
-
-  if ((res = mp_init (&D)) != MP_OKAY) {
-    goto __C;
+  
+  /* init temps */
+  if ((res = mp_init_multi(&x, &y, &u, &v, &A, &B, &C, &D, NULL)) != MP_OKAY) {
+     return res;
  }

  /* x = a, y = b */
  if ((res = mp_copy (a, &x)) != MP_OKAY) {
-    goto __D;
+    goto __ERR;
  }
  if ((res = mp_copy (b, &y)) != MP_OKAY) {
-    goto __D;
+    goto __ERR;
  }

  if ((res = mp_abs (&x, &x)) != MP_OKAY) {
-    goto __D;
+    goto __ERR;
  }

  /* 2. [modified] if x,y are both even then return an error! */
  if (mp_iseven (&x) == 1 && mp_iseven (&y) == 1) {
    res = MP_VAL;
-    goto __D;
+    goto __ERR;
  }

  /* 3. u=x, v=y, A=1, B=0, C=0,D=1 */
  if ((res = mp_copy (&x, &u)) != MP_OKAY) {
-    goto __D;
+    goto __ERR;
  }
  if ((res = mp_copy (&y, &v)) != MP_OKAY) {
-    goto __D;
+    goto __ERR;
  }
  mp_set (&A, 1);
  mp_set (&D, 1);
@ -96,24 +69,24 @@ top:
  while (mp_iseven (&u) == 1) {
    /* 4.1 u = u/2 */
    if ((res = mp_div_2 (&u, &u)) != MP_OKAY) {
-      goto __D;
+      goto __ERR;
    }
    /* 4.2 if A or B is odd then */
    if (mp_iseven (&A) == 0 || mp_iseven (&B) == 0) {
      /* A = (A+y)/2, B = (B-x)/2 */
      if ((res = mp_add (&A, &y, &A)) != MP_OKAY) {
-	goto __D;
+	goto __ERR;
      }
      if ((res = mp_sub (&B, &x, &B)) != MP_OKAY) {
-	goto __D;
+	goto __ERR;
      }
    }
    /* A = A/2, B = B/2 */
    if ((res = mp_div_2 (&A, &A)) != MP_OKAY) {
-      goto __D;
+      goto __ERR;
    }
    if ((res = mp_div_2 (&B, &B)) != MP_OKAY) {
-      goto __D;
+      goto __ERR;
    }
  }

@ -122,24 +95,24 @@ top:
  while (mp_iseven (&v) == 1) {
    /* 5.1 v = v/2 */
    if ((res = mp_div_2 (&v, &v)) != MP_OKAY) {
-      goto __D;
+      goto __ERR;
    }
    /* 5.2 if C,D are even then */
    if (mp_iseven (&C) == 0 || mp_iseven (&D) == 0) {
      /* C = (C+y)/2, D = (D-x)/2 */
      if ((res = mp_add (&C, &y, &C)) != MP_OKAY) {
-	goto __D;
+	goto __ERR;
      }
      if ((res = mp_sub (&D, &x, &D)) != MP_OKAY) {
-	goto __D;
+	goto __ERR;
      }
    }
    /* C = C/2, D = D/2 */
    if ((res = mp_div_2 (&C, &C)) != MP_OKAY) {
-      goto __D;
+      goto __ERR;
    }
    if ((res = mp_div_2 (&D, &D)) != MP_OKAY) {
-      goto __D;
+      goto __ERR;
    }
  }

@ -147,28 +120,28 @@ top:
  if (mp_cmp (&u, &v) != MP_LT) {
    /* u = u - v, A = A - C, B = B - D */
    if ((res = mp_sub (&u, &v, &u)) != MP_OKAY) {
-      goto __D;
+      goto __ERR;
    }

    if ((res = mp_sub (&A, &C, &A)) != MP_OKAY) {
-      goto __D;
+      goto __ERR;
    }

    if ((res = mp_sub (&B, &D, &B)) != MP_OKAY) {
-      goto __D;
+      goto __ERR;
    }
  } else {
    /* v - v - u, C = C - A, D = D - B */
    if ((res = mp_sub (&v, &u, &v)) != MP_OKAY) {
-      goto __D;
+      goto __ERR;
    }

    if ((res = mp_sub (&C, &A, &C)) != MP_OKAY) {
-      goto __D;
+      goto __ERR;
    }

    if ((res = mp_sub (&D, &B, &D)) != MP_OKAY) {
-      goto __D;
+      goto __ERR;
    }
  }

@ -181,21 +154,13 @@ top:
  /* if v != 1 then there is no inverse */
  if (mp_cmp_d (&v, 1) != MP_EQ) {
    res = MP_VAL;
-    goto __D;
+    goto __ERR;
  }

  /* a is now the inverse */
  mp_exch (&C, c);
  res = MP_OKAY;

-__D:mp_clear (&D);
-__C:mp_clear (&C);
-__B:mp_clear (&B);
-__A:mp_clear (&A);
-__V:mp_clear (&v);
-__U:mp_clear (&u);
-__Y:mp_clear (&y);
-__X:mp_clear (&x);
-__ERR:
+__ERR:mp_clear_multi (&x, &y, &u, &v, &A, &B, &C, &D, NULL);
  return res;
 }
--- a/bn_mp_jacobi.c
+++ b/bn_mp_jacobi.c
@ -14,7 +14,7 @@
 */
 #include <tommath.h>

-/* computes the jacobi c = (a | n) (or Legendre if b is prime)
+/* computes the jacobi c = (a | n) (or Legendre if n is prime)
 * HAC pp. 73 Algorithm 2.149
 */
 int
--- a/bn_mp_karatsuba_mul.c
+++ b/bn_mp_karatsuba_mul.c
@ -36,7 +36,7 @@
 int
 mp_karatsuba_mul (mp_int * a, mp_int * b, mp_int * c)
 {
-  mp_int  x0, x1, y0, y1, t1, t2, x0y0, x1y1;
+  mp_int  x0, x1, y0, y1, t1, x0y0, x1y1;
  int     B, err;

  err = MP_MEM;
@ -60,10 +60,8 @@ mp_karatsuba_mul (mp_int * a, mp_int * b, mp_int * c)
  /* init temps */
  if (mp_init_size (&t1, B * 2) != MP_OKAY)
    goto Y1;
-  if (mp_init_size (&t2, B * 2) != MP_OKAY)
-    goto T1;
  if (mp_init_size (&x0y0, B * 2) != MP_OKAY)
-    goto T2;
+    goto T1;
  if (mp_init_size (&x1y1, B * 2) != MP_OKAY)
    goto X0Y0;

@ -110,41 +108,40 @@ mp_karatsuba_mul (mp_int * a, mp_int * b, mp_int * c)
  mp_clamp (&y0);

  /* now calc the products x0y0 and x1y1 */
-  if (mp_mul (&x0, &y0, &x0y0) != MP_OKAY)
-    goto X1Y1;			/* x0y0 = x0*y0 */
+  if (mp_mul (&x0, &y0, &x0y0) != MP_OKAY)  /* after this x0 is no longer required, free temp [x0==t2]! */
+    goto X1Y1;          /* x0y0 = x0*y0 */
  if (mp_mul (&x1, &y1, &x1y1) != MP_OKAY)
-    goto X1Y1;			/* x1y1 = x1*y1 */
+    goto X1Y1;          /* x1y1 = x1*y1 */

  /* now calc x1-x0 and y1-y0 */
  if (mp_sub (&x1, &x0, &t1) != MP_OKAY)
-    goto X1Y1;			/* t1 = x1 - x0 */
-  if (mp_sub (&y1, &y0, &t2) != MP_OKAY)
-    goto X1Y1;			/* t2 = y1 - y0 */
-  if (mp_mul (&t1, &t2, &t1) != MP_OKAY)
-    goto X1Y1;			/* t1 = (x1 - x0) * (y1 - y0) */
+    goto X1Y1;          /* t1 = x1 - x0 */
+  if (mp_sub (&y1, &y0, &x0) != MP_OKAY)
+    goto X1Y1;          /* t2 = y1 - y0 */
+  if (mp_mul (&t1, &x0, &t1) != MP_OKAY)
+    goto X1Y1;          /* t1 = (x1 - x0) * (y1 - y0) */

  /* add x0y0 */
-  if (mp_add (&x0y0, &x1y1, &t2) != MP_OKAY)
-    goto X1Y1;			/* t2 = x0y0 + x1y1 */
-  if (mp_sub (&t2, &t1, &t1) != MP_OKAY)
-    goto X1Y1;			/* t1 = x0y0 + x1y1 - (x1-x0)*(y1-y0) */
+  if (mp_add (&x0y0, &x1y1, &x0) != MP_OKAY)
+    goto X1Y1;          /* t2 = x0y0 + x1y1 */
+  if (mp_sub (&x0, &t1, &t1) != MP_OKAY)
+    goto X1Y1;          /* t1 = x0y0 + x1y1 - (x1-x0)*(y1-y0) */

  /* shift by B */
  if (mp_lshd (&t1, B) != MP_OKAY)
-    goto X1Y1;			/* t1 = (x0y0 + x1y1 - (x1-x0)*(y1-y0))<<B */
+    goto X1Y1;          /* t1 = (x0y0 + x1y1 - (x1-x0)*(y1-y0))<<B */
  if (mp_lshd (&x1y1, B * 2) != MP_OKAY)
-    goto X1Y1;			/* x1y1 = x1y1 << 2*B */
+    goto X1Y1;          /* x1y1 = x1y1 << 2*B */

  if (mp_add (&x0y0, &t1, &t1) != MP_OKAY)
-    goto X1Y1;			/* t1 = x0y0 + t1 */
+    goto X1Y1;          /* t1 = x0y0 + t1 */
  if (mp_add (&t1, &x1y1, c) != MP_OKAY)
-    goto X1Y1;			/* t1 = x0y0 + t1 + x1y1 */
+    goto X1Y1;          /* t1 = x0y0 + t1 + x1y1 */

  err = MP_OKAY;

 X1Y1:mp_clear (&x1y1);
 X0Y0:mp_clear (&x0y0);
-T2:mp_clear (&t2);
 T1:mp_clear (&t1);
 Y1:mp_clear (&y1);
 Y0:mp_clear (&y0);
--- a/bn_mp_karatsuba_sqr.c
+++ b/bn_mp_karatsuba_sqr.c
@ -74,32 +74,32 @@ mp_karatsuba_sqr (mp_int * a, mp_int * b)

  /* now calc the products x0*x0 and x1*x1 */
  if (mp_sqr (&x0, &x0x0) != MP_OKAY)
-    goto X1X1;			/* x0x0 = x0*x0 */
+    goto X1X1;                  /* x0x0 = x0*x0 */
  if (mp_sqr (&x1, &x1x1) != MP_OKAY)
-    goto X1X1;			/* x1x1 = x1*x1 */
+    goto X1X1;                  /* x1x1 = x1*x1 */

-  /* now calc x1-x0 and y1-y0 */
+  /* now calc (x1-x0)^2 */
  if (mp_sub (&x1, &x0, &t1) != MP_OKAY)
-    goto X1X1;			/* t1 = x1 - x0 */
+    goto X1X1;                  /* t1 = x1 - x0 */
  if (mp_sqr (&t1, &t1) != MP_OKAY)
-    goto X1X1;			/* t1 = (x1 - x0) * (y1 - y0) */
+    goto X1X1;                  /* t1 = (x1 - x0) * (x1 - x0) */

  /* add x0y0 */
  if (s_mp_add (&x0x0, &x1x1, &t2) != MP_OKAY)
-    goto X1X1;			/* t2 = x0y0 + x1y1 */
+    goto X1X1;                  /* t2 = x0y0 + x1y1 */
  if (mp_sub (&t2, &t1, &t1) != MP_OKAY)
-    goto X1X1;			/* t1 = x0y0 + x1y1 - (x1-x0)*(y1-y0) */
+    goto X1X1;                  /* t1 = x0y0 + x1y1 - (x1-x0)*(y1-y0) */

  /* shift by B */
  if (mp_lshd (&t1, B) != MP_OKAY)
-    goto X1X1;			/* t1 = (x0y0 + x1y1 - (x1-x0)*(y1-y0))<<B */
+    goto X1X1;                  /* t1 = (x0y0 + x1y1 - (x1-x0)*(y1-y0))<<B */
  if (mp_lshd (&x1x1, B * 2) != MP_OKAY)
-    goto X1X1;			/* x1y1 = x1y1 << 2*B */
+    goto X1X1;                  /* x1y1 = x1y1 << 2*B */

  if (mp_add (&x0x0, &t1, &t1) != MP_OKAY)
-    goto X1X1;			/* t1 = x0y0 + t1 */
+    goto X1X1;                  /* t1 = x0y0 + t1 */
  if (mp_add (&t1, &x1x1, b) != MP_OKAY)
-    goto X1X1;			/* t1 = x0y0 + t1 + x1y1 */
+    goto X1X1;                  /* t1 = x0y0 + t1 + x1y1 */

  err = MP_OKAY;

--- a/bn_mp_lshd.c
+++ b/bn_mp_lshd.c
@ -20,15 +20,16 @@ mp_lshd (mp_int * a, int b)
 {
  int     x, res;

-
  /* if its less than zero return */
  if (b <= 0) {
    return MP_OKAY;
  }

  /* grow to fit the new digits */
-  if ((res = mp_grow (a, a->used + b)) != MP_OKAY) {
-    return res;
+  if (a->alloc < a->used + b) {
+     if ((res = mp_grow (a, a->used + b)) != MP_OKAY) {
+       return res;
+     }
  }

  {
--- a/bn_mp_montgomery_calc_normalization.c
+++ b/bn_mp_montgomery_calc_normalization.c
@ -15,10 +15,10 @@
 #include <tommath.h>

 /* calculates a = B^n mod b for Montgomery reduction
- * Where B is the base [e.g. 2^DIGIT_BIT].  
+ * Where B is the base [e.g. 2^DIGIT_BIT].
 * B^n mod b is computed by first computing
 * A = B^(n-1) which doesn't require a reduction but a simple OR.
- * then C = A * B = B^n is computed by performing upto DIGIT_BIT 
+ * then C = A * B = B^n is computed by performing upto DIGIT_BIT
 * shifts with subtractions when the result is greater than b.
 *
 * The method is slightly modified to shift B unconditionally upto just under
@ -38,13 +38,13 @@ mp_montgomery_calc_normalization (mp_int * a, mp_int * b)
  }

  /* now compute C = A * B mod b */
-  for (x = bits - 1; x < DIGIT_BIT; x++) {
+  for (x = bits - 1; x < (int)DIGIT_BIT; x++) {
    if ((res = mp_mul_2 (a, a)) != MP_OKAY) {
      return res;
    }
    if (mp_cmp_mag (a, b) != MP_LT) {
      if ((res = s_mp_sub (a, b, a)) != MP_OKAY) {
-	return res;
+        return res;
      }
    }
  }
--- a/bn_mp_montgomery_reduce.c
+++ b/bn_mp_montgomery_reduce.c
@ -21,12 +21,19 @@ mp_montgomery_reduce (mp_int * a, mp_int * m, mp_digit mp)
  int     ix, res, digs;
  mp_digit ui;

+  /* can the fast reduction [comba] method be used?
+   *
+   * Note that unlike in mp_mul you're safely allowed *less*
+   * than the available columns [255 per default] since carries
+   * are fixed up in the inner loop.
+   */
  digs = m->used * 2 + 1;
-  if ((digs < 512)
-      && digs < (1 << ((CHAR_BIT * sizeof (mp_word)) - (2 * DIGIT_BIT)))) {
+  if ((digs < MP_WARRAY)
+      && m->used < (1 << ((CHAR_BIT * sizeof (mp_word)) - (2 * DIGIT_BIT)))) {
    return fast_mp_montgomery_reduce (a, m, mp);
  }

+  /* grow the input as required */
  if (a->alloc < m->used * 2 + 1) {
    if ((res = mp_grow (a, m->used * 2 + 1)) != MP_OKAY) {
      return res;
@ -50,15 +57,15 @@ mp_montgomery_reduce (mp_int * a, mp_int * m, mp_digit mp)

      mu = 0;
      for (iy = 0; iy < m->used; iy++) {
-	r = ((mp_word) ui) * ((mp_word) * tmpx++) + ((mp_word) mu) + ((mp_word) * tmpy);
-	mu = (r >> ((mp_word) DIGIT_BIT));
-	*tmpy++ = (r & ((mp_word) MP_MASK));
+        r = ((mp_word) ui) * ((mp_word) * tmpx++) + ((mp_word) mu) + ((mp_word) * tmpy);
+        mu = (r >> ((mp_word) DIGIT_BIT));
+        *tmpy++ = (r & ((mp_word) MP_MASK));
      }
      /* propagate carries */
      while (mu) {
-	*tmpy += mu;
-	mu = (*tmpy >> DIGIT_BIT) & 1;
-	*tmpy++ &= MP_MASK;
+        *tmpy += mu;
+        mu = (*tmpy >> DIGIT_BIT) & 1;
+        *tmpy++ &= MP_MASK;
      }
    }
  }
--- a/bn_mp_montgomery_setup.c
+++ b/bn_mp_montgomery_setup.c
@ -18,11 +18,11 @@
 int
 mp_montgomery_setup (mp_int * a, mp_digit * mp)
 {
-  unsigned long x, b;
+  mp_digit x, b;

-/* fast inversion mod 2^32 
+/* fast inversion mod 2^k
 *
- * Based on the fact that 
+ * Based on the fact that
 *
 * XA = 1 (mod 2^n)  =>  (X(2-XA)) A = 1 (mod 2^2n)
 *                   =>  2*X*A - X*X*A*A = 1
@ -34,13 +34,20 @@ mp_montgomery_setup (mp_int * a, mp_digit * mp)
    return MP_VAL;
  }

-  x = (((b + 2) & 4) << 1) + b;	/* here x*a==1 mod 2^4 */
-  x *= 2 - b * x;		/* here x*a==1 mod 2^8 */
-  x *= 2 - b * x;		/* here x*a==1 mod 2^16; each step doubles the nb of bits */
-  x *= 2 - b * x;		/* here x*a==1 mod 2^32 */
+  x = (((b + 2) & 4) << 1) + b; /* here x*a==1 mod 2^4 */
+  x *= 2 - b * x;               /* here x*a==1 mod 2^8 */
+#if !defined(MP_8BIT)
+  x *= 2 - b * x;               /* here x*a==1 mod 2^16; each step doubles the nb of bits */
+#endif
+#if defined(MP_64BIT) || !(defined(MP_8BIT) || defined(MP_16BIT))
+  x *= 2 - b * x;               /* here x*a==1 mod 2^32 */
+#endif
+#ifdef MP_64BIT
+  x *= 2 - b * x;               /* here x*a==1 mod 2^64 */
+#endif

  /* t = -1/m mod b */
-  *mp = ((mp_digit) 1 << ((mp_digit) DIGIT_BIT)) - (x & MP_MASK);
+  *mp = (((mp_digit) 1 << ((mp_digit) DIGIT_BIT)) - x) & MP_MASK;

  return MP_OKAY;
 }
--- a/bn_mp_mul.c
+++ b/bn_mp_mul.c
@ -24,15 +24,15 @@ mp_mul (mp_int * a, mp_int * b, mp_int * c)
    res = mp_karatsuba_mul (a, b, c);
  } else {

-    /* can we use the fast multiplier? 
+    /* can we use the fast multiplier?
     *
-     * The fast multiplier can be used if the output will have less than 
-     * 512 digits and the number of digits won't affect carry propagation
+     * The fast multiplier can be used if the output will have less than
+     * MP_WARRAY digits and the number of digits won't affect carry propagation
     */
    int     digs = a->used + b->used + 1;

-    if ((digs < 512)
-	&& digs < (1 << ((CHAR_BIT * sizeof (mp_word)) - (2 * DIGIT_BIT)))) {
+    if ((digs < MP_WARRAY)
+        && MIN(a->used, b->used) <= (1 << ((CHAR_BIT * sizeof (mp_word)) - (2 * DIGIT_BIT)))) {
      res = fast_s_mp_mul_digs (a, b, c, digs);
    } else {
      res = s_mp_mul (a, b, c);
--- a/bn_mp_mul_2.c
+++ b/bn_mp_mul_2.c
@ -20,10 +20,9 @@ mp_mul_2 (mp_int * a, mp_int * b)
 {
  int     x, res, oldused;

-  /* Optimization: should copy and shift at the same time */
-
-  if (b->alloc < a->used) {
-    if ((res = mp_grow (b, a->used)) != MP_OKAY) {
+  /* grow to accommodate result */
+  if (b->alloc < a->used + 1) {
+    if ((res = mp_grow (b, a->used + 1)) != MP_OKAY) {
      return res;
    }
  }
@ -31,7 +30,6 @@ mp_mul_2 (mp_int * a, mp_int * b)
  oldused = b->used;
  b->used = a->used;

-  /* shift any bit count < DIGIT_BIT */
  {
    register mp_digit r, rr, *tmpa, *tmpb;

@ -43,37 +41,32 @@ mp_mul_2 (mp_int * a, mp_int * b)

    /* carry */
    r = 0;
-    for (x = 0; x < b->used; x++) {
+    for (x = 0; x < a->used; x++) {
    
-      /* get what will be the *next* carry bit from the MSB of the current digit */
-      rr = *tmpa >> (DIGIT_BIT - 1);
+      /* get what will be the *next* carry bit from the 
+       * MSB of the current digit 
+       */
+      rr = *tmpa >> ((mp_digit)(DIGIT_BIT - 1));
      
      /* now shift up this digit, add in the carry [from the previous] */
-      *tmpb++ = ((*tmpa++ << 1) | r) & MP_MASK;
+      *tmpb++ = ((*tmpa++ << ((mp_digit)1)) | r) & MP_MASK;
      
-      /* copy the carry that would be from the source digit into the next iteration */
+      /* copy the carry that would be from the source 
+       * digit into the next iteration 
+       */
      r = rr;
    }

    /* new leading digit? */
    if (r != 0) {
-      /* do we have to grow to accomodate the new digit? */
-      if (b->alloc == b->used) {
-	if ((res = mp_grow (b, b->used + 1)) != MP_OKAY) {
-	  return res;
-	}
-
-	/* after the grow *tmpb is no longer valid so we have to reset it! 
-	 * (this bug took me about 17 minutes to find...!)
-	 */
-	tmpb = b->dp + b->used;
-      }
      /* add a MSB which is always 1 at this point */
      *tmpb = 1;
      ++b->used;
    }

-    /* now zero any excess digits on the destination that we didn't write to */
+    /* now zero any excess digits on the destination 
+     * that we didn't write to 
+     */
    tmpb = b->dp + b->used;
    for (x = b->used; x < oldused; x++) {
      *tmpb++ = 0;
--- a/bn_mp_mul_2d.c
+++ b/bn_mp_mul_2d.c
@ -14,24 +14,34 @@
 */
 #include <tommath.h>

+/* NOTE:  This routine requires updating.  For instance the c->used = c->alloc bit
+   is wrong.  We should just shift c->used digits then set the carry as c->dp[c->used] = carry
+ 
+   To be fixed for LTM 0.18
+ */
+
 /* shift left by a certain bit count */
 int
 mp_mul_2d (mp_int * a, int b, mp_int * c)
 {
-  mp_digit d, r, rr;
-  int     x, res;
+  mp_digit d;
+  int      res;

  /* copy */
-  if ((res = mp_copy (a, c)) != MP_OKAY) {
-    return res;
+  if (a != c) {
+     if ((res = mp_copy (a, c)) != MP_OKAY) {
+       return res;
+     }
  }

-  if ((res = mp_grow (c, c->used + b / DIGIT_BIT + 1)) != MP_OKAY) {
-    return res;
+  if (c->alloc < (int)(c->used + b/DIGIT_BIT + 2)) {
+     if ((res = mp_grow (c, c->used + b / DIGIT_BIT + 2)) != MP_OKAY) {
+       return res;
+     }
  }

  /* shift by as many digits in the bit count */
-  if (b >= DIGIT_BIT) {
+  if (b >= (int)DIGIT_BIT) {
    if ((res = mp_lshd (c, b / DIGIT_BIT)) != MP_OKAY) {
      return res;
    }
@ -41,14 +51,15 @@ mp_mul_2d (mp_int * a, int b, mp_int * c)
  /* shift any bit count < DIGIT_BIT */
  d = (mp_digit) (b % DIGIT_BIT);
  if (d != 0) {
-    register mp_digit *tmpc, mask;
-    
+    register mp_digit *tmpc, mask, r, rr;
+    register int x;
+
    /* bitmask for carries */
-    mask = (1U << d) - 1U;
-    
+    mask = (((mp_digit)1) << d) - 1;
+
    /* alias */
    tmpc = c->dp;
-    
+
    /* carry */
    r    = 0;
    for (x = 0; x < c->used; x++) {
--- a/bn_mp_mul_d.c
+++ b/bn_mp_mul_d.c
@ -20,6 +20,7 @@ mp_mul_d (mp_int * a, mp_digit b, mp_int * c)
 {
  int     res, pa, olduse;

+  /* make sure c is big enough to hold a*b */
  pa = a->used;
  if (c->alloc < pa + 1) {
    if ((res = mp_grow (c, pa + 1)) != MP_OKAY) {
@ -27,7 +28,10 @@ mp_mul_d (mp_int * a, mp_digit b, mp_int * c)
    }
  }

+  /* get the original destinations used count */
  olduse = c->used;
+
+  /* set the new temporary used count */
  c->used = pa + 1;

  {
@ -35,21 +39,31 @@ mp_mul_d (mp_int * a, mp_digit b, mp_int * c)
    register mp_word r;
    register int ix;

-    tmpc = c->dp + c->used;
-    for (ix = c->used; ix < olduse; ix++) {
-      *tmpc++ = 0;
-    }
-
+    /* alias for a->dp [source] */
    tmpa = a->dp;
+
+    /* alias for c->dp [dest] */
    tmpc = c->dp;

+    /* zero carry */
    u = 0;
    for (ix = 0; ix < pa; ix++) {
+      /* compute product and carry sum for this term */
      r = ((mp_word) u) + ((mp_word) * tmpa++) * ((mp_word) b);
+
+      /* mask off higher bits to get a single digit */
      *tmpc++ = (mp_digit) (r & ((mp_word) MP_MASK));
+
+      /* send carry into next iteration */
      u = (mp_digit) (r >> ((mp_word) DIGIT_BIT));
    }
-    *tmpc = u;
+    /* store final carry [if any] */
+    *tmpc++ = u;
+
+    /* now zero digits above the top */
+    for (; pa < olduse; pa++) {
+       *tmpc++ = 0;
+    }
  }

  mp_clamp (c);
--- a/bn_mp_multi.c
+++ b/bn_mp_multi.c
@ -0,0 +1,64 @@
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is library that provides for multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library is designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+#include <tommath.h>
+#include <stdarg.h>
+
+/* init a NULL terminated list of mp_ints; if one init fails the earlier ones are cleared */
+int mp_init_multi(mp_int *mp, ...)
+{
+    mp_err res = MP_OKAY;      /* Assume ok until proven otherwise */
+    int n = 0;                 /* Number of ok inits */
+    mp_int* cur_arg = mp;
+    va_list args;
+
+    va_start(args, mp);        /* init args to next argument from caller */
+    while (cur_arg != NULL) {
+        if (mp_init(cur_arg) != MP_OKAY) {
+            /* Oops - error! Back-track and mp_clear what we already
+               succeeded in init-ing, then return error.
+            */
+            va_list clean_args;
+
+            /* walk the arguments again with a second, independent list;
+               args stays open and is closed exactly once after the loop
+               [calling va_end on it twice is undefined behaviour] */
+            cur_arg = mp;
+            va_start(clean_args, mp);
+            while (n--) {
+                mp_clear(cur_arg);
+                cur_arg = va_arg(clean_args, mp_int*);
+            }
+            va_end(clean_args);
+            res = MP_MEM;
+            break;
+        }
+        n++;
+        cur_arg = va_arg(args, mp_int*);
+    }
+    va_end(args);
+    return res;                /* MP_OKAY, or MP_MEM if an init failed */
+}
+
+/* clears every mp_int in a NULL terminated argument list */
+void mp_clear_multi(mp_int *mp, ...)
+{
+    va_list ap;
+    mp_int *cur;
+    va_start(ap, mp);
+    for (cur = mp; cur != NULL; cur = va_arg(ap, mp_int*)) {
+        mp_clear(cur);
+    }
+    va_end(ap);
+}
--- a/bn_mp_prime_is_divisible.c
+++ b/bn_mp_prime_is_divisible.c
@ -14,7 +14,7 @@
 */
 #include <tommath.h>

-/* determines if an integers is divisible by one of the first 256 primes or not 
+/* determines if an integer is divisible by one of the first PRIME_SIZE primes or not
 *
 * sets result to 0 if not, 1 if yes
 */
@ -27,7 +27,7 @@ mp_prime_is_divisible (mp_int * a, int *result)
  /* default to not */
  *result = 0;

-  for (ix = 0; ix < 256; ix++) {
+  for (ix = 0; ix < PRIME_SIZE; ix++) {
    /* is it equal to the prime? */
    if (mp_cmp_d (a, __prime_tab[ix]) == MP_EQ) {
      *result = 1;
--- a/bn_mp_prime_is_prime.c
+++ b/bn_mp_prime_is_prime.c
@ -31,10 +31,18 @@ mp_prime_is_prime (mp_int * a, int t, int *result)
  *result = 0;

  /* valid value of t? */
-  if (t < 1 || t > 256) {
+  if (t < 1 || t > PRIME_SIZE) {
    return MP_VAL;
  }

+  /* is the input equal to one of the primes in the table? */
+  for (ix = 0; ix < PRIME_SIZE; ix++) {
+      if (mp_cmp_d(a, __prime_tab[ix]) == MP_EQ) {
+         *result = 1;
+         return MP_OKAY;
+      }
+  }
+
  /* first perform trial division */
  if ((err = mp_prime_is_divisible (a, &res)) != MP_OKAY) {
    return err;
--- a/bn_mp_prime_next_prime.c
+++ b/bn_mp_prime_next_prime.c
@ -20,35 +20,35 @@
 int mp_prime_next_prime(mp_int *a, int t)
 {
   int err, res;
-   
+
   if (mp_iseven(a) == 1) {
      /* force odd */
      if ((err = mp_add_d(a, 1, a)) != MP_OKAY) {
         return err;
      }
   } else {
-      /* force to next number */
+      /* force to next odd number */
      if ((err = mp_add_d(a, 2, a)) != MP_OKAY) {
         return err;
      }
-   }     
-   
+   }
+
   for (;;) {
      /* is this prime? */
      if ((err = mp_prime_is_prime(a, t, &res)) != MP_OKAY) {
         return err;
      }
-      
+
      if (res == 1) {
         break;
      }
-      
+
      /* add two, next candidate */
      if ((err = mp_add_d(a, 2, a)) != MP_OKAY) {
         return err;
      }
   }
-   
+
   return MP_OKAY;
 }

--- a/bn_mp_reduce.c
+++ b/bn_mp_reduce.c
@ -21,8 +21,7 @@ int
 mp_reduce_setup (mp_int * a, mp_int * b)
 {
  int     res;
-
-
+  
  if ((res = mp_2expt (a, b->used * 2 * DIGIT_BIT)) != MP_OKAY) {
    return res;
  }
@ -30,8 +29,8 @@ mp_reduce_setup (mp_int * a, mp_int * b)
  return res;
 }

-/* reduces x mod m, assumes 0 < x < m^2, mu is precomputed via mp_reduce_setup 
- * From HAC pp.604 Algorithm 14.42 
+/* reduces x mod m, assumes 0 < x < m^2, mu is precomputed via mp_reduce_setup
+ * From HAC pp.604 Algorithm 14.42
 */
 int
 mp_reduce (mp_int * x, mp_int * m, mp_int * mu)
@ -39,15 +38,15 @@ mp_reduce (mp_int * x, mp_int * m, mp_int * mu)
  mp_int  q;
  int     res, um = m->used;

-
  if ((res = mp_init_copy (&q, x)) != MP_OKAY) {
    return res;
  }

-  mp_rshd (&q, um - 1);		/* q1 = x / b^(k-1)  */
+  /* q1 = x / b^(k-1)  */
+  mp_rshd (&q, um - 1);         

  /* according to HAC this is optimization is ok */
-  if (((unsigned long) m->used) > (1UL << (unsigned long) (DIGIT_BIT - 1UL))) {
+  if (((unsigned long) m->used) > (((mp_digit)1) << (DIGIT_BIT - 1))) {
    if ((res = mp_mul (&q, mu, &q)) != MP_OKAY) {
      goto CLEANUP;
    }
@ -57,7 +56,8 @@ mp_reduce (mp_int * x, mp_int * m, mp_int * mu)
    }
  }

-  mp_rshd (&q, um + 1);		/* q3 = q2 / b^(k+1) */
+  /* q3 = q2 / b^(k+1) */
+  mp_rshd (&q, um + 1);         

  /* x = x mod b^(k+1), quick (no division) */
  if ((res = mp_mod_2d (x, DIGIT_BIT * (um + 1), x)) != MP_OKAY) {
@ -70,8 +70,9 @@ mp_reduce (mp_int * x, mp_int * m, mp_int * mu)
  }

  /* x = x - q */
-  if ((res = mp_sub (x, &q, x)) != MP_OKAY)
+  if ((res = mp_sub (x, &q, x)) != MP_OKAY) {
    goto CLEANUP;
+  }

  /* If x < 0, add b^(k+1) to it */
  if (mp_cmp_d (x, 0) == MP_LT) {
@ -84,8 +85,9 @@ mp_reduce (mp_int * x, mp_int * m, mp_int * mu)

  /* Back off if it's too big */
  while (mp_cmp (x, m) != MP_LT) {
-    if ((res = s_mp_sub (x, m, x)) != MP_OKAY)
+    if ((res = s_mp_sub (x, m, x)) != MP_OKAY) {
      break;
+    }
  }

 CLEANUP:
--- a/bn_mp_rshd.c
+++ b/bn_mp_rshd.c
@ -26,7 +26,7 @@ mp_rshd (mp_int * a, int b)
  }

  /* if b > used then simply zero it and return */
-  if (a->used < b) {
+  if (a->used <= b) {
    mp_zero (a);
    return;
  }
@ -42,8 +42,9 @@ mp_rshd (mp_int * a, int b)
    /* offset into digits */
    tmpaa = a->dp + b;

-    /* this is implemented as a sliding window where the window is b-digits long
-     * and digits from the top of the window are copied to the bottom
+    /* this is implemented as a sliding window where 
+     * the window is b-digits long and digits from 
+     * the top of the window are copied to the bottom
     *
     * e.g.

--- a/bn_mp_set_int.c
+++ b/bn_mp_set_int.c
@ -16,15 +16,13 @@

 /* set a 32-bit const */
 int
-mp_set_int (mp_int * a, unsigned long b)
+mp_set_int (mp_int * a, unsigned int b)
 {
  int     x, res;

  mp_zero (a);
-
-  /* set four bits at a time, simplest solution to the what if DIGIT_BIT==7 case */
+  /* set four bits at a time */
  for (x = 0; x < 8; x++) {
-
    /* shift the number up four bits */
    if ((res = mp_mul_2d (a, 4, a)) != MP_OKAY) {
      return res;
@ -37,9 +35,8 @@ mp_set_int (mp_int * a, unsigned long b)
    b <<= 4;

    /* ensure that digits are not clamped off */
-    a->used += 32 / DIGIT_BIT + 1;
+    a->used += 32 / DIGIT_BIT + 2;
  }
-
  mp_clamp (a);
  return MP_OKAY;
 }
--- a/bn_mp_sqr.c
+++ b/bn_mp_sqr.c
@ -24,8 +24,7 @@ mp_sqr (mp_int * a, mp_int * b)
  } else {

    /* can we use the fast multiplier? */
-    if (((a->used * 2 + 1) < 512)
-	&& a->used < (1 << ((CHAR_BIT * sizeof (mp_word)) - (2 * DIGIT_BIT) - 1))) {
+    if ((a->used * 2 + 1) < 512 && a->used < (1 << (sizeof(mp_word) * CHAR_BIT - 2*DIGIT_BIT - 1))) {
      res = fast_s_mp_sqr (a, b);
    } else {
      res = s_mp_sqr (a, b);
--- a/bn_mp_sub.c
+++ b/bn_mp_sub.c
@ -20,39 +20,34 @@ mp_sub (mp_int * a, mp_int * b, mp_int * c)
 {
  int     sa, sb, res;

-
  sa = a->sign;
  sb = b->sign;

-  /* handle four cases */
-  if (sa == MP_ZPOS && sb == MP_ZPOS) {
-    /* both positive, a - b, but if b>a then we do -(b - a) */
-    if (mp_cmp_mag (a, b) == MP_LT) {
-      /* b>a */
-      res = s_mp_sub (b, a, c);
-      c->sign = MP_NEG;
-    } else {
-      res = s_mp_sub (a, b, c);
-      c->sign = MP_ZPOS;
-    }
-  } else if (sa == MP_ZPOS && sb == MP_NEG) {
-    /* a - -b == a + b  */
+  if (sa != sb) {
+    /* subtract a negative from a positive, OR */
+    /* subtract a positive from a negative. */
+    /* In either case, ADD their magnitudes, */
+    /* and use the sign of the first number. */
+    c->sign = sa;
    res = s_mp_add (a, b, c);
-    c->sign = MP_ZPOS;
-  } else if (sa == MP_NEG && sb == MP_ZPOS) {
-    /* -a - b == -(a + b) */
-    res = s_mp_add (a, b, c);
-    c->sign = MP_NEG;
  } else {
-    /* -a - -b == b - a, but if a>b == -(a - b) */
-    if (mp_cmp_mag (a, b) == MP_GT) {
+    /* subtract a positive from a positive, OR */
+    /* subtract a negative from a negative. */
+    /* First, take the difference between their */
+    /* magnitudes, then... */
+    if (mp_cmp_mag (a, b) != MP_LT) {
+      /* Copy the sign from the first */
+      c->sign = sa;
+      /* The first has a larger or equal magnitude */
      res = s_mp_sub (a, b, c);
-      c->sign = MP_NEG;
    } else {
+      /* The result has the *opposite* sign from */
+      /* the first number. */
+      c->sign = (sa == MP_ZPOS) ? MP_NEG : MP_ZPOS;
+      /* The second has a larger magnitude */
      res = s_mp_sub (b, a, c);
-      c->sign = MP_ZPOS;
    }
  }
-
  return res;
 }
+
--- a/bn_prime_tab.c
+++ b/bn_prime_tab.c
@ -17,7 +17,9 @@ const mp_digit __prime_tab[] = {
  0x0002, 0x0003, 0x0005, 0x0007, 0x000B, 0x000D, 0x0011, 0x0013,
  0x0017, 0x001D, 0x001F, 0x0025, 0x0029, 0x002B, 0x002F, 0x0035,
  0x003B, 0x003D, 0x0043, 0x0047, 0x0049, 0x004F, 0x0053, 0x0059,
-  0x0061, 0x0065, 0x0067, 0x006B, 0x006D, 0x0071, 0x007F, 0x0083,
+  0x0061, 0x0065, 0x0067, 0x006B, 0x006D, 0x0071, 0x007F,
+#ifndef MP_8BIT
+  0x0083,
  0x0089, 0x008B, 0x0095, 0x0097, 0x009D, 0x00A3, 0x00A7, 0x00AD,
  0x00B3, 0x00B5, 0x00BF, 0x00C1, 0x00C5, 0x00C7, 0x00D3, 0x00DF,
  0x00E3, 0x00E5, 0x00E9, 0x00EF, 0x00F1, 0x00FB, 0x0101, 0x0107,
@ -49,4 +51,5 @@ const mp_digit __prime_tab[] = {
  0x05BF, 0x05C9, 0x05CB, 0x05CF, 0x05D1, 0x05D5, 0x05DB, 0x05E7,
  0x05F3, 0x05FB, 0x0607, 0x060D, 0x0611, 0x0617, 0x061F, 0x0623,
  0x062B, 0x062F, 0x063D, 0x0641, 0x0647, 0x0649, 0x064D, 0x0653
+#endif
 };
--- a/bn_radix.c
+++ b/bn_radix.c
@ -135,3 +135,80 @@ mp_radix_size (mp_int * a, int radix)
  mp_clear (&t);
  return digs + 1;
 }
+
+/* read a bigint from a file stream in ASCII
+ *
+ * a      - destination; it is zeroed first and then built up digit by digit
+ * radix  - base of the text, 2..64; digits are looked up in s_rmap
+ * stream - input stream to read characters from
+ *
+ * An optional leading '-' marks a negative value.  Characters are consumed
+ * until one is read that is not a digit of the given radix (or fgetc reports
+ * EOF).  That terminating character is consumed too and is NOT pushed back.
+ * If no digit is read at all, a is left at zero and MP_OKAY is returned.
+ *
+ * Returns MP_VAL for an unsupported radix, the error code of a failing
+ * mp_mul_d/mp_add_d, and MP_OKAY otherwise.
+ */
+int mp_fread(mp_int *a, int radix, FILE *stream)
+{
+   int err, ch, neg, y;
+
+   /* reject radices the digit map cannot serve; radix bounds the index
+    * used on s_rmap below, so an oversized value would read past its end
+    * (s_rmap is assumed to hold 64 digit characters -- confirm against
+    * its definition)
+    */
+   if (radix < 2 || radix > 64) {
+      return MP_VAL;
+   }
+
+   /* clear a */
+   mp_zero(a);
+
+   /* if first digit is - then set negative */
+   ch = fgetc(stream);
+   if (ch == '-') {
+      neg = MP_NEG;
+      ch = fgetc(stream);
+   } else {
+      neg = MP_ZPOS;
+   }
+
+   for (;;) {
+      /* find y in the radix map */
+      for (y = 0; y < radix; y++) {
+          if (s_rmap[y] == ch) {
+             break;
+          }
+      }
+      /* not a digit of this radix: the number ends here */
+      if (y == radix) {
+         break;
+      }
+
+      /* shift up and add: a = a * radix + y */
+      if ((err = mp_mul_d(a, radix, a)) != MP_OKAY) {
+         return err;
+      }
+      if ((err = mp_add_d(a, y, a)) != MP_OKAY) {
+         return err;
+      }
+
+      ch = fgetc(stream);
+   }
+
+   /* only attach the sign to a non-zero value so "-0" never appears */
+   if (mp_cmp_d(a, 0) != MP_EQ) {
+      a->sign = neg;
+   }
+
+   return MP_OKAY;
+}
+
+/* write a bigint to a file stream in ASCII
+ *
+ * a      - value to write
+ * radix  - base of the text representation
+ * stream - output stream
+ *
+ * The text is produced by mp_toradix into a temporary heap buffer whose
+ * size comes from mp_radix_size, and every byte of that buffer is written,
+ * including the last one (presumably the string terminator -- confirm
+ * against mp_toradix).
+ *
+ * Returns MP_VAL if mp_radix_size reports 0 or a character cannot be
+ * written, MP_MEM if the buffer cannot be allocated, the error code of
+ * mp_toradix if the conversion fails, and MP_OKAY otherwise.
+ */
+int mp_fwrite(mp_int *a, int radix, FILE *stream)
+{
+   char *buf;
+   int   err, len, x;
+
+   /* a size of zero is treated as a failure of mp_radix_size */
+   if ((len = mp_radix_size(a, radix)) == 0) {
+      return MP_VAL;
+   }
+
+   if ((buf = malloc(len)) == NULL) {
+      return MP_MEM;
+   }
+
+   /* convert, then emit the buffer one character at a time */
+   err = mp_toradix(a, buf, radix);
+   if (err == MP_OKAY) {
+      for (x = 0; x < len; x++) {
+         if (fputc(buf[x], stream) == EOF) {
+            err = MP_VAL;
+            break;
+         }
+      }
+   }
+
+   /* single exit: the buffer is released on every path */
+   free(buf);
+   return err;
+}
+
--- a/bn_reverse.c
+++ b/bn_reverse.c
@ -24,7 +24,7 @@ bn_reverse (unsigned char *s, int len)
  ix = 0;
  iy = len - 1;
  while (ix < iy) {
-    t = s[ix];
+    t     = s[ix];
    s[ix] = s[iy];
    s[iy] = t;
    ++ix;
--- a/bn_s_mp_add.c
+++ b/bn_s_mp_add.c
@ -28,13 +28,10 @@ s_mp_add (mp_int * a, mp_int * b, mp_int * c)
    min = b->used;
    max = a->used;
    x = a;
-  } else if (a->used < b->used) {
+  } else {
    min = a->used;
    max = b->used;
    x = b;
-  } else {
-    min = max = a->used;
-    x = NULL;
  }

  /* init result */
@ -44,11 +41,10 @@ s_mp_add (mp_int * a, mp_int * b, mp_int * c)
    }
  }

+  /* get old used digit count and set new one */
  olduse = c->used;
  c->used = max + 1;

-  /* add digits from lower part */
-
  /* set the carry to zero */
  {
    register mp_digit u, *tmpa, *tmpb, *tmpc;
@ -65,36 +61,39 @@ s_mp_add (mp_int * a, mp_int * b, mp_int * c)
    /* destination */
    tmpc = c->dp;

+    /* zero the carry */
    u = 0;
    for (i = 0; i < min; i++) {
      /* Compute the sum at one digit, T[i] = A[i] + B[i] + U */
      *tmpc = *tmpa++ + *tmpb++ + u;

      /* U = carry bit of T[i] */
-      u = *tmpc >> DIGIT_BIT;
+      u = *tmpc >> ((mp_digit)DIGIT_BIT);

      /* take away carry bit from T[i] */
      *tmpc++ &= MP_MASK;
    }

-    /* now copy higher words if any, that is in A+B if A or B has more digits add those in */
+    /* now copy higher words if any, that is in A+B 
+     * if A or B has more digits add those in 
+     */
    if (min != max) {
      for (; i < max; i++) {
-	/* T[i] = X[i] + U */
-	*tmpc = x->dp[i] + u;
+        /* T[i] = X[i] + U */
+        *tmpc = x->dp[i] + u;

-	/* U = carry bit of T[i] */
-	u = *tmpc >> DIGIT_BIT;
+        /* U = carry bit of T[i] */
+        u = *tmpc >> ((mp_digit)DIGIT_BIT);

-	/* take away carry bit from T[i] */
-	*tmpc++ &= MP_MASK;
+        /* take away carry bit from T[i] */
+        *tmpc++ &= MP_MASK;
      }
    }

    /* add carry */
    *tmpc++ = u;

-    /* clear digits above used (since we may not have grown result above) */
+    /* clear digits above oldused */
    for (i = c->used; i < olduse; i++) {
      *tmpc++ = 0;
    }
--- a/bn_s_mp_mul_digs.c
+++ b/bn_s_mp_mul_digs.c
@ -15,8 +15,8 @@
 #include <tommath.h>

 /* multiplies |a| * |b| and only computes upto digs digits of result
- * HAC pp. 595, Algorithm 14.12  Modified so you can control how many digits of 
- * output are created.  
+ * HAC pp. 595, Algorithm 14.12  Modified so you can control how 
+ * many digits of output are created.
 */
 int
 s_mp_mul_digs (mp_int * a, mp_int * b, mp_int * c, int digs)
@ -27,6 +27,13 @@ s_mp_mul_digs (mp_int * a, mp_int * b, mp_int * c, int digs)
  mp_word r;
  mp_digit tmpx, *tmpt, *tmpy;

+  /* can we use the fast multiplier? */
+  if (((digs) < MP_WARRAY) &&
+      MIN (a->used, b->used) < 
+          (1 << ((CHAR_BIT * sizeof (mp_word)) - (2 * DIGIT_BIT)))) {
+    return fast_s_mp_mul_digs (a, b, c, digs);
+  }
+
  if ((res = mp_init_size (&t, digs)) != MP_OKAY) {
    return res;
  }
@ -42,14 +49,21 @@ s_mp_mul_digs (mp_int * a, mp_int * b, mp_int * c, int digs)
    pb = MIN (b->used, digs - ix);

    /* setup some aliases */
+    /* copy of the digit from a used within the nested loop */
    tmpx = a->dp[ix];
-    tmpt = &(t.dp[ix]);
+    
+    /* an alias for the destination shifted ix places */
+    tmpt = t.dp + ix;
+    
+    /* an alias for the digits of b */
    tmpy = b->dp;

    /* compute the columns of the output and propagate the carry */
    for (iy = 0; iy < pb; iy++) {
      /* compute the column as a mp_word */
-      r = ((mp_word) * tmpt) + ((mp_word) tmpx) * ((mp_word) * tmpy++) + ((mp_word) u);
+      r = ((mp_word) *tmpt) + 
+          ((mp_word) tmpx) * ((mp_word) * tmpy++) + 
+          ((mp_word) u);

      /* the new column is the lower part of the result */
      *tmpt++ = (mp_digit) (r & ((mp_word) MP_MASK));
@ -57,8 +71,10 @@ s_mp_mul_digs (mp_int * a, mp_int * b, mp_int * c, int digs)
      /* get the carry word from the result */
      u = (mp_digit) (r >> ((mp_word) DIGIT_BIT));
    }
-    if (ix + iy < digs)
+    /* set carry if it is placed below digs */
+    if (ix + iy < digs) {
      *tmpt = u;
+    }
  }

  mp_clamp (&t);
--- a/bn_s_mp_mul_high_digs.c
+++ b/bn_s_mp_mul_high_digs.c
@ -14,7 +14,7 @@
 */
 #include <tommath.h>

-/* multiplies |a| * |b| and does not compute the lower digs digits 
+/* multiplies |a| * |b| and does not compute the lower digs digits
 * [meant to get the higher part of the product]
 */
 int
@ -28,8 +28,8 @@ s_mp_mul_high_digs (mp_int * a, mp_int * b, mp_int * c, int digs)


  /* can we use the fast multiplier? */
-  if (((a->used + b->used + 1) < 512)
-      && MAX (a->used, b->used) < (1 << ((CHAR_BIT * sizeof (mp_word)) - (2 * DIGIT_BIT)))) {
+  if (((a->used + b->used + 1) < MP_WARRAY)
+      && MIN (a->used, b->used) < (1 << ((CHAR_BIT * sizeof (mp_word)) - (2 * DIGIT_BIT)))) {
    return fast_s_mp_mul_high_digs (a, b, c, digs);
  }

--- a/bn_s_mp_sub.c
+++ b/bn_s_mp_sub.c
@ -14,7 +14,7 @@
 */
 #include <tommath.h>

-/* low level subtraction (assumes a > b), HAC pp.595 Algorithm 14.9 */
+/* low level subtraction (assumes |a| > |b|), HAC pp.595 Algorithm 14.9 */
 int
 s_mp_sub (mp_int * a, mp_int * b, mp_int * c)
 {
@ -34,7 +34,6 @@ s_mp_sub (mp_int * a, mp_int * b, mp_int * c)
  c->used = max;

  /* sub digits from lower part */
-
  {
    register mp_digit u, *tmpa, *tmpb, *tmpc;
    register int i;
@ -50,12 +49,12 @@ s_mp_sub (mp_int * a, mp_int * b, mp_int * c)
      /* T[i] = A[i] - B[i] - U */
      *tmpc = *tmpa++ - *tmpb++ - u;

-      /* U = carry bit of T[i] 
-       * Note this saves performing an AND operation since 
+      /* U = carry bit of T[i]
+       * Note this saves performing an AND operation since
       * if a carry does occur it will propagate all the way to the
       * MSB.  As a result a single shift is required to get the carry
       */
-      u = *tmpc >> (CHAR_BIT * sizeof (mp_digit) - 1);
+      u = *tmpc >> ((mp_digit)(CHAR_BIT * sizeof (mp_digit) - 1));

      /* Clear carry from T[i] */
      *tmpc++ &= MP_MASK;
@ -67,7 +66,7 @@ s_mp_sub (mp_int * a, mp_int * b, mp_int * c)
      *tmpc = *tmpa++ - u;

      /* U = carry bit of T[i] */
-      u = *tmpc >> (CHAR_BIT * sizeof (mp_digit) - 1);
+      u = *tmpc >> ((mp_digit)(CHAR_BIT * sizeof (mp_digit) - 1));

      /* Clear carry from T[i] */
      *tmpc++ &= MP_MASK;
--- a/bncore.c
+++ b/bncore.c
@ -14,7 +14,15 @@
 */
 #include <tommath.h>

-/* configured for a AMD Duron Morgan core with etc/tune.c */
-int     KARATSUBA_MUL_CUTOFF = 73,	/* Min. number of digits before Karatsuba multiplication is used. */
-        KARATSUBA_SQR_CUTOFF = 121,	/* Min. number of digits before Karatsuba squaring is used. */
-        MONTGOMERY_EXPT_CUTOFF = 128;	/* max. number of digits that montgomery reductions will help for */
+/* Known optimal configurations
+
+ CPU                    /Compiler     /MUL CUTOFF/SQR CUTOFF
+-------------------------------------------------------------
+ Intel P4               /GCC v3.2     /        81/       110
+ AMD Athlon XP          /GCC v3.2     /       109/       127
+
+*/
+
+/* configured for an AMD XP Thoroughbred core with etc/tune.c */
+int     KARATSUBA_MUL_CUTOFF = 109,      /* Min. number of digits before Karatsuba multiplication is used. */
+        KARATSUBA_SQR_CUTOFF = 127;      /* Min. number of digits before Karatsuba squaring is used. */
--- a/booker.pl
+++ b/booker.pl
@ -0,0 +1,261 @@
+#!/bin/perl
+#
+#Used to prepare the book "tommath.src" for LaTeX by pre-processing it into a .tex file
+#
+#Essentially you write the "tommath.src" as normal LaTeX except where you want code snippets you put
+#
+#EXAM,file
+#
+#This preprocessor will then open "file" and insert it as a verbatim copy.
+#
+#Tom St Denis
+
+#get graphics type
+#the first command-line argument selects the suffix appended to figure file
+#names: an argument matching "PDF" means no suffix, anything else ".ps"
+if (shift =~ /PDF/) {
+   $graph = "";
+} else {
+   $graph = ".ps";
+}
+
+open(IN,"<tommath.src") or die "Can't open source file";
+open(OUT,">tommath.tex") or die "Can't open destination file";
+
+#first pass: count the headings so that every "MARK,name" line can be mapped
+#to the chapter/section/subsection numbers in effect where it appears
+print "Scanning for sections\n";
+$chapter = $section = $subsection = 0;
+$x = 0;
+while (<IN>) {
+   #progress dots, 80 per row
+   print ".";
+   if (!(++$x % 80)) { print "\n"; }
+
+   #update the headings
+   #the "{" has to follow the command name directly, so starred (unnumbered)
+   #forms such as \chapter*{...} never match and are not counted.  The braces
+   #are escaped because newer Perl releases reject unescaped literal "{".
+   if ($_ =~ /\\chapter\{.+\}/) {
+       ++$chapter;
+       $section = $subsection = 0;
+   } elsif ($_ =~ /\\section\{.+\}/) {
+       ++$section;
+       $subsection = 0;
+   } elsif ($_ =~ /\\subsection\{.+\}/) {
+       ++$subsection;
+   }
+
+   #MARK,name records the current position under "name"
+   if ($_ =~ m/MARK/) {
+      @m = split(",",$_);
+      chomp($m[1]);
+      $index1{$m[1]} = $chapter;
+      $index2{$m[1]} = $section;
+      $index3{$m[1]} = $subsection;
+   }
+}
+close(IN);
+
+#second pass: copy tommath.src to tommath.tex, expanding the directives
+#  MARK,name          dropped (only used by the first pass)
+#  EXAM,file          insert file verbatim, skipping its header
+#  LIST,file          insert file verbatim, complete
+#  @line,text@        replaced by the line number of "text" in the last inserted file
+#  ~name~             replaced by a reference to the position recorded by MARK,name
+#  FIGU,file,caption  replaced by a figure environment
+open(IN,"<tommath.src") or die "Can't open source file";
+$readline = $wroteline = 0;
+$srcline = 0;
+
+#spelled-out chapter numbers, indexed by chapter number (1..10)
+@chapternames = ("", "one", "two", "three", "four", "five",
+                 "six", "seven", "eight", "nine", "ten");
+
+while (<IN>) {
+   ++$readline;
+   ++$srcline;
+
+   if ($_ =~ m/MARK/) {
+      # markers produce no output
+   } elsif ($_ =~ m/EXAM/ || $_ =~ m/LIST/) {
+      if ($_ =~ m/EXAM/) {
+         $skipheader = 1;
+      } else {
+         $skipheader = 0;
+      }
+
+      # EXAM,file
+      chomp($_);
+      @m = split(",",$_);
+      open(SRC,"<$m[1]") or die "Error:$srcline:Can't open source file $m[1]";
+
+      print "$srcline:Inserting $m[1]:";
+
+      $line = 0;
+      # escape underscores of the file name for LaTeX
+      $tmp = $m[1];
+      $tmp =~ s/_/"\\_"/ge;
+      print OUT "\\index{$tmp}\n\\vspace{+3mm}\\begin{small}\n\\hspace{-5.1mm}{\\bf File}: $tmp\n\\vspace{-3mm}\n\\begin{alltt}\n";
+      $wroteline += 5;
+
+      if ($skipheader == 1) {
+         # skip everything up to and including the first line that mentions
+         # tommath.h; the skipped lines are still stored so that the line
+         # numbers match the real file
+         while (<SRC>) {
+            $text[$line++] = $_;
+            last if ($_ =~ /tommath\.h/);
+         }
+      }
+
+      $inline = 0;
+      while (<SRC>) {
+         $text[$line++] = $_;
+         ++$inline;
+         chomp($_);
+         # make the line safe for the alltt environment: expand tabs, tag
+         # braces with a temporary "^", replace backslashes, then turn the
+         # "^" tags into backslashes.
+         # NOTE(review): the last step also turns a literal "^" of the
+         # inserted source into a backslash.
+         $_ =~ s/\t/"    "/ge;
+         $_ =~ s/\{/"^{"/ge;
+         $_ =~ s/}/"^}"/ge;
+         $_ =~ s/\\/'\symbol{92}'/ge;
+         $_ =~ s/\^/"\\"/ge;
+
+         # line number, then the text with a forced break after column 75
+         printf OUT ("%03d   ", $line);
+         for ($x = 0; $x < length($_); $x++) {
+             print OUT chr(vec($_, $x, 8));
+             if ($x == 75) {
+                 print OUT "\n      ";
+                 ++$wroteline;
+             }
+         }
+         print OUT "\n";
+         ++$wroteline;
+      }
+      $totlines = $line;
+      print OUT "\\end{alltt}\n\\end{small}\n";
+      close(SRC);
+      print "$inline lines\n";
+      $wroteline += 2;
+   } elsif ($_ =~ m/@\d+,.+@/) {
+     # line contains [number,text]
+     # e.g. @14,for (ix = 0)@
+     $txt = $_;
+     while ($txt =~ m/@\d+,.+@/) {
+        @m = split("@",$txt);      # splits into text, one, two
+        @parms = split(",",$m[1]);  # splits one,two into two elements
+
+        # now search from $parms[0] down for $parms[1]
+        $found1 = 0;
+        $found2 = 0;
+        for ($i = $parms[0]; $i < $totlines && $found1 == 0; $i++) {
+           if ($text[$i] =~ m/\Q$parms[1]\E/) {
+              $foundline1 = $i + 1;
+              $found1 = 1;
+           }
+        }
+
+        # now search backwards
+        for ($i = $parms[0] - 1; $i >= 0 && $found2 == 0; $i--) {
+           if ($text[$i] =~ m/\Q$parms[1]\E/) {
+              $foundline2 = $i + 1;
+              $found2 = 1;
+           }
+        }
+
+        # now use the closest match or the first if tied
+        if ($found1 == 1 && $found2 == 0) {
+           $found = 1;
+           $foundline = $foundline1;
+        } elsif ($found1 == 0 && $found2 == 1) {
+           $found = 1;
+           $foundline = $foundline2;
+        } elsif ($found1 == 1 && $found2 == 1) {
+           $found = 1;
+           if (($foundline1 - $parms[0]) <= ($parms[0] - $foundline2)) {
+              $foundline = $foundline1;
+           } else {
+              $foundline = $foundline2;
+           }
+        } else {
+           $found = 0;
+        }
+
+        # if found replace
+        if ($found == 1) {
+           $delta = $parms[0] - $foundline;
+           print "Found replacement tag for \"$parms[1]\" on line $srcline which refers to line $foundline (delta $delta)\n";
+           $_ =~ s/@\Q$m[1]\E@/$foundline/;
+        } else {
+           print "ERROR:  The tag \"$parms[1]\" on line $srcline was not found in the most recently parsed source!\n";
+        }
+
+        # remake the rest of the line
+        $cnt = @m;
+        $txt = "";
+        for ($i = 2; $i < $cnt; $i++) {
+            $txt = $txt . $m[$i] . "@";
+        }
+     }
+     print OUT $_;
+     ++$wroteline;
+   } elsif ($_ =~ /~.+~/) {
+      # line contains a ~text~ pair used to refer to indexing :-)
+      $txt = $_;
+      while ($txt =~ /~.+~/) {
+         @m = split("~", $txt);
+
+         # word is the second position
+         $word = $m[1];
+         $a = $index1{$word};
+         $b = $index2{$word};
+         $c = $index3{$word};
+
+         # if chapter (a) is zero it wasn't found
+         if ($a == 0) {
+            print "ERROR: the tag \"$word\" on line $srcline was not found previously marked.\n";
+         } else {
+            # format the tag as x, x.y or x.y.z depending on the values
+            $str = $a;
+            $str = $str . ".$b" if ($b != 0);
+            $str = $str . ".$c" if ($c != 0);
+
+            if ($b == 0 && $c == 0) {
+               # its a chapter: chapters one to ten are spelled out,
+               # later ones keep their number
+               if ($a <= 10) {
+                  $str = "chapter " . $chapternames[$a];
+               } else {
+                  $str = "chapter " . $str;
+               }
+            } else {
+               $str = "section " . $str     if ($b != 0 && $c == 0);
+               $str = "sub-section " . $str if ($b != 0 && $c != 0);
+            }
+
+            #substitute
+            $_ =~ s/~\Q$word\E~/$str/;
+
+            print "Found replacement tag for marker \"$word\" on line $srcline which refers to $str\n";
+         }
+
+         # remake rest of the line
+         $cnt = @m;
+         $txt = "";
+         for ($i = 2; $i < $cnt; $i++) {
+             $txt = $txt . $m[$i] . "~";
+         }
+      }
+      print OUT $_;
+      ++$wroteline;
+   } elsif ($_ =~ m/FIGU/) {
+      # FIGU,file,caption
+      chomp($_);
+      @m = split(",", $_);
+      print OUT "\\begin{center}\n\\begin{figure}[here]\n\\includegraphics{pics/$m[1]$graph}\n";
+      print OUT "\\caption{$m[2]}\n\\end{figure}\n\\end{center}\n";
+      $wroteline += 4;
+   } else {
+      # ordinary text is copied through unchanged
+      print OUT $_;
+      ++$wroteline;
+   }
+}
+print "Read $readline lines, wrote $wroteline lines\n";
+
+close (OUT);
+close (IN);
--- a/changes.txt
+++ b/changes.txt
@ -1,3 +1,37 @@
+May 17th, 2003
+v0.17  -- Benjamin Goldberg submitted optimized mp_add and mp_sub routines.  A new gen.pl as well
+          as several smaller suggestions.  Thanks!
+       -- removed call to mp_cmp in inner loop of mp_div and put mp_cmp_mag in its place :-)
+       -- Fixed bug in mp_exptmod that would cause it to fail for odd moduli when DIGIT_BIT != 28
+       -- mp_exptmod now also returns errors if the modulus is negative and will handle negative exponents
+       -- mp_prime_is_prime will now return true if the input is one of the primes in the prime table
+       -- Damian M Gryski (dgryski@uwaterloo.ca) found an index out of bounds error in the 
+          fast_s_mp_mul_high_digs function which didn't come up before.  (fixed) 
+       -- Refactored the DR reduction code so there is only one function per file.
+       -- Fixed bug in mp_mul() which would erroneously avoid the faster multiplier [comba] when it was
+          allowed.  The bug did not produce incorrect values, it only made the multiplication less efficient (fixed)
+       -- Fixed similar bug in the Montgomery reduction code.
+       -- Added tons of (mp_digit) casts so the 7/15/28/31 bit digit code will work flawlessly out of the box. 
+          Also added limited support for 64-bit machines with a 60-bit digit.  Both thanks to Tom Wu (tom@arcot.com)
+       -- Added new comments here and there, cleaned up some code [style stuff]
+       -- Fixed a lingering typo in mp_exptmod* that would set bitcnt to zero then one.  Very silly stuff :-)
+       -- Fixed up mp_exptmod_fast so it would set "redux" to the comba Montgomery reduction if allowed.  This
+          saves quite a few calls and if statements.
+       -- Added etc/mont.c a test of the Montgomery reduction [assuming all else works :-| ]
+       -- Fixed up etc/tune.c to use a wider test range [more appropriate] also added an x86 based addition which
+          uses RDTSC for high precision timing.  
+       -- Updated demo/demo.c to remove MPI stuff [won't work anyways], made the tests run for 2 seconds each so it's 
+          not so insanely slow.  Also made the output space delimited [and fixed up various errors]
+       -- Added logs directory, logs/graph.dem which will use gnuplot to make a series of PNG files 
+          that go with the pre-made index.html.  You have to build [via make timing] and run ltmtest first in the 
+          root of the package.
+       -- Fixed a bug in mp_sub and mp_add where "-a - -a" or "-a + a" would produce -0 as the result [obviously invalid].  
+       -- Fixed a bug in mp_rshd.  If the count == a.used it should zero/return [instead of shifting]
+       -- Fixed an "off-by-one" bug in mp_mul_2d.  The initial size check on alloc would be off by one if the residue
+          shifting caused a carry.  
+       -- Fixed a bug where s_mp_mul_digs() would not call the Comba based routine if allowed.  This made Barrett reduction
+          slower than it had to be.
+          
 Mar 29th, 2003
 v0.16  -- Sped up mp_div by making normalization one shift call
       -- Sped up mp_mul_2d/mp_div_2d by aliasing pointers :-)
--- a/demo/demo.c
+++ b/demo/demo.c
@ -1,21 +1,6 @@
 #include <time.h>

-
-#ifdef U_MPI
-#include <stdio.h>
-#include <string.h>
-#include <stdlib.h>
-#include <ctype.h>
-#include <limits.h>
-   #include "mpi.h"
-   #ifdef _MSC_VER
-      typedef __int64            ulong64;
-   #else
-      typedef unsigned long long ulong64;
-   #endif   
-#else   
-   #include "tommath.h"
-#endif
+#include "tommath.h"

 #ifdef TIMER
 ulong64 _tt;
@ -23,19 +8,11 @@ void reset(void) { _tt = clock(); }
 ulong64 rdtsc(void) { return clock() - _tt; }
 #endif

-#ifndef DEBUG
-int _ifuncs;
-#else
-extern int _ifuncs;
-extern void dump_timings(void);
-extern void reset_timings(void);
-#endif
-   
 void ndraw(mp_int *a, char *name)
 {
   char buf[4096];
   printf("%s: ", name);
-   mp_toradix(a, buf, 10);
+   mp_toradix(a, buf, 64);
   printf("%s\n", buf);
 }

@ -56,31 +33,13 @@ int lbit(void)
      lfsr <<= 1;
      return 0;
   }
-}   
-     
-#ifdef U_MPI
-int mp_reduce_setup(mp_int *a, mp_int *b)
-{
-   int res;
-   
-   mp_set(a, 1);
-   if ((res = s_mp_lshd(a, b->used * 2)) != MP_OKAY) {
-      return res;
-   }
-   return mp_div(a, b, a, NULL);
 }

-int mp_rand(mp_int *a, int c)
-{
-   long z = abs(rand()) & 65535;
-   mp_set(a, z?z:1);
-   while (c--) {
-      s_mp_lshd(a, 1);
-      mp_add_d(a, abs(rand()), a);
-   }
-   return MP_OKAY;
-}
-#endif
+
+/* Statement repeaters used by the timing loops: each level doubles the
+ * previous one, so DO(x) expands x sixteen times (2 * 2 * 2 * 2).
+ * The expansions are bare statement sequences rather than one statement,
+ * so they must only be used inside a braced block.
+ */
+#define DO2(x) x; x;
+#define DO4(x) DO2(x); DO2(x);
+#define DO8(x) DO4(x); DO4(x);
+#define DO(x)  DO8(x); DO8(x);

   char cmd[4096], buf[4096];
 int main(void)
@ -89,12 +48,12 @@ int main(void)
   unsigned long expt_n, add_n, sub_n, mul_n, div_n, sqr_n, mul2d_n, div2d_n, gcd_n, lcm_n, inv_n,
                 div2_n, mul2_n;
   unsigned rr;
-   int cnt, ix;
+   int cnt, ix, old_kara_m, old_kara_s;

 #ifdef TIMER
   int n;
   ulong64 tt;
-   FILE *log;
+   FILE *log, *logb;
 #endif

   mp_init(&a);
@ -102,11 +61,11 @@ int main(void)
   mp_init(&c);
   mp_init(&d);
   mp_init(&e);
-   mp_init(&f);
-   
+   mp_init(&f);   
+
 /* test the DR reduction */
 #if 0
-   
+
   srand(time(NULL));
   for (cnt = 2; cnt < 32; cnt++) {
       printf("%d digit modulus\n", cnt);
@ -117,89 +76,103 @@ int main(void)
       }
       a.used = cnt;
       mp_prime_next_prime(&a, 3);
-       
+
       mp_rand(&b, cnt - 1);
       mp_copy(&b, &c);
-   
+
      rr = 0;
      do {
         if (!(rr & 127)) { printf("%9lu\r", rr); fflush(stdout); }
         mp_sqr(&b, &b); mp_add_d(&b, 1, &b);
         mp_copy(&b, &c);
-      
+
         mp_mod(&b, &a, &b);
         mp_dr_reduce(&c, &a, (1<<DIGIT_BIT)-a.dp[0]);
-      
+
         if (mp_cmp(&b, &c) != MP_EQ) {
            printf("Failed on trial %lu\n", rr); exit(-1);
         }
-      } while (++rr < 1000000); 
+      } while (++rr < 1000000);
      printf("Passed DR test for %d digits\n", cnt);
   }
-#endif   
+#endif

 #ifdef TIMER
      printf("CLOCKS_PER_SEC == %lu\n", CLOCKS_PER_SEC);
-goto sqrtime;      

-      log = fopen("add.log", "w");
-      for (cnt = 4; cnt <= 128; cnt += 4) {
+      log = fopen("logs/add.log", "w");
+      for (cnt = 8; cnt <= 128; cnt += 8) {
         mp_rand(&a, cnt);
         mp_rand(&b, cnt);
         reset();
-         for (rr = 0; rr < 10000000; rr++) {
-             mp_add(&a, &b, &c);
-         }
+         rr = 0;
+         do { 
+            DO(mp_add(&a,&b,&c));
+            rr += 16;
+         } while (rdtsc() < (CLOCKS_PER_SEC * 2));
         tt = rdtsc();
         printf("Adding\t\t%4d-bit => %9llu/sec, %9llu ticks\n", mp_count_bits(&a), (((unsigned long long)rr)*CLOCKS_PER_SEC)/tt, tt);
-         fprintf(log, "%d,%9llu\n", cnt, (((unsigned long long)rr)*CLOCKS_PER_SEC)/tt);
+         fprintf(log, "%d %9llu\n", cnt*DIGIT_BIT, (((unsigned long long)rr)*CLOCKS_PER_SEC)/tt);
      }
      fclose(log);
- 
-      log = fopen("sub.log", "w");
-      for (cnt = 4; cnt <= 128; cnt += 4) {
+
+      log = fopen("logs/sub.log", "w");
+      for (cnt = 8; cnt <= 128; cnt += 8) {
         mp_rand(&a, cnt);
         mp_rand(&b, cnt);
         reset();
-         for (rr = 0; rr < 10000000; rr++) {
-             mp_sub(&a, &b, &c);
-         }
+         rr = 0;
+         do { 
+            DO(mp_sub(&a,&b,&c));
+            rr += 16;
+         } while (rdtsc() < (CLOCKS_PER_SEC * 2));
         tt = rdtsc();
         printf("Subtracting\t\t%4d-bit => %9llu/sec, %9llu ticks\n", mp_count_bits(&a), (((unsigned long long)rr)*CLOCKS_PER_SEC)/tt, tt);
-         fprintf(log, "%d,%9llu\n", cnt, (((unsigned long long)rr)*CLOCKS_PER_SEC)/tt);
+         fprintf(log, "%d %9llu\n", cnt*DIGIT_BIT, (((unsigned long long)rr)*CLOCKS_PER_SEC)/tt);
      }
      fclose(log);
-      

-sqrtime:   
-   log = fopen("sqr.log", "w");
-   for (cnt = 4; cnt <= 128; cnt += 4) {
-      mp_rand(&a, cnt);
-      reset();
-      for (rr = 0; rr < 250000; rr++) {
-          mp_sqr(&a, &b);
-      }
-      tt = rdtsc();
-      printf("Squaring\t%4d-bit => %9llu/sec, %9llu ticks\n", mp_count_bits(&a), (((unsigned long long)rr)*CLOCKS_PER_SEC)/tt, tt);
-      fprintf(log, "%d,%9llu\n", cnt, (((unsigned long long)rr)*CLOCKS_PER_SEC)/tt);
-   }
-   fclose(log);
-   
-   log = fopen("mult.log", "w");
-   for (cnt = 4; cnt <= 128; cnt += 4) {
-      mp_rand(&a, cnt);
-      mp_rand(&b, cnt);
-      reset();
-      for (rr = 0; rr < 250000; rr++) {
-          mp_mul(&a, &b, &c);
-      }
-      tt = rdtsc();
-      printf("Multiplying\t%4d-bit => %9llu/sec, %9llu ticks\n", mp_count_bits(&a), (((unsigned long long)rr)*CLOCKS_PER_SEC)/tt, tt);
-      fprintf(log, "%d,%9llu\n", cnt, (((unsigned long long)rr)*CLOCKS_PER_SEC)/tt);
-   }
-   fclose(log);
+   /* do mult/square twice, first without karatsuba and second with */
+   old_kara_m = KARATSUBA_MUL_CUTOFF;
+   old_kara_s = KARATSUBA_SQR_CUTOFF;
+   for (ix = 0; ix < 2; ix++) {
+      printf("With%s Karatsuba\n", (ix==0)?"out":"");
+
+      KARATSUBA_MUL_CUTOFF = (ix==0)?9999:old_kara_m;
+      KARATSUBA_SQR_CUTOFF = (ix==0)?9999:old_kara_s;
+
+      log = fopen((ix==0)?"logs/sqr.log":"logs/sqr_kara.log", "w");
+      for (cnt = 32; cnt <= 288; cnt += 16) {
+         mp_rand(&a, cnt);
+         reset();
+         rr = 0;
+         do {
+            DO(mp_sqr(&a, &b));
+            rr += 16;
+         } while (rdtsc() < (CLOCKS_PER_SEC * 2));
+         tt = rdtsc();
+         printf("Squaring\t%4d-bit => %9llu/sec, %9llu ticks\n", mp_count_bits(&a), (((unsigned long long)rr)*CLOCKS_PER_SEC)/tt, tt);
+         fprintf(log, "%d %9llu\n", cnt*DIGIT_BIT, (((unsigned long long)rr)*CLOCKS_PER_SEC)/tt);
+      }
+      fclose(log);
+
+      log = fopen((ix==0)?"logs/mult.log":"logs/mult_kara.log", "w");
+      for (cnt = 32; cnt <= 288; cnt += 16) {
+         mp_rand(&a, cnt);
+         mp_rand(&b, cnt);
+         reset();
+         rr = 0;
+         do {
+            DO(mp_mul(&a, &b, &c));
+            rr += 16;
+         } while (rdtsc() < (CLOCKS_PER_SEC * 2));
+         tt = rdtsc();
+         printf("Multiplying\t%4d-bit => %9llu/sec, %9llu ticks\n", mp_count_bits(&a), (((unsigned long long)rr)*CLOCKS_PER_SEC)/tt, tt);
+         fprintf(log, "%d %9llu\n", cnt*DIGIT_BIT, (((unsigned long long)rr)*CLOCKS_PER_SEC)/tt);
+      }
+      fclose(log);
+   }

-expttime:
   {
      char *primes[] = {
         /* DR moduli */
@ -210,7 +183,7 @@ expttime:
         "542189391331696172661670440619180536749994166415993334151601745392193484590296600979602378676624808129613777993466242203025054573692562689251250471628358318743978285860720148446448885701001277560572526947619392551574490839286458454994488665744991822837769918095117129546414124448777033941223565831420390846864429504774477949153794689948747680362212954278693335653935890352619041936727463717926744868338358149568368643403037768649616778526013610493696186055899318268339432671541328195724261329606699831016666359440874843103020666106568222401047720269951530296879490444224546654729111504346660859907296364097126834834235287147",
         "1487259134814709264092032648525971038895865645148901180585340454985524155135260217788758027400478312256339496385275012465661575576202252063145698732079880294664220579764848767704076761853197216563262660046602703973050798218246170835962005598561669706844469447435461092542265792444947706769615695252256130901271870341005768912974433684521436211263358097522726462083917939091760026658925757076733484173202927141441492573799914240222628795405623953109131594523623353044898339481494120112723445689647986475279242446083151413667587008191682564376412347964146113898565886683139407005941383669325997475076910488086663256335689181157957571445067490187939553165903773554290260531009121879044170766615232300936675369451260747671432073394867530820527479172464106442450727640226503746586340279816318821395210726268291535648506190714616083163403189943334431056876038286530365757187367147446004855912033137386225053275419626102417236133948503",
         "1095121115716677802856811290392395128588168592409109494900178008967955253005183831872715423151551999734857184538199864469605657805519106717529655044054833197687459782636297255219742994736751541815269727940751860670268774903340296040006114013971309257028332849679096824800250742691718610670812374272414086863715763724622797509437062518082383056050144624962776302147890521249477060215148275163688301275847155316042279405557632639366066847442861422164832655874655824221577849928863023018366835675399949740429332468186340518172487073360822220449055340582568461568645259954873303616953776393853174845132081121976327462740354930744487429617202585015510744298530101547706821590188733515880733527449780963163909830077616357506845523215289297624086914545378511082534229620116563260168494523906566709418166011112754529766183554579321224940951177394088465596712620076240067370589036924024728375076210477267488679008016579588696191194060127319035195370137160936882402244399699172017835144537488486396906144217720028992863941288217185353914991583400421682751000603596655790990815525126154394344641336397793791497068253936771017031980867706707490224041075826337383538651825493679503771934836094655802776331664261631740148281763487765852746577808019633679",
-         
+
         /* generic unrestricted moduli */
         "17933601194860113372237070562165128350027320072176844226673287945873370751245439587792371960615073855669274087805055507977323024886880985062002853331424203",
         "2893527720709661239493896562339544088620375736490408468011883030469939904368086092336458298221245707898933583190713188177399401852627749210994595974791782790253946539043962213027074922559572312141181787434278708783207966459019479487",
@ -219,9 +192,10 @@ expttime:
         "436463808505957768574894870394349739623346440601945961161254440072143298152040105676491048248110146278752857839930515766167441407021501229924721335644557342265864606569000117714935185566842453630868849121480179691838399545644365571106757731317371758557990781880691336695584799313313687287468894148823761785582982549586183756806449017542622267874275103877481475534991201849912222670102069951687572917937634467778042874315463238062009202992087620963771759666448266532858079402669920025224220613419441069718482837399612644978839925207109870840278194042158748845445131729137117098529028886770063736487420613144045836803985635654192482395882603511950547826439092832800532152534003936926017612446606135655146445620623395788978726744728503058670046885876251527122350275750995227",
         "11424167473351836398078306042624362277956429440521137061889702611766348760692206243140413411077394583180726863277012016602279290144126785129569474909173584789822341986742719230331946072730319555984484911716797058875905400999504305877245849119687509023232790273637466821052576859232452982061831009770786031785669030271542286603956118755585683996118896215213488875253101894663403069677745948305893849505434201763745232895780711972432011344857521691017896316861403206449421332243658855453435784006517202894181640562433575390821384210960117518650374602256601091379644034244332285065935413233557998331562749140202965844219336298970011513882564935538704289446968322281451907487362046511461221329799897350993370560697505809686438782036235372137015731304779072430260986460269894522159103008260495503005267165927542949439526272736586626709581721032189532726389643625590680105784844246152702670169304203783072275089194754889511973916207",
         "1214855636816562637502584060163403830270705000634713483015101384881871978446801224798536155406895823305035467591632531067547890948695117172076954220727075688048751022421198712032848890056357845974246560748347918630050853933697792254955890439720297560693579400297062396904306270145886830719309296352765295712183040773146419022875165382778007040109957609739589875590885701126197906063620133954893216612678838507540777138437797705602453719559017633986486649523611975865005712371194067612263330335590526176087004421363598470302731349138773205901447704682181517904064735636518462452242791676541725292378925568296858010151852326316777511935037531017413910506921922450666933202278489024521263798482237150056835746454842662048692127173834433089016107854491097456725016327709663199738238442164843147132789153725513257167915555162094970853584447993125488607696008169807374736711297007473812256272245489405898470297178738029484459690836250560495461579533254473316340608217876781986188705928270735695752830825527963838355419762516246028680280988020401914551825487349990306976304093109384451438813251211051597392127491464898797406789175453067960072008590614886532333015881171367104445044718144312416815712216611576221546455968770801413440778423979",
-         NULL         
+         NULL
      };
-   log = fopen("expt.log", "w");
+   log = fopen("logs/expt.log", "w");
+   logb = fopen("logs/expt_dr.log", "w");
   for (n = 0; primes[n]; n++) {
      mp_read_radix(&a, primes[n], 10);
      mp_zero(&b);
@ -234,9 +208,11 @@ expttime:
      mp_mod(&b, &c, &b);
      mp_set(&c, 3);
      reset();
-      for (rr = 0; rr < 50; rr++) {
-          mp_exptmod(&c, &b, &a, &d);
-      }
+      rr = 0;
+      do {
+         DO(mp_exptmod(&c, &b, &a, &d));
+         rr += 16;
+      } while (rdtsc() < (CLOCKS_PER_SEC * 2));
      tt = rdtsc();
      mp_sub_d(&a, 1, &e);
      mp_sub(&e, &b, &b);
@ -248,25 +224,28 @@ expttime:
         exit(0);
      }
      printf("Exponentiating\t%4d-bit => %9llu/sec, %9llu ticks\n", mp_count_bits(&a), (((unsigned long long)rr)*CLOCKS_PER_SEC)/tt, tt);
-      fprintf(log, "%d,%9llu\n", cnt, (((unsigned long long)rr)*CLOCKS_PER_SEC)/tt);
+      fprintf((n < 7) ? logb : log, "%d %9llu\n", mp_count_bits(&a), (((unsigned long long)rr)*CLOCKS_PER_SEC)/tt);
+   }
   }
-   }   
   fclose(log);
+   fclose(logb);

-   log = fopen("invmod.log", "w");
+   log = fopen("logs/invmod.log", "w");
   for (cnt = 4; cnt <= 128; cnt += 4) {
      mp_rand(&a, cnt);
      mp_rand(&b, cnt);
-      
+
      do {
         mp_add_d(&b, 1, &b);
         mp_gcd(&a, &b, &c);
      } while (mp_cmp_d(&c, 1) != MP_EQ);
-      
+
      reset();
-      for (rr = 0; rr < 10000; rr++) {
-          mp_invmod(&b, &a, &c);
-      }
+      rr = 0;
+      do {
+         DO(mp_invmod(&b, &a, &c));
+         rr += 16;
+      } while (rdtsc() < (CLOCKS_PER_SEC * 2));
      tt = rdtsc();
      mp_mulmod(&b, &c, &a, &d);
      if (mp_cmp_d(&d, 1) != MP_EQ) {
@ -274,18 +253,18 @@ expttime:
         return 0;
      }
      printf("Inverting mod\t%4d-bit => %9llu/sec, %9llu ticks\n", mp_count_bits(&a), (((unsigned long long)rr)*CLOCKS_PER_SEC)/tt, tt);
-      fprintf(log, "%d,%9llu\n", cnt, (((unsigned long long)rr)*CLOCKS_PER_SEC)/tt);
+      fprintf(log, "%d %9llu\n", cnt*DIGIT_BIT, (((unsigned long long)rr)*CLOCKS_PER_SEC)/tt);
   }
   fclose(log);
-   
+
   return 0;
-  
+
 #endif

-   div2_n = mul2_n = inv_n = expt_n = lcm_n = gcd_n = add_n = 
+   div2_n = mul2_n = inv_n = expt_n = lcm_n = gcd_n = add_n =
   sub_n = mul_n = div_n = sqr_n = mul2d_n = div2d_n = cnt = 0;
+
   for (;;) {
- 
       /* randomly clear and re-init one variable, this has the affect of triming the alloc space */
       switch (abs(rand()) % 7) {
           case 0:  mp_clear(&a); mp_init(&a); break;
@ -296,17 +275,17 @@ expttime:
           case 5:  mp_clear(&f); mp_init(&f); break;
           case 6:  break; /* don't clear any */
       }
-   
-   
+
+
       printf("%7lu/%7lu/%7lu/%7lu/%7lu/%7lu/%7lu/%7lu/%7lu/%7lu/%7lu/%7lu/%7lu ", add_n, sub_n, mul_n, div_n, sqr_n, mul2d_n, div2d_n, gcd_n, lcm_n, expt_n, inv_n, div2_n, mul2_n);
       fgets(cmd, 4095, stdin);
       cmd[strlen(cmd)-1] = 0;
       printf("%s  ]\r",cmd); fflush(stdout);
-       if (!strcmp(cmd, "mul2d")) { ++mul2d_n; 
-          fgets(buf, 4095, stdin); mp_read_radix(&a, buf, 10);
+       if (!strcmp(cmd, "mul2d")) { ++mul2d_n;
+          fgets(buf, 4095, stdin); mp_read_radix(&a, buf, 64);
          fgets(buf, 4095, stdin); sscanf(buf, "%d", &rr);
-          fgets(buf, 4095, stdin); mp_read_radix(&b, buf, 10);
-          
+          fgets(buf, 4095, stdin); mp_read_radix(&b, buf, 64);
+
          mp_mul_2d(&a, rr, &a);
          a.sign = b.sign;
          if (mp_cmp(&a, &b) != MP_EQ) {
@ -315,11 +294,11 @@ expttime:
             draw(&b);
             return 0;
          }
-       } else if (!strcmp(cmd, "div2d")) { ++div2d_n; 
-          fgets(buf, 4095, stdin); mp_read_radix(&a, buf, 10);
+       } else if (!strcmp(cmd, "div2d")) { ++div2d_n;
+          fgets(buf, 4095, stdin); mp_read_radix(&a, buf, 64);
          fgets(buf, 4095, stdin); sscanf(buf, "%d", &rr);
-          fgets(buf, 4095, stdin); mp_read_radix(&b, buf, 10);
-          
+          fgets(buf, 4095, stdin); mp_read_radix(&b, buf, 64);
+
          mp_div_2d(&a, rr, &a, &e);
          a.sign = b.sign;
          if (a.used == b.used && a.used == 0) { a.sign = b.sign = MP_ZPOS; }
@ -330,19 +309,19 @@ expttime:
             return 0;
          }
       } else if (!strcmp(cmd, "add")) { ++add_n;
-          fgets(buf, 4095, stdin);  mp_read_radix(&a, buf, 10);
-          fgets(buf, 4095, stdin);  mp_read_radix(&b, buf, 10);
-          fgets(buf, 4095, stdin);  mp_read_radix(&c, buf, 10);
+          fgets(buf, 4095, stdin);  mp_read_radix(&a, buf, 64);
+          fgets(buf, 4095, stdin);  mp_read_radix(&b, buf, 64);
+          fgets(buf, 4095, stdin);  mp_read_radix(&c, buf, 64);
          mp_copy(&a, &d);
          mp_add(&d, &b, &d);
          if (mp_cmp(&c, &d) != MP_EQ) {
-             printf("add %lu failure!\n", add_n); 
-draw(&a);draw(&b);draw(&c);draw(&d);             
+             printf("add %lu failure!\n", add_n);
+draw(&a);draw(&b);draw(&c);draw(&d);
             return 0;
          }
-          
+
          /* test the sign/unsigned storage functions */
-          
+
          rr = mp_signed_bin_size(&c);
          mp_to_signed_bin(&c, (unsigned char *)cmd);
          memset(cmd+rr, rand()&255, sizeof(cmd)-rr);
@ -353,8 +332,8 @@ draw(&a);draw(&b);draw(&c);draw(&d);
             draw(&d);
             return 0;
          }
-                    
-          
+
+
          rr = mp_unsigned_bin_size(&c);
          mp_to_unsigned_bin(&c, (unsigned char *)cmd);
          memset(cmd+rr, rand()&255, sizeof(cmd)-rr);
@ -367,90 +346,90 @@ draw(&a);draw(&b);draw(&c);draw(&d);
          }

       } else if (!strcmp(cmd, "sub")) { ++sub_n;
-          fgets(buf, 4095, stdin);  mp_read_radix(&a, buf, 10);
-          fgets(buf, 4095, stdin);  mp_read_radix(&b, buf, 10);
-          fgets(buf, 4095, stdin);  mp_read_radix(&c, buf, 10);
+          fgets(buf, 4095, stdin);  mp_read_radix(&a, buf, 64);
+          fgets(buf, 4095, stdin);  mp_read_radix(&b, buf, 64);
+          fgets(buf, 4095, stdin);  mp_read_radix(&c, buf, 64);
          mp_copy(&a, &d);
          mp_sub(&d, &b, &d);
          if (mp_cmp(&c, &d) != MP_EQ) {
-             printf("sub %lu failure!\n", sub_n); 
-draw(&a);draw(&b);draw(&c);draw(&d);             
+             printf("sub %lu failure!\n", sub_n);
+draw(&a);draw(&b);draw(&c);draw(&d);
             return 0;
          }
       } else if (!strcmp(cmd, "mul")) { ++mul_n;
-          fgets(buf, 4095, stdin);  mp_read_radix(&a, buf, 10);
-          fgets(buf, 4095, stdin);  mp_read_radix(&b, buf, 10);
-          fgets(buf, 4095, stdin);  mp_read_radix(&c, buf, 10);
+          fgets(buf, 4095, stdin);  mp_read_radix(&a, buf, 64);
+          fgets(buf, 4095, stdin);  mp_read_radix(&b, buf, 64);
+          fgets(buf, 4095, stdin);  mp_read_radix(&c, buf, 64);
          mp_copy(&a, &d);
          mp_mul(&d, &b, &d);
          if (mp_cmp(&c, &d) != MP_EQ) {
-             printf("mul %lu failure!\n", mul_n); 
-draw(&a);draw(&b);draw(&c);draw(&d);             
+             printf("mul %lu failure!\n", mul_n);
+draw(&a);draw(&b);draw(&c);draw(&d);
             return 0;
          }
       } else if (!strcmp(cmd, "div")) { ++div_n;
-          fgets(buf, 4095, stdin); mp_read_radix(&a, buf, 10);
-          fgets(buf, 4095, stdin); mp_read_radix(&b, buf, 10);
-          fgets(buf, 4095, stdin); mp_read_radix(&c, buf, 10);
-          fgets(buf, 4095, stdin); mp_read_radix(&d, buf, 10);
-          
+          fgets(buf, 4095, stdin); mp_read_radix(&a, buf, 64);
+          fgets(buf, 4095, stdin); mp_read_radix(&b, buf, 64);
+          fgets(buf, 4095, stdin); mp_read_radix(&c, buf, 64);
+          fgets(buf, 4095, stdin); mp_read_radix(&d, buf, 64);
+
          mp_div(&a, &b, &e, &f);
          if (mp_cmp(&c, &e) != MP_EQ || mp_cmp(&d, &f) != MP_EQ) {
-             printf("div %lu failure!\n", div_n); 
+             printf("div %lu failure!\n", div_n);
 draw(&a);draw(&b);draw(&c);draw(&d); draw(&e); draw(&f);
             return 0;
          }
-          
+
       } else if (!strcmp(cmd, "sqr")) { ++sqr_n;
-          fgets(buf, 4095, stdin);  mp_read_radix(&a, buf, 10);
-          fgets(buf, 4095, stdin);  mp_read_radix(&b, buf, 10);
+          fgets(buf, 4095, stdin);  mp_read_radix(&a, buf, 64);
+          fgets(buf, 4095, stdin);  mp_read_radix(&b, buf, 64);
          mp_copy(&a, &c);
          mp_sqr(&c, &c);
          if (mp_cmp(&b, &c) != MP_EQ) {
-             printf("sqr %lu failure!\n", sqr_n); 
+             printf("sqr %lu failure!\n", sqr_n);
 draw(&a);draw(&b);draw(&c);
             return 0;
          }
       } else if (!strcmp(cmd, "gcd")) { ++gcd_n;
-          fgets(buf, 4095, stdin);  mp_read_radix(&a, buf, 10);
-          fgets(buf, 4095, stdin);  mp_read_radix(&b, buf, 10);
-          fgets(buf, 4095, stdin);  mp_read_radix(&c, buf, 10);
+          fgets(buf, 4095, stdin);  mp_read_radix(&a, buf, 64);
+          fgets(buf, 4095, stdin);  mp_read_radix(&b, buf, 64);
+          fgets(buf, 4095, stdin);  mp_read_radix(&c, buf, 64);
          mp_copy(&a, &d);
          mp_gcd(&d, &b, &d);
          d.sign = c.sign;
          if (mp_cmp(&c, &d) != MP_EQ) {
-             printf("gcd %lu failure!\n", gcd_n); 
+             printf("gcd %lu failure!\n", gcd_n);
 draw(&a);draw(&b);draw(&c);draw(&d);
             return 0;
          }
       } else if (!strcmp(cmd, "lcm")) { ++lcm_n;
-             fgets(buf, 4095, stdin);  mp_read_radix(&a, buf, 10);
-             fgets(buf, 4095, stdin);  mp_read_radix(&b, buf, 10);
-             fgets(buf, 4095, stdin);  mp_read_radix(&c, buf, 10);
+             fgets(buf, 4095, stdin);  mp_read_radix(&a, buf, 64);
+             fgets(buf, 4095, stdin);  mp_read_radix(&b, buf, 64);
+             fgets(buf, 4095, stdin);  mp_read_radix(&c, buf, 64);
             mp_copy(&a, &d);
             mp_lcm(&d, &b, &d);
             d.sign = c.sign;
             if (mp_cmp(&c, &d) != MP_EQ) {
-                printf("lcm %lu failure!\n", lcm_n); 
+                printf("lcm %lu failure!\n", lcm_n);
   draw(&a);draw(&b);draw(&c);draw(&d);
                return 0;
             }
       } else if (!strcmp(cmd, "expt")) {  ++expt_n;
-             fgets(buf, 4095, stdin);  mp_read_radix(&a, buf, 10);
-             fgets(buf, 4095, stdin);  mp_read_radix(&b, buf, 10);
-             fgets(buf, 4095, stdin);  mp_read_radix(&c, buf, 10);
-             fgets(buf, 4095, stdin);  mp_read_radix(&d, buf, 10);
+             fgets(buf, 4095, stdin);  mp_read_radix(&a, buf, 64);
+             fgets(buf, 4095, stdin);  mp_read_radix(&b, buf, 64);
+             fgets(buf, 4095, stdin);  mp_read_radix(&c, buf, 64);
+             fgets(buf, 4095, stdin);  mp_read_radix(&d, buf, 64);
             mp_copy(&a, &e);
             mp_exptmod(&e, &b, &c, &e);
             if (mp_cmp(&d, &e) != MP_EQ) {
-                printf("expt %lu failure!\n", expt_n); 
+                printf("expt %lu failure!\n", expt_n);
   draw(&a);draw(&b);draw(&c);draw(&d); draw(&e);
                return 0;
             }
       } else if (!strcmp(cmd, "invmod")) {  ++inv_n;
-             fgets(buf, 4095, stdin);  mp_read_radix(&a, buf, 10);
-             fgets(buf, 4095, stdin);  mp_read_radix(&b, buf, 10);
-             fgets(buf, 4095, stdin);  mp_read_radix(&c, buf, 10);
+             fgets(buf, 4095, stdin);  mp_read_radix(&a, buf, 64);
+             fgets(buf, 4095, stdin);  mp_read_radix(&b, buf, 64);
+             fgets(buf, 4095, stdin);  mp_read_radix(&c, buf, 64);
             mp_invmod(&a, &b, &d);
             mp_mulmod(&d,&a,&b,&e);
             if (mp_cmp_d(&e, 1) != MP_EQ) {
@ -460,10 +439,10 @@ draw(&a);draw(&b);draw(&c);draw(&d);
                draw(&e);
                return 0;
             }
-                
+
       } else if (!strcmp(cmd, "div2")) { ++div2_n;
-             fgets(buf, 4095, stdin);  mp_read_radix(&a, buf, 10);
-             fgets(buf, 4095, stdin);  mp_read_radix(&b, buf, 10);
+             fgets(buf, 4095, stdin);  mp_read_radix(&a, buf, 64);
+             fgets(buf, 4095, stdin);  mp_read_radix(&b, buf, 64);
             mp_div_2(&a, &c);
             if (mp_cmp(&c, &b) != MP_EQ) {
                 printf("div_2 %lu failure\n", div2_n);
@ -473,8 +452,8 @@ draw(&a);draw(&b);draw(&c);draw(&d);
                 return 0;
             }
       } else if (!strcmp(cmd, "mul2")) { ++mul2_n;
-             fgets(buf, 4095, stdin);  mp_read_radix(&a, buf, 10);
-             fgets(buf, 4095, stdin);  mp_read_radix(&b, buf, 10);
+             fgets(buf, 4095, stdin);  mp_read_radix(&a, buf, 64);
+             fgets(buf, 4095, stdin);  mp_read_radix(&b, buf, 64);
             mp_mul_2(&a, &c);
             if (mp_cmp(&c, &b) != MP_EQ) {
                 printf("mul_2 %lu failure\n", mul2_n);
@ -483,9 +462,9 @@ draw(&a);draw(&b);draw(&c);draw(&d);
                 draw(&c);
                 return 0;
             }
-       }             
-       
+       }
+
   }
-   return 0;   
+   return 0;
 }

--- a/demo/test.c
+++ b/demo/test.c
--- a/etc/makefile
+++ b/etc/makefile
@ -1,23 +1,40 @@
 CFLAGS += -Wall -W -Wshadow -O3 -fomit-frame-pointer -funroll-loops -I../

-
 # default lib name (requires install with root)
 # LIBNAME=-ltommath

 # libname when you can't install the lib with install
 LIBNAME=../libtommath.a

+#provable primes
 pprime: pprime.o
 	$(CC) pprime.o $(LIBNAME) -o pprime

+# portable [well requires clock()] tuning app
 tune: tune.o
 	$(CC) tune.o $(LIBNAME) -o tune
+	
+# same app but using RDTSC for higher precision [requires 80586+], for COFF based gcc installs [e.g. mingw, cygwin, djgpp]
+tune86: tune.c
+	nasm -f coff timer.asm
+	$(CC) -DX86_TIMER $(CFLAGS) tune.c timer.o  $(LIBNAME) -o tune86
+	
+#make tune86 for linux or any ELF format
+tune86l: tune.c
+	nasm -f elf -DUSE_ELF timer.asm
+	$(CC) -DX86_TIMER $(CFLAGS) tune.c timer.o $(LIBNAME) -o tune86l
        
+# spits out mersenne primes
 mersenne: mersenne.o
 	$(CC) mersenne.o $(LIBNAME) -o mersenne

+# finds DR safe primes for the given config
 drprime: drprime.o
 	$(CC) drprime.o $(LIBNAME) -o drprime
+	
+mont: mont.o
+	$(CC) mont.o $(LIBNAME) -o mont
+
        
 clean:
-	rm -f *.log *.o *.obj *.exe pprime tune mersenne drprime
+	rm -f *.log *.o *.obj *.exe pprime tune mersenne drprime tune86 tune86l mont
--- a/etc/mont.c
+++ b/etc/mont.c
@ -0,0 +1,45 @@
+/* tests the montgomery routines */
+#include <tommath.h>
+
+/* Self-test for the Montgomery routines.
+ *
+ * For every modulus size from 4 to 127 digits this builds a random odd
+ * modulus, computes the normalization value R and the reduction constant mp,
+ * and then checks 100000 times that reducing R * p gives back p for a random
+ * p of x/2 digits.
+ *
+ * Returns 0 when every size passes.  A mismatch prints "FAILURE!" and a failed
+ * library call prints its own diagnostic; both end the run with status -1.
+ */
+int main(void)
+{
+   mp_int modulus, R, p, pp;
+   mp_digit mp;
+   long x, y;
+   int status = -1;   /* stays -1 unless every size passes */
+
+   if (mp_init_multi(&modulus, &R, &p, &pp, NULL) != MP_OKAY) {
+      printf("could not initialize the mp_ints\n");
+      return status;
+   }
+
+   /* loop through various sizes */
+   for (x = 4; x < 128; x++) {
+       printf("DIGITS == %3ld...", x); fflush(stdout);
+
+       /* make up the odd modulus; check mp_rand before touching dp[0] */
+       if (mp_rand(&modulus, x) != MP_OKAY) {
+          printf("could not generate the modulus\n");
+          goto done;
+       }
+       modulus.dp[0] |= 1;
+
+       /* now find the R value and the reduction constant */
+       if (mp_montgomery_calc_normalization(&R, &modulus) != MP_OKAY ||
+           mp_montgomery_setup(&modulus, &mp) != MP_OKAY) {
+          printf("could not set up the Montgomery reduction\n");
+          goto done;
+       }
+
+       /* now run through a bunch tests */
+       for (y = 0; y < 100000; y++) {
+           /* p = random, pp = R * p, then reduce pp in place */
+           if (mp_rand(&p, x/2) != MP_OKAY ||
+               mp_mul(&p, &R, &pp) != MP_OKAY ||
+               mp_montgomery_reduce(&pp, &modulus, mp) != MP_OKAY) {
+              printf("library call failed during the test\n");
+              goto done;
+           }
+
+           /* should be equal to p */
+           if (mp_cmp(&pp, &p) != MP_EQ) {
+              printf("FAILURE!\n");
+              goto done;
+           }
+       }
+       printf("PASSED\n");
+    }
+    status = 0;
+
+done:
+    /* release the temporaries on every path */
+    mp_clear(&modulus);
+    mp_clear(&R);
+    mp_clear(&p);
+    mp_clear(&pp);
+    return status;
+}
+
+
+
+
+
--- a/etc/timer.asm
+++ b/etc/timer.asm
@ -0,0 +1,37 @@
+; x86 timer in NASM
+;
+; Tom St Denis, tomstdenis@iahu.ca
+[bits 32]
+[section .data]
+time dd 0, 0
+
+[section .text]
+
+%ifdef USE_ELF
+[global t_start]
+t_start:                 ; t_start: save the current time-stamp counter in `time`
+%else
+[global _t_start]
+_t_start:                ; same routine, exported with a leading underscore when USE_ELF is not defined
+%endif
+   push edx              ; save EDX and EAX, which rdtsc overwrites
+   push eax
+   rdtsc                 ; EDX:EAX = current time-stamp counter
+   mov [time+0],edx      ; high dword of the start time
+   mov [time+4],eax      ; low dword of the start time
+   pop eax               ; restore the saved registers
+   pop edx
+   ret
+   
+%ifdef USE_ELF
+[global t_read]
+t_read:                  ; t_read: time-stamp ticks elapsed since t_start, left in EDX:EAX
+%else
+[global _t_read]
+_t_read:                 ; same routine, exported with a leading underscore when USE_ELF is not defined
+%endif
+   rdtsc                 ; EDX:EAX = current time-stamp counter
+   sub eax,[time+4]      ; subtract the low dword stored by t_start
+   sbb edx,[time+0]      ; subtract the stored high dword plus the borrow
+   ret
+   
--- a/etc/tune.c
+++ b/etc/tune.c
@ -5,10 +5,21 @@
 #include <tommath.h>
 #include <time.h>

-clock_t
+#ifndef X86_TIMER
+
+/* generic ISO C timer: t_start() records clock(), t_read() returns the clock() ticks elapsed since then */
+unsigned long long __T;   /* clock() value captured by t_start(); NOTE(review): names beginning with "__" are reserved for the implementation */
+void t_start(void) { __T = clock(); }   /* remember the current clock() reading */
+unsigned long long t_read(void) { return clock() - __T; }   /* ticks since the last t_start() */
+
+#else
+extern void t_start(void);
+extern unsigned long long t_read(void);
+#endif
+
+unsigned long long
 time_mult (void)
 {
-  clock_t t1;
  int     x, y;
  mp_int  a, b, c;

@ -16,137 +27,83 @@ time_mult (void)
  mp_init (&b);
  mp_init (&c);

-  t1 = clock ();
-  for (x = 4; x <= 144; x += 4) {
+  t_start();
+  for (x = 32; x <= 288; x += 4) {
    mp_rand (&a, x);
    mp_rand (&b, x);
-    for (y = 0; y < 10000; y++) {
+    for (y = 0; y < 100; y++) {
      mp_mul (&a, &b, &c);
    }
  }
  mp_clear (&a);
  mp_clear (&b);
  mp_clear (&c);
-  return clock () - t1;
+  return t_read();
 }

-clock_t
+unsigned long long
 time_sqr (void)
 {
-  clock_t t1;
  int     x, y;
  mp_int  a, b;

  mp_init (&a);
  mp_init (&b);

-  t1 = clock ();
-  for (x = 4; x <= 144; x += 4) {
+  t_start();
+  for (x = 32; x <= 288; x += 4) {
    mp_rand (&a, x);
-    for (y = 0; y < 10000; y++) {
+    for (y = 0; y < 100; y++) {
      mp_sqr (&a, &b);
    }
  }
  mp_clear (&a);
  mp_clear (&b);
-  return clock () - t1;
-}
-
-clock_t
-time_expt (void)
-{
-  clock_t t1;
-  int     x, y;
-  mp_int  a, b, c, d;
-
-  mp_init (&a);
-  mp_init (&b);
-  mp_init (&c);
-  mp_init (&d);
-
-  t1 = clock ();
-  for (x = 4; x <= 144; x += 4) {
-    mp_rand (&a, x);
-    mp_rand (&b, x);
-    mp_rand (&c, x);
-    if (mp_iseven (&c) != 0) {
-      mp_add_d (&c, 1, &c);
-    }
-    for (y = 0; y < 10; y++) {
-      mp_exptmod (&a, &b, &c, &d);
-    }
-  }
-  mp_clear (&d);
-  mp_clear (&c);
-  mp_clear (&b);
-  mp_clear (&a);
-
-  return clock () - t1;
+  return t_read();
 }

 int
 main (void)
 {
-  int     best_mult, best_square, best_exptmod;
-  clock_t best, ti;
+  int     best_mult, best_square;
+  unsigned long long best, ti;
  FILE   *log;

-  best_mult = best_square = best_exptmod = 0;
-
+  best_mult = best_square = 0;
  /* tune multiplication first */
  log = fopen ("mult.log", "w");
-  best = CLOCKS_PER_SEC * 1000;
-  for (KARATSUBA_MUL_CUTOFF = 8; KARATSUBA_MUL_CUTOFF <= 144; KARATSUBA_MUL_CUTOFF++) {
+  best = -1;
+  for (KARATSUBA_MUL_CUTOFF = 8; KARATSUBA_MUL_CUTOFF <= 200; KARATSUBA_MUL_CUTOFF++) {
    ti = time_mult ();
-    printf ("%4d : %9lu\r", KARATSUBA_MUL_CUTOFF, ti);
-    fprintf (log, "%d, %lu\n", KARATSUBA_MUL_CUTOFF, ti);
+    printf ("%4d : %9llu\r", KARATSUBA_MUL_CUTOFF, ti);
+    fprintf (log, "%d, %llu\n", KARATSUBA_MUL_CUTOFF, ti);
    fflush (stdout);
    if (ti < best) {
-      printf ("New best: %lu, %d         \n", ti, KARATSUBA_MUL_CUTOFF);
+      printf ("New best: %llu, %d         \n", ti, KARATSUBA_MUL_CUTOFF);
      best = ti;
      best_mult = KARATSUBA_MUL_CUTOFF;
    }
  }
  fclose (log);
-
  /* tune squaring */
  log = fopen ("sqr.log", "w");
-  best = CLOCKS_PER_SEC * 1000;
-  for (KARATSUBA_SQR_CUTOFF = 8; KARATSUBA_SQR_CUTOFF <= 144; KARATSUBA_SQR_CUTOFF++) {
+  best = -1;
+  for (KARATSUBA_SQR_CUTOFF = 8; KARATSUBA_SQR_CUTOFF <= 200; KARATSUBA_SQR_CUTOFF++) {
    ti = time_sqr ();
-    printf ("%4d : %9lu\r", KARATSUBA_SQR_CUTOFF, ti);
-    fprintf (log, "%d, %lu\n", KARATSUBA_SQR_CUTOFF, ti);
+    printf ("%4d : %9llu\r", KARATSUBA_SQR_CUTOFF, ti);
+    fprintf (log, "%d, %llu\n", KARATSUBA_SQR_CUTOFF, ti);
    fflush (stdout);
    if (ti < best) {
-      printf ("New best: %lu, %d         \n", ti, KARATSUBA_SQR_CUTOFF);
+      printf ("New best: %llu, %d         \n", ti, KARATSUBA_SQR_CUTOFF);
      best = ti;
      best_square = KARATSUBA_SQR_CUTOFF;
    }
  }
  fclose (log);

-  /* tune exptmod */
-  KARATSUBA_MUL_CUTOFF = best_mult;
-  KARATSUBA_SQR_CUTOFF = best_square;
-
-  log = fopen ("expt.log", "w");
-  best = CLOCKS_PER_SEC * 1000;
-  for (MONTGOMERY_EXPT_CUTOFF = 8; MONTGOMERY_EXPT_CUTOFF <= 144; MONTGOMERY_EXPT_CUTOFF++) {
-    ti = time_expt ();
-    printf ("%4d : %9lu\r", MONTGOMERY_EXPT_CUTOFF, ti);
-    fflush (stdout);
-    fprintf (log, "%d : %lu\r", MONTGOMERY_EXPT_CUTOFF, ti);
-    if (ti < best) {
-      printf ("New best: %lu, %d\n", ti, MONTGOMERY_EXPT_CUTOFF);
-      best = ti;
-      best_exptmod = MONTGOMERY_EXPT_CUTOFF;
-    }
-  }
-  fclose (log);
-
  printf
-    ("\n\n\nKaratsuba Multiplier Cutoff: %d\nKaratsuba Squaring Cutoff: %d\nMontgomery exptmod Cutoff: %d\n",
-     best_mult, best_square, best_exptmod);
+    ("\n\n\nKaratsuba Multiplier Cutoff: %d\nKaratsuba Squaring Cutoff: %d\n",
+     best_mult, best_square);

  return 0;
 }
--- a/gen.pl
+++ b/gen.pl
@ -1,27 +1,18 @@
-#!/usr/bin/perl
+#!/usr/bin/perl -w
 #
-#Generates a "single file" you can use to quickly add the whole source 
-#without any makefile troubles
+# Generates a "single file" you can use to quickly
+# add the whole source without any makefile troubles
 #
+use strict;

-opendir(DIR,".");
-@files = readdir(DIR);
-closedir(DIR);
-
-open(OUT,">mpi.c");
-print OUT "/* File Generated Automatically by gen.pl */\n\n";
-for (@files) {
-   if ($_ =~ /\.c/ && !($_ =~ /mpi\.c/)) {
-      $fname = $_;
-      open(SRC,"<$fname");
-      print OUT "/* Start: $fname */\n";
-      while (<SRC>) {
-         print OUT $_;
-      }
-      close(SRC);
-      print OUT "\n/* End: $fname */\n\n";
-   }
+open( OUT, ">mpi.c" ) or die "Couldn't open mpi.c for writing: $!";
+foreach my $filename (glob "bn_*.c") {
+   open( SRC, "<$filename" ) or die "Couldn't open $filename for reading: $!";
+   print OUT "/* Start: $filename */\n";
+   print OUT qq[#line 0 "$filename"\n];
+   print OUT while <SRC>;
+   print OUT "\n/* End: $filename */\n\n";
+   close SRC or die "Error closing $filename after reading: $!";
 }
-print OUT "\n/* EOF */\n";
-close(OUT);
-   
+print OUT "\n/* EOF */\n";   # "\n", not "\b": "\b" is a backspace and would put a 0x08 byte into mpi.c
+close OUT or die "Error closing mpi.c after writing: $!";
--- a/logs/README
+++ b/logs/README
@ -0,0 +1,13 @@
+To use the pretty graphs you first have to build and run ltmtest from the root directory of the package.
+To do this, type
+
+make timing ; ltmtest
+
+in the root.  It will run for a while [about ten minutes on most PCs] and produce a series of .log files in logs/.
+
+After doing that, run "gnuplot graphs.dem" to make the PNGs.  If you have managed to do all of that so far, just open index.html to view
+them all :-)
+
+Have fun
+
+Tom
--- a/logs/add.log
+++ b/logs/add.log
@ -0,0 +1,16 @@
+224  11039864
+448   9206336
+672   8178200
+896   7432176
+1120   6433264
+1344   5847056
+1568   5270184
+1792   4943416
+2016   4520016
+2240   4256168
+2464   3999224
+2688   3714896
+2912   3572720
+3136   3340176
+3360   3222584
+3584   3036336
--- a/logs/addsub.png
+++ b/logs/addsub.png
--- a/logs/expt.log
+++ b/logs/expt.log
@ -0,0 +1,7 @@
+14364       666
+21532       253
+28700       117
+57372        17
+71708         9
+86044         5
+114716         2
--- a/logs/expt.png
+++ b/logs/expt.png
--- a/logs/expt_dr.log
+++ b/logs/expt_dr.log
@ -0,0 +1,7 @@
+14896      1088
+21952       468
+29008       244
+43120        91
+58016        43
+86240        15
+115248         6
--- a/logs/graphs.dem
+++ b/logs/graphs.dem
@ -0,0 +1,17 @@
+set terminal png color
+set size 1.5
+set ylabel "Operations per Second"
+set xlabel "Operand size (bits)"
+
+set output "addsub.png"
+plot 'add.log' smooth bezier title "Addition", 'sub.log' smooth bezier title "Subtraction"
+
+set output "mult.png"
+plot 'sqr.log' smooth bezier title "Squaring (without Karatsuba)", 'sqr_kara.log' smooth bezier title "Squaring (Karatsuba)", 'mult.log' smooth bezier title "Multiplication (without Karatsuba)", 'mult_kara.log' smooth bezier title "Multiplication (Karatsuba)"
+
+set output "expt.png"
+plot 'expt.log' smooth bezier title "Exptmod (Montgomery)", 'expt_dr.log' smooth bezier title "Exptmod (Dimminished Radix)"
+
+set output "invmod.png"
+plot 'invmod.log' smooth bezier title "Modular Inverse"
+
--- a/logs/index.html
+++ b/logs/index.html
@ -0,0 +1,24 @@
+<html>
+<head>
+<title>LibTomMath Log Plots</title>
+</head>
+<body>
+
+<h1>Addition and Subtraction</h1>
+<center><img src=addsub.png></center>
+<hr>
+
+<h1>Multipliers</h1>
+<center><img src=mult.png></center>
+<hr>
+
+<h1>Exptmod</h1>
+<center><img src=expt.png></center>
+<hr>
+
+<h1>Modular Inverse</h1>
+<center><img src=invmod.png></center>
+<hr>
+
+</body>
+</html>
--- a/logs/invmod.log
+++ b/logs/invmod.log
@ -0,0 +1,32 @@
+112     15608
+224      7840
+336      5104
+448      3376
+560      2616
+672      1984
+784      1640
+896      2056
+1008      1136
+1120       936
+1232      1240
+1344      1112
+1456       608
+1568       873
+1680       492
+1792       444
+1904       640
+2016       584
+2128       328
+2240       307
+2352       283
+2464       256
+2576       393
+2688       365
+2800       344
+2912       196
+3024       301
+3136       170
+3248       160
+3360       250
+3472       144
+3584       224
--- a/logs/invmod.png
+++ b/logs/invmod.png
--- a/logs/mult.log
+++ b/logs/mult.log
@ -0,0 +1,17 @@
+896    321504
+1344    150784
+1792     90288
+2240     59760
+2688     42480
+3136     32056
+3584     24600
+4032     19656
+4480     16024
+4928     13328
+5376     11280
+5824      9624
+6272      8336
+6720      7280
+7168      1648
+7616      1464
+8064      1296
--- a/logs/mult.png
+++ b/logs/mult.png
--- a/logs/mult_kara.log
+++ b/logs/mult_kara.log
@ -0,0 +1,17 @@
+896    321928
+1344    150752
+1792     90136
+2240     59888
+2688     42480
+3136     32080
+3584     25744
+4032     21216
+4480     17912
+4928     14896
+5376     12936
+5824     11216
+6272      9848
+6720      8896
+7168      7968
+7616      7248
+8064      6600
--- a/logs/sqr.log
+++ b/logs/sqr.log
@ -0,0 +1,17 @@
+896    416968
+1344    223672
+1792    141552
+2240     97280
+2688     71304
+3136     54648
+3584     16264
+4032     13000
+4480     10528
+4928      8776
+5376      7464
+5824      6440
+6272      5520
+6720      4808
+7168      4264
+7616      3784
+8064      3368
--- a/logs/sqr_kara.log
+++ b/logs/sqr_kara.log
@ -0,0 +1,17 @@
+896    416656
+1344    223728
+1792    141288
+2240     97456
+2688     71152
+3136     54392
+3584     38552
+4032     32216
+4480     27384
+4928     23792
+5376     20728
+5824     18232
+6272     16160
+6720     14408
+7168     11696
+7616     10768
+8064      9920
--- a/logs/sub.log
+++ b/logs/sub.log
@ -0,0 +1,16 @@
+224   9862520
+448   8562344
+672   7661400
+896   6838128
+1120   5911144
+1344   5394040
+1568   4993760
+1792   4624240
+2016   4332024
+2240   4029312
+2464   3790784
+2688   3587216
+2912   3397952
+3136   3239736
+3360   3080616
+3584   2933104
--- a/44
+++ b/44
@ -1,6 +1,6 @@
 CFLAGS  +=  -I./ -Wall -W -Wshadow -O3 -fomit-frame-pointer -funroll-loops

-VERSION=0.16
+VERSION=0.17

 default: libtommath.a

@ -32,7 +32,8 @@ bn_mp_count_bits.o bn_mp_read_unsigned_bin.o bn_mp_read_signed_bin.o bn_mp_to_un
 bn_mp_to_signed_bin.o bn_mp_unsigned_bin_size.o bn_mp_signed_bin_size.o bn_radix.o \
 bn_mp_xor.o bn_mp_and.o bn_mp_or.o bn_mp_rand.o bn_mp_montgomery_calc_normalization.o \
 bn_mp_prime_is_divisible.o bn_prime_tab.o bn_mp_prime_fermat.o bn_mp_prime_miller_rabin.o \
-bn_mp_prime_is_prime.o bn_mp_prime_next_prime.o bn_mp_dr_reduce.o 
+bn_mp_prime_is_prime.o bn_mp_prime_next_prime.o bn_mp_dr_reduce.o bn_mp_multi.o \
+bn_mp_dr_is_modulus.o bn_mp_dr_setup.o

 libtommath.a:  $(OBJECTS)
 	$(AR) $(ARFLAGS) libtommath.a $(OBJECTS)
@ -52,21 +53,46 @@ test: libtommath.a demo/demo.o
        
 timing: libtommath.a
 	$(CC) $(CFLAGS) -DTIMER demo/demo.c libtommath.a -o ltmtest -s
-	$(CC) $(CFLAGS) -DTIMER -DU_MPI -I./mtest/ demo/demo.c mtest/mpi.c -o mpitest -s

-docdvi: bn.tex
-	latex bn
+# makes the LTM book DVI file, requires tetex, perl and makeindex [part of tetex I think]
+docdvi: tommath.src
+	cd pics ; make 
+	echo "hello" > tommath.ind
+	perl booker.pl
+	latex tommath > /dev/null
+	makeindex tommath
+	latex tommath > /dev/null
+		
+# makes the LTM book PS/PDF file, requires tetex, cleans up the LaTeX temp files
+docs:	
+	cd pics ; make pdfes
+	echo "hello" > tommath.ind
+	perl booker.pl
+	latex tommath > /dev/null
+	makeindex tommath
+	latex tommath > /dev/null
+	dvips -tB5 -D600 tommath
+	echo "hello" > tommath.ind
+	perl booker.pl PDF
+	latex tommath > /dev/null
+	makeindex tommath
+	latex tommath > /dev/null
+	pdflatex tommath
+	rm -f tommath.log tommath.aux tommath.dvi tommath.idx tommath.toc tommath.lof tommath.ind tommath.ilg
 	
-docs:	docdvi
+#the old manual being phased out
+manual:	
+	latex bn
 	pdflatex bn
-	rm -f bn.log bn.aux bn.dvi
+	rm -f bn.aux bn.dvi bn.log 	
 	
 clean:
 	rm -f *.pdf *.o *.a *.obj *.lib *.exe etclib/*.o demo/demo.o test ltmtest mpitest mtest/mtest mtest/mtest.exe \
-        bn.log bn.aux bn.dvi *.log *.s mpi.c 
+        tommath.idx tommath.toc tommath.log tommath.aux tommath.dvi tommath.lof tommath.ind tommath.ilg *.ps *.pdf *.log *.s mpi.c 
 	cd etc ; make clean
+	cd pics ; make clean

-zipup: clean docs
+zipup: clean manual
 	perl gen.pl ; mv mpi.c pre_gen/ ; \
 	cd .. ; rm -rf ltm* libtommath-$(VERSION) ; mkdir libtommath-$(VERSION) ; \
 	cp -R ./libtommath/* ./libtommath-$(VERSION)/ ; tar -c libtommath-$(VERSION)/* > ltm-$(VERSION).tar ; \
--- a/makefile.msvc
+++ b/makefile.msvc
@ -22,7 +22,8 @@ bn_mp_count_bits.obj bn_mp_read_unsigned_bin.obj bn_mp_read_signed_bin.obj bn_mp
 bn_mp_to_signed_bin.obj bn_mp_unsigned_bin_size.obj bn_mp_signed_bin_size.obj bn_radix.obj \
 bn_mp_xor.obj bn_mp_and.obj bn_mp_or.obj bn_mp_rand.obj bn_mp_montgomery_calc_normalization.obj \
 bn_mp_prime_is_divisible.obj bn_prime_tab.obj bn_mp_prime_fermat.obj bn_mp_prime_miller_rabin.obj \
-bn_mp_prime_is_prime.obj bn_mp_prime_next_prime.obj bn_mp_dr_reduce.obj
+bn_mp_prime_is_prime.obj bn_mp_prime_next_prime.obj bn_mp_dr_reduce.obj bn_mp_multi.obj \
+bn_mp_dr_is_modulus.obj bn_mp_dr_setup.obj


 library: $(OBJECTS)
--- a/mtest/mtest.c
+++ b/mtest/mtest.c
@ -10,7 +10,7 @@ result1
 result2
 [... resultN]

-So for example "a * b mod n" would be 
+So for example "a * b mod n" would be

 mulmod
 a
@ -18,7 +18,7 @@ b
 n
 a*b mod n

-e.g. if a=3, b=4 n=11 then 
+e.g. if a=3, b=4 n=11 then

 mulmod
 3
@ -38,10 +38,10 @@ FILE *rng;
 void rand_num(mp_int *a)
 {
   int n, size;
-   unsigned char buf[512];
+   unsigned char buf[2048];

 top:
-   size = 1 + ((fgetc(rng)*fgetc(rng)) % 96);
+   size = 1 + ((fgetc(rng)*fgetc(rng)) % 1024);
   buf[0] = (fgetc(rng)&1)?1:0;
   fread(buf+1, 1, size, rng);
   for (n = 0; n < size; n++) {
@ -54,7 +54,7 @@ top:
 void rand_num2(mp_int *a)
 {
   int n, size;
-   unsigned char buf[512];
+   unsigned char buf[2048];

 top:
   size = 1 + ((fgetc(rng)*fgetc(rng)) % 96);
@ -67,18 +67,38 @@ top:
   mp_read_raw(a, buf, 1+size);
 }

+#define mp_to64(a, b) mp_toradix(a, b, 64)
+
 int main(void)
 {
   int n;
   mp_int a, b, c, d, e;
   char buf[4096];
-   
+
   mp_init(&a);
   mp_init(&b);
   mp_init(&c);
   mp_init(&d);
   mp_init(&e);

+
+   /* initial (2^n - 1)^2 testing, makes sure the comba multiplier works [it has the new carry code] */
+/*
+   mp_set(&a, 1);
+   for (n = 1; n < 8192; n++) {
+       mp_mul(&a, &a, &c);
+       printf("mul\n");
+       mp_to64(&a, buf);
+       printf("%s\n%s\n", buf, buf);
+       mp_to64(&c, buf);
+       printf("%s\n", buf);
+
+       mp_add_d(&a, 1, &a);
+       mp_mul_2(&a, &a);
+       mp_sub_d(&a, 1, &a);
+   }
+*/
+
   rng = fopen("/dev/urandom", "rb");
   if (rng == NULL) {
      rng = fopen("/dev/random", "rb");
@ -97,11 +117,11 @@ int main(void)
       rand_num(&b);
       mp_add(&a, &b, &c);
       printf("add\n");
-       mp_todecimal(&a, buf);
+       mp_to64(&a, buf);
       printf("%s\n", buf);
-       mp_todecimal(&b, buf);
+       mp_to64(&b, buf);
       printf("%s\n", buf);
-       mp_todecimal(&c, buf);
+       mp_to64(&c, buf);
       printf("%s\n", buf);
   } else if (n == 1) {
      /* sub tests */
@ -109,11 +129,11 @@ int main(void)
       rand_num(&b);
       mp_sub(&a, &b, &c);
       printf("sub\n");
-       mp_todecimal(&a, buf);
+       mp_to64(&a, buf);
       printf("%s\n", buf);
-       mp_todecimal(&b, buf);
+       mp_to64(&b, buf);
       printf("%s\n", buf);
-       mp_todecimal(&c, buf);
+       mp_to64(&c, buf);
       printf("%s\n", buf);
   } else if (n == 2) {
       /* mul tests */
@ -121,11 +141,11 @@ int main(void)
       rand_num(&b);
       mp_mul(&a, &b, &c);
       printf("mul\n");
-       mp_todecimal(&a, buf);
+       mp_to64(&a, buf);
       printf("%s\n", buf);
-       mp_todecimal(&b, buf);
+       mp_to64(&b, buf);
       printf("%s\n", buf);
-       mp_todecimal(&c, buf);
+       mp_to64(&c, buf);
       printf("%s\n", buf);
   } else if (n == 3) {
      /* div tests */
@ -133,22 +153,22 @@ int main(void)
       rand_num(&b);
       mp_div(&a, &b, &c, &d);
       printf("div\n");
-       mp_todecimal(&a, buf);
+       mp_to64(&a, buf);
       printf("%s\n", buf);
-       mp_todecimal(&b, buf);
+       mp_to64(&b, buf);
       printf("%s\n", buf);
-       mp_todecimal(&c, buf);
+       mp_to64(&c, buf);
       printf("%s\n", buf);
-       mp_todecimal(&d, buf);
+       mp_to64(&d, buf);
       printf("%s\n", buf);
   } else if (n == 4) {
      /* sqr tests */
       rand_num(&a);
       mp_sqr(&a, &b);
       printf("sqr\n");
-       mp_todecimal(&a, buf);
+       mp_to64(&a, buf);
       printf("%s\n", buf);
-       mp_todecimal(&b, buf);
+       mp_to64(&b, buf);
       printf("%s\n", buf);
   } else if (n == 5) {
      /* mul_2d test */
@ -156,11 +176,11 @@ int main(void)
      mp_copy(&a, &b);
      n = fgetc(rng) & 63;
      mp_mul_2d(&b, n, &b);
-      mp_todecimal(&a, buf);
+      mp_to64(&a, buf);
      printf("mul2d\n");
      printf("%s\n", buf);
      printf("%d\n", n);
-      mp_todecimal(&b, buf);
+      mp_to64(&b, buf);
      printf("%s\n", buf);
   } else if (n == 6) {
      /* div_2d test */
@ -168,11 +188,11 @@ int main(void)
      mp_copy(&a, &b);
      n = fgetc(rng) & 63;
      mp_div_2d(&b, n, &b, NULL);
-      mp_todecimal(&a, buf);
+      mp_to64(&a, buf);
      printf("div2d\n");
      printf("%s\n", buf);
      printf("%d\n", n);
-      mp_todecimal(&b, buf);
+      mp_to64(&b, buf);
      printf("%s\n", buf);
   } else if (n == 7) {
      /* gcd test */
@ -182,12 +202,12 @@ int main(void)
      b.sign = MP_ZPOS;
      mp_gcd(&a, &b, &c);
      printf("gcd\n");
-      mp_todecimal(&a, buf);
-      printf("%s\n", buf);      
-      mp_todecimal(&b, buf);
-      printf("%s\n", buf);      
-      mp_todecimal(&c, buf);
-      printf("%s\n", buf);      
+      mp_to64(&a, buf);
+      printf("%s\n", buf);
+      mp_to64(&b, buf);
+      printf("%s\n", buf);
+      mp_to64(&c, buf);
+      printf("%s\n", buf);
   } else if (n == 8) {
      /* lcm test */
      rand_num(&a);
@ -196,12 +216,12 @@ int main(void)
      b.sign = MP_ZPOS;
      mp_lcm(&a, &b, &c);
      printf("lcm\n");
-      mp_todecimal(&a, buf);
-      printf("%s\n", buf);      
-      mp_todecimal(&b, buf);
-      printf("%s\n", buf);      
-      mp_todecimal(&c, buf);
-      printf("%s\n", buf);      
+      mp_to64(&a, buf);
+      printf("%s\n", buf);
+      mp_to64(&b, buf);
+      printf("%s\n", buf);
+      mp_to64(&c, buf);
+      printf("%s\n", buf);
   } else if (n == 9) {
      /* exptmod test */
      rand_num2(&a);
@ -210,14 +230,14 @@ int main(void)
      a.sign = b.sign = c.sign = 0;
      mp_exptmod(&a, &b, &c, &d);
      printf("expt\n");
-      mp_todecimal(&a, buf);
-      printf("%s\n", buf);      
-      mp_todecimal(&b, buf);
-      printf("%s\n", buf);      
-      mp_todecimal(&c, buf);
-      printf("%s\n", buf);      
-      mp_todecimal(&d, buf);
-      printf("%s\n", buf);      
+      mp_to64(&a, buf);
+      printf("%s\n", buf);
+      mp_to64(&b, buf);
+      printf("%s\n", buf);
+      mp_to64(&c, buf);
+      printf("%s\n", buf);
+      mp_to64(&d, buf);
+      printf("%s\n", buf);
   } else if (n == 10) {
      /* invmod test */
      rand_num2(&a);
@ -229,28 +249,28 @@ int main(void)
      if (mp_cmp_d(&b, 1) == 0) continue;
      mp_invmod(&a, &b, &c);
      printf("invmod\n");
-      mp_todecimal(&a, buf);
-      printf("%s\n", buf);      
-      mp_todecimal(&b, buf);
-      printf("%s\n", buf);      
-      mp_todecimal(&c, buf);
-      printf("%s\n", buf);      
+      mp_to64(&a, buf);
+      printf("%s\n", buf);
+      mp_to64(&b, buf);
+      printf("%s\n", buf);
+      mp_to64(&c, buf);
+      printf("%s\n", buf);
   } else if (n == 11) {
      rand_num(&a);
      mp_mul_2(&a, &a);
      mp_div_2(&a, &b);
      printf("div2\n");
-      mp_todecimal(&a, buf);
-      printf("%s\n", buf);      
-      mp_todecimal(&b, buf);
+      mp_to64(&a, buf);
+      printf("%s\n", buf);
+      mp_to64(&b, buf);
      printf("%s\n", buf);
   } else if (n == 12) {
      rand_num2(&a);
      mp_mul_2(&a, &b);
      printf("mul2\n");
-      mp_todecimal(&a, buf);
-      printf("%s\n", buf);      
-      mp_todecimal(&b, buf);
+      mp_to64(&a, buf);
+      printf("%s\n", buf);
+      mp_to64(&b, buf);
      printf("%s\n", buf);
   }
   }
--- a/pics/makefile
+++ b/pics/makefile
@ -0,0 +1,17 @@
+# makes the images: PostScript via the default target, PDF via "pdfes"
+
+default:  pses
+
+# the image is added as sliding_window.TIF, so the prerequisite must use the same case
+sliding_window.ps: sliding_window.TIF
+	tiff2ps -c -e sliding_window.TIF > sliding_window.ps
+
+sliding_window.pdf: sliding_window.ps
+	epstopdf sliding_window.ps
+
+pses: sliding_window.ps
+pdfes: sliding_window.pdf
+
+clean:
+	rm -rf *.ps *.pdf .xvpics
+
--- a/pics/sliding_window.TIF
+++ b/pics/sliding_window.TIF
--- a/pics/sliding_window.sxd
+++ b/pics/sliding_window.sxd
--- a/pre_gen/mpi.c
+++ b/pre_gen/mpi.c
--- a/tommath.h
+++ b/tommath.h
@ -1,11 +1,11 @@
 /* LibTomMath, multiple-precision integer library -- Tom St Denis
 *
- * LibTomMath is library that provides for multiple-precision 
+ * LibTomMath is a library that provides multiple-precision
 * integer arithmetic as well as number theoretic functionality.
- * 
+ *
 * The library is designed directly after the MPI library by
- * Michael Fromberger but has been written from scratch with 
- * additional optimizations in place.  
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
 *
 * The library is free for all purposes without any express
 * guarantee it works.
@ -34,18 +34,18 @@ extern "C" {

 #else

-/* C on the other hand dosen't care */
-#define  OPT_CAST  
+/* C on the other hand doesn't care */
+#define  OPT_CAST

 #endif

-/* some default configurations.  
+/* some default configurations.
 *
- * A "mp_digit" must be able to hold DIGIT_BIT + 1 bits 
- * A "mp_word" must be able to hold 2*DIGIT_BIT + 1 bits 
+ * A "mp_digit" must be able to hold DIGIT_BIT + 1 bits
+ * A "mp_word" must be able to hold 2*DIGIT_BIT + 1 bits
 *
- * At the very least a mp_digit must be able to hold 7 bits 
- * [any size beyond that is ok provided it overflow the data type]
+ * At the very least a mp_digit must be able to hold 7 bits
+ * [any size beyond that is ok provided it doesn't overflow the data type]
 */
 #ifdef MP_8BIT
   typedef unsigned char      mp_digit;
@ -53,7 +53,21 @@ extern "C" {
 #elif defined(MP_16BIT)
   typedef unsigned short     mp_digit;
   typedef unsigned long      mp_word;
+#elif defined(MP_64BIT)
+   /* for GCC only on supported platforms */
+#ifndef CRYPT
+   typedef unsigned long long ulong64;
+   typedef signed long long   long64;
+#endif
+
+   typedef ulong64            mp_digit;
+   typedef unsigned long      mp_word __attribute__ ((mode(TI)));
+
+   #define DIGIT_BIT          60
 #else
+   /* this is the default case, 28-bit digits */
+   
+   /* this is to make porting into LibTomCrypt easier :-) */
 #ifndef CRYPT
   #ifdef _MSC_VER
      typedef unsigned __int64   ulong64;
@ -61,23 +75,24 @@ extern "C" {
   #else
      typedef unsigned long long ulong64;
      typedef signed long long   long64;
-   #endif   
-#endif   
+   #endif
+#endif

-   /* default case */
   typedef unsigned long      mp_digit;
   typedef ulong64            mp_word;
-  
-   #define DIGIT_BIT          28
-#endif  

+   #define DIGIT_BIT          28
+#endif
+
+/* otherwise the bits per digit is calculated automatically from the size of a mp_digit */
 #ifndef DIGIT_BIT
   #define DIGIT_BIT     ((CHAR_BIT * sizeof(mp_digit) - 1))  /* bits per digit */
 #endif

+
 #define MP_DIGIT_BIT     DIGIT_BIT
 #define MP_MASK          ((((mp_digit)1)<<((mp_digit)DIGIT_BIT))-((mp_digit)1))
-#define MP_DIGIT_MAX     MP_MASK   
+#define MP_DIGIT_MAX     MP_MASK

 /* equalities */
 #define MP_LT        -1   /* less than */
@ -99,7 +114,14 @@ extern int KARATSUBA_MUL_CUTOFF,
           KARATSUBA_SQR_CUTOFF,
           MONTGOMERY_EXPT_CUTOFF;

-#define MP_PREC                 64      /* default digits of precision */
+/* various build options */
+#define MP_PREC                 64      /* default digits of precision (must be power of two) */
+
+/* define this to use lower memory usage routines (exptmods mostly) */
+/* #define MP_LOW_MEM */
+
+/* size of comba arrays, should be at least 2 * 2**(BITS_PER_WORD - BITS_PER_DIGIT*2) */
+#define MP_WARRAY               (1 << (sizeof(mp_word) * CHAR_BIT - 2 * DIGIT_BIT + 1))

 typedef struct  {
    int used, alloc, sign;
@ -118,6 +140,12 @@ int mp_init(mp_int *a);
 /* free a bignum */
 void mp_clear(mp_int *a);

+/* init a null terminated series of arguments */
+int mp_init_multi(mp_int *mp, ...);
+
+/* clear a null terminated series of arguments */
+void mp_clear_multi(mp_int *mp, ...);
+
 /* exchange two ints */
 void mp_exch(mp_int *a, mp_int *b);

@ -143,7 +171,7 @@ void mp_zero(mp_int *a);
 void mp_set(mp_int *a, mp_digit b);

 /* set a 32-bit const */
-int mp_set_int(mp_int *a, unsigned long b);
+int mp_set_int(mp_int *a, unsigned int b);

 /* copy, b = a */
 int mp_copy(mp_int *a, mp_int *b);
@ -162,22 +190,22 @@ void mp_rshd(mp_int *a, int b);
 /* left shift by "b" digits */
 int mp_lshd(mp_int *a, int b);

-/* c = a / 2^b */
+/* c = a / 2**b */
 int mp_div_2d(mp_int *a, int b, mp_int *c, mp_int *d);

 /* b = a/2 */
 int mp_div_2(mp_int *a, mp_int *b);

-/* c = a * 2^b */
+/* c = a * 2**b */
 int mp_mul_2d(mp_int *a, int b, mp_int *c);

 /* b = a*2 */
 int mp_mul_2(mp_int *a, mp_int *b);

-/* c = a mod 2^d */
+/* c = a mod 2**d */
 int mp_mod_2d(mp_int *a, int b, mp_int *c);

-/* computes a = 2^b */
+/* computes a = 2**b */
 int mp_2expt(mp_int *a, int b);

 /* makes a pseudo-random int of a given size */
@ -216,7 +244,7 @@ int mp_sub(mp_int *a, mp_int *b, mp_int *c);
 /* c = a * b */
 int mp_mul(mp_int *a, mp_int *b, mp_int *c);

-/* b = a^2 */
+/* b = a*a  */
 int mp_sqr(mp_int *a, mp_int *b);

 /* a/b => cb + d == a */
@ -242,7 +270,7 @@ int mp_mul_d(mp_int *a, mp_digit b, mp_int *c);
 /* a/b => cb + d == a */
 int mp_div_d(mp_int *a, mp_digit b, mp_int *c, mp_digit *d);

-/* c = a^b */
+/* c = a**b */
 int mp_expt_d(mp_int *a, mp_digit b, mp_int *c);

 /* c = a mod b, 0 <= c < b  */
@ -271,7 +299,7 @@ int mp_gcd(mp_int *a, mp_int *b, mp_int *c);
 /* c = [a, b] or (a*b)/(a, b) */
 int mp_lcm(mp_int *a, mp_int *b, mp_int *c);

-/* finds one of the b'th root of a, such that |c|^b <= |a| 
+/* finds one of the b'th roots of a, such that |c|**b <= |a|
 *
 * returns error if a < 0 and b is even
 */
@ -288,7 +316,7 @@ int mp_reduce_setup(mp_int *a, mp_int *b);

 /* Barrett Reduction, computes a (mod b) with a precomputed value c
 *
- * Assumes that 0 < a <= b^2, note if 0 > a > -(b^2) then you can merely
+ * Assumes that 0 < a <= b*b, note if 0 > a > -(b*b) then you can merely
 * compute the reduction as -1 * mp_reduce(mp_abs(a)) [pseudo code].
 */
 int mp_reduce(mp_int *a, mp_int *b, mp_int *c);
@ -296,12 +324,12 @@ int mp_reduce(mp_int *a, mp_int *b, mp_int *c);
 /* setups the montgomery reduction */
 int mp_montgomery_setup(mp_int *a, mp_digit *mp);

-/* computes a = B^n mod b without division or multiplication useful for 
+/* computes a = B**n mod b without division or multiplication useful for
 * normalizing numbers in a Montgomery system.
 */
 int mp_montgomery_calc_normalization(mp_int *a, mp_int *b);

-/* computes xR^-1 == x (mod N) via Montgomery Reduction */
+/* computes x/R == x (mod N) via Montgomery Reduction */
 int mp_montgomery_reduce(mp_int *a, mp_int *m, mp_digit mp);

 /* returns 1 if a is a valid DR modulus */
@ -313,32 +341,38 @@ void mp_dr_setup(mp_int *a, mp_digit *d);
 /* reduces a modulo b using the Diminished Radix method */
 int mp_dr_reduce(mp_int *a, mp_int *b, mp_digit mp);

-/* d = a^b (mod c) */
+/* d = a**b (mod c) */
 int mp_exptmod(mp_int *a, mp_int *b, mp_int *c, mp_int *d);

 /* ---> Primes <--- */
-#define PRIME_SIZE	256	/* number of primes */

-/* table of first 256 primes */
+/* number of primes */
+#ifdef MP_8BIT
+   #define PRIME_SIZE      31
+#else
+   #define PRIME_SIZE      256
+#endif
+
+/* table of first PRIME_SIZE primes */
 extern const mp_digit __prime_tab[];

-/* result=1 if a is divisible by one of the first 256 primes */
+/* result=1 if a is divisible by one of the first PRIME_SIZE primes */
 int mp_prime_is_divisible(mp_int *a, int *result);

-/* performs one Fermat test of "a" using base "b".  
- * Sets result to 0 if composite or 1 if probable prime 
+/* performs one Fermat test of "a" using base "b".
+ * Sets result to 0 if composite or 1 if probable prime
 */
 int mp_prime_fermat(mp_int *a, mp_int *b, int *result);

 /* performs one Miller-Rabin test of "a" using base "b".
- * Sets result to 0 if composite or 1 if probable prime 
+ * Sets result to 0 if composite or 1 if probable prime
 */
 int mp_prime_miller_rabin(mp_int *a, mp_int *b, int *result);

 /* performs t rounds of Miller-Rabin on "a" using the first
 * t prime bases.  Also performs an initial sieve of trial
 * division.  Determines if "a" is prime with probability
- * of error no more than (1/4)^t.
+ * of error no more than (1/4)**t.
 *
 * Sets result to 1 if probably prime, 0 otherwise
 */
@ -365,6 +399,9 @@ int mp_read_radix(mp_int *a, char *str, int radix);
 int mp_toradix(mp_int *a, char *str, int radix);
 int mp_radix_size(mp_int *a, int radix);

+int mp_fread(mp_int *a, int radix, FILE *stream);
+int mp_fwrite(mp_int *a, int radix, FILE *stream);
+
 #define mp_read_raw(mp, str, len) mp_read_signed_bin((mp), (str), (len))
 #define mp_raw_size(mp)           mp_signed_bin_size(mp)
 #define mp_toraw(mp, str)         mp_to_signed_bin((mp), (str))
--- a/tommath.src
+++ b/tommath.src
--- a/tommath.tex
+++ b/tommath.tex