added libtommath-0.18

2003-05-29 13:35:26 +00:00 · 2003-05-29 13:35:26 +00:00 · 0ef44cea9b
commit 0ef44cea9b
parent fd181cc841
108 changed files with 9777 additions and 2427 deletions
--- a/bn.pdf
+++ b/bn.pdf
--- a/bn.tex
+++ b/bn.tex
@ -1,7 +1,7 @@
 \documentclass[]{article}
 \begin{document}
-\title{LibTomMath v0.17 \\ A Free Multiple Precision Integer Library \\ http://math.libtomcrypt.org }
+\title{LibTomMath v0.18 \\ A Free Multiple Precision Integer Library \\ http://math.libtomcrypt.org }
 \author{Tom St Denis \\ tomstdenis@iahu.ca}
 \maketitle
 \newpage
--- a/bn_fast_mp_montgomery_reduce.c
+++ b/bn_fast_mp_montgomery_reduce.c
@ -14,7 +14,7 @@
 */
 #include <tommath.h>
-/* computes xR^-1 == x (mod N) via Montgomery Reduction 
+/* computes xR**-1 == x (mod N) via Montgomery Reduction 
 * 
 * This is an optimized implementation of mp_montgomery_reduce 
 * which uses the comba method to quickly calculate the columns of the
@ -23,76 +23,77 @@
 * Based on Algorithm 14.32 on pp.601 of HAC.
 */
 int
-fast_mp_montgomery_reduce (mp_int * a, mp_int * m, mp_digit mp)
+fast_mp_montgomery_reduce (mp_int * x, mp_int * n, mp_digit rho)
 {
  int     ix, res, olduse;
  mp_word W[MP_WARRAY];
  /* get old used count */
-  olduse = a->used;
+  olduse = x->used;
  /* grow a as required */
-  if (a->alloc < m->used + 1) {
+  if (x->alloc < n->used + 1) {
-    if ((res = mp_grow (a, m->used + 1)) != MP_OKAY) {
+    if ((res = mp_grow (x, n->used + 1)) != MP_OKAY) {
      return res;
    }
  }
  {
    register mp_word *_W;
-    register mp_digit *tmpa;
+    register mp_digit *tmpx;
    _W = W;
-    tmpa = a->dp;
+    tmpx = x->dp;
    /* copy the digits of a into W[0..a->used-1] */
-    for (ix = 0; ix < a->used; ix++) {
+    for (ix = 0; ix < x->used; ix++) {
-      *_W++ = *tmpa++;
+      *_W++ = *tmpx++;
    }
    /* zero the high words of W[a->used..m->used*2] */
-    for (; ix < m->used * 2 + 1; ix++) {
+    for (; ix < n->used * 2 + 1; ix++) {
      *_W++ = 0;
    }
  }
-  for (ix = 0; ix < m->used; ix++) {
+  for (ix = 0; ix < n->used; ix++) {
-    /* ui = ai * m' mod b
+    /* mu = ai * m' mod b
     *
     * We avoid a double precision multiplication (which isn't required)
-     * by casting the value down to a mp_digit.  Note this requires that W[ix-1] have
+     * by casting the value down to a mp_digit.  Note this requires 
-     * the carry cleared (see after the inner loop)
+     * that W[ix-1] have  the carry cleared (see after the inner loop)
     */
-    register mp_digit ui;
+    register mp_digit mu;
-    ui = (((mp_digit) (W[ix] & MP_MASK)) * mp) & MP_MASK;
+    mu = (((mp_digit) (W[ix] & MP_MASK)) * rho) & MP_MASK;
-    /* a = a + ui * m * b^i
+    /* a = a + mu * m * b**i
     *
     * This is computed in place and on the fly.  The multiplication
-     * by b^i is handled by offseting which columns the results
+     * by b**i is handled by offseting which columns the results
     * are added to.
     *
-     * Note the comba method normally doesn't handle carries in the inner loop
+     * Note the comba method normally doesn't handle carries in the 
-     * In this case we fix the carry from the previous column since the Montgomery
+     * inner loop In this case we fix the carry from the previous 
-     * reduction requires digits of the result (so far) [see above] to work.  This is
+     * column since the Montgomery reduction requires digits of the 
-     * handled by fixing up one carry after the inner loop.  The carry fixups are done
+     * result (so far) [see above] to work.  This is
-     * in order so after these loops the first m->used words of W[] have the carries
+     * handled by fixing up one carry after the inner loop.  The 
-     * fixed
+     * carry fixups are done in order so after these loops the 
     * first m->used words of W[] have the carries fixed
     */
    {
      register int iy;
-      register mp_digit *tmpx;
+      register mp_digit *tmpn;
      register mp_word *_W;
      /* alias for the digits of the modulus */
-      tmpx = m->dp;
+      tmpn = n->dp;
      /* Alias for the columns set by an offset of ix */
      _W = W + ix;
      /* inner loop */
-      for (iy = 0; iy < m->used; iy++) {
+      for (iy = 0; iy < n->used; iy++) {
-    *_W++ += ((mp_word) ui) * ((mp_word) * tmpx++);
+          *_W++ += ((mp_word) mu) * ((mp_word) * tmpn++);
      }
    }
@ -102,44 +103,44 @@ fast_mp_montgomery_reduce (mp_int * a, mp_int * m, mp_digit mp)
  {
-    register mp_digit *tmpa;
+    register mp_digit *tmpx;
    register mp_word *_W, *_W1;
    /* nox fix rest of carries */
    _W1 = W + ix;
    _W = W + ++ix;
-    for (; ix <= m->used * 2 + 1; ix++) {
+    for (; ix <= n->used * 2 + 1; ix++) {
      *_W++ += *_W1++ >> ((mp_word) DIGIT_BIT);
    }
-    /* copy out, A = A/b^n
+    /* copy out, A = A/b**n
     *
-     * The result is A/b^n but instead of converting from an array of mp_word
+     * The result is A/b**n but instead of converting from an 
-     * to mp_digit than calling mp_rshd we just copy them in the right
+     * array of mp_word to mp_digit than calling mp_rshd 
-     * order
+     * we just copy them in the right order
     */
-    tmpa = a->dp;
+    tmpx = x->dp;
-    _W = W + m->used;
+    _W = W + n->used;
-    for (ix = 0; ix < m->used + 1; ix++) {
+    for (ix = 0; ix < n->used + 1; ix++) {
-      *tmpa++ = *_W++ & ((mp_word) MP_MASK);
+      *tmpx++ = *_W++ & ((mp_word) MP_MASK);
    }
    /* zero oldused digits, if the input a was larger than
     * m->used+1 we'll have to clear the digits */
    for (; ix < olduse; ix++) {
-      *tmpa++ = 0;
+      *tmpx++ = 0;
    }
  }
  /* set the max used and clamp */
-  a->used = m->used + 1;
+  x->used = n->used + 1;
-  mp_clamp (a);
+  mp_clamp (x);
  /* if A >= m then A = A - m */
-  if (mp_cmp_mag (a, m) != MP_LT) {
+  if (mp_cmp_mag (x, n) != MP_LT) {
-    return s_mp_sub (a, m, a);
+    return s_mp_sub (x, n, x);
  }
  return MP_OKAY;
 }
--- a/bn_fast_s_mp_sqr.c
+++ b/bn_fast_s_mp_sqr.c
@ -16,15 +16,17 @@
 /* fast squaring
 *
- * This is the comba method where the columns of the product are computed first
+ * This is the comba method where the columns of the product 
- * then the carries are computed.  This has the effect of making a very simple
+ * are computed first then the carries are computed.  This 
- * inner loop that is executed the most
+ * has the effect of making a very simple inner loop that 
 * is executed the most
 *
 * W2 represents the outer products and W the inner.
 *
- * A further optimizations is made because the inner products are of the form
+ * A further optimizations is made because the inner 
- * "A * B * 2".  The *2 part does not need to be computed until the end which is
+ * products are of the form "A * B * 2".  The *2 part does 
- * good because 64-bit shifts are slow!
+ * not need to be computed until the end which is good 
 * because 64-bit shifts are slow!
 *
 * Based on Algorithm 14.16 on pp.597 of HAC.
 *
@ -48,26 +50,15 @@ fast_s_mp_sqr (mp_int * a, mp_int * b)
   * Note that there are two buffers.  Since squaring requires
   * a outter and inner product and the inner product requires
   * computing a product and doubling it (a relatively expensive
-   * op to perform n^2 times if you don't have to) the inner and
+   * op to perform n**2 times if you don't have to) the inner and
   * outer products are computed in different buffers.  This way
   * the inner product can be doubled using n doublings instead of
-   * n^2
+   * n**2
   */
  memset (W, 0, newused * sizeof (mp_word));
  memset (W2, 0, newused * sizeof (mp_word));
-/* note optimization
+  /* This computes the inner product.  To simplify the inner N**2 loop
 * values in W2 are only written in even locations which means
 * we can collapse the array to 256 words [and fixup the memset above]
 * provided we also fix up the summations below.  Ideally
 * the fixup loop should be unrolled twice to handle the even/odd
 * cases, and then a final step to handle odd cases [e.g. newused == odd]
 *
 * This will not only save ~8*256 = 2KB of stack but lower the number of
 * operations required to finally fix up the columns
 */
  /* This computes the inner product.  To simplify the inner N^2 loop
   * the multiplication by two is done afterwards in the N loop.
   */
  for (ix = 0; ix < pa; ix++) {
@ -101,18 +92,19 @@ fast_s_mp_sqr (mp_int * a, mp_int * b)
  }
  /* setup dest */
-  olduse = b->used;
+  olduse  = b->used;
  b->used = newused;
  /* double first value, since the inner products are half of what they should be */
  W[0] += W[0] + W2[0];
  /* now compute digits */
  {
    register mp_digit *tmpb;
-    tmpb = b->dp;
+    /* double first value, since the inner products are 
     * half of what they should be 
     */
    W[0] += W[0] + W2[0];
    tmpb = b->dp;
    for (ix = 1; ix < newused; ix++) {
      /* double/add next digit */
      W[ix] += W[ix] + W2[ix];
@ -120,9 +112,13 @@ fast_s_mp_sqr (mp_int * a, mp_int * b)
      W[ix] = W[ix] + (W[ix - 1] >> ((mp_word) DIGIT_BIT));
      *tmpb++ = (mp_digit) (W[ix - 1] & ((mp_word) MP_MASK));
    }
    /* set the last value.  Note even if the carry is zero 
     * this is required since the next step will not zero 
     * it if b originally had a value at b->dp[2*a.used]
     */
    *tmpb++ = (mp_digit) (W[(newused) - 1] & ((mp_word) MP_MASK));
-    /* clear high */
+    /* clear high digits */
    for (; ix < olduse; ix++) {
      *tmpb++ = 0;
    }
--- a/bn_mp_2expt.c
+++ b/bn_mp_2expt.c
@ -14,7 +14,7 @@
 */
 #include <tommath.h>
-/* computes a = 2^b 
+/* computes a = 2**b 
 *
 * Simple algorithm which zeroes the int, grows it then just sets one bit
 * as required.
--- a/bn_mp_copy.c
+++ b/bn_mp_copy.c
@ -21,7 +21,7 @@ mp_copy (mp_int * a, mp_int * b)
  int     res, n;
  /* if dst == src do nothing */
-  if (a == b || a->dp == b->dp) {
+  if (a == b) {
    return MP_OKAY;
  }
--- a/bn_mp_count_bits.c
+++ b/bn_mp_count_bits.c
@ -21,11 +21,15 @@ mp_count_bits (mp_int * a)
  int     r;
  mp_digit q;
  /* shortcut */
  if (a->used == 0) {
    return 0;
  }
  /* get number of digits and add that */
  r = (a->used - 1) * DIGIT_BIT;
  /* take the last digit and count the bits in it */
  q = a->dp[a->used - 1];
  while (q > ((mp_digit) 0)) {
    ++r;
--- a/bn_mp_div_2d.c
+++ b/bn_mp_div_2d.c
@ -14,7 +14,7 @@
 */
 #include <tommath.h>
-/* shift right by a certain bit count (store quotient in c, remainder in d) */
+/* shift right by a certain bit count (store quotient in c, optional remainder in d) */
 int
 mp_div_2d (mp_int * a, int b, mp_int * c, mp_int * d)
 {
@ -81,7 +81,6 @@ mp_div_2d (mp_int * a, int b, mp_int * c, mp_int * d)
    }
  }
  mp_clamp (c);
  res = MP_OKAY;
  if (d != NULL) {
    mp_exch (&t, d);
  }
--- a/bn_mp_div_3.c
+++ b/bn_mp_div_3.c
@ -0,0 +1,64 @@
 /* LibTomMath, multiple-precision integer library -- Tom St Denis
 *
 * LibTomMath is library that provides for multiple-precision
 * integer arithmetic as well as number theoretic functionality.
 *
 * The library is designed directly after the MPI library by
 * Michael Fromberger but has been written from scratch with
 * additional optimizations in place.
 *
 * The library is free for all purposes without any express
 * guarantee it works.
 *
 * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
 */
 #include <tommath.h>
 /* divide by three (based on routine from MPI and the GMP manual) */
 int
 mp_div_3 (mp_int * a, mp_int *c, mp_digit * d)
 {
  mp_int   q;
  mp_word  w, t;
  mp_digit b;
  int      res, ix;
  /* b = 2**DIGIT_BIT / 3 */
  b = (((mp_word)1) << ((mp_word)DIGIT_BIT)) / ((mp_word)3);
  if ((res = mp_init_size(&q, a->used)) != MP_OKAY) {
     return res;
  }
  q.used = a->used;
  q.sign = a->sign;
  w = 0;
  for (ix = a->used - 1; ix >= 0; ix--) {
     w = (w << ((mp_word)DIGIT_BIT)) | ((mp_word)a->dp[ix]);
     if (w >= 3) {
        t = (w * ((mp_word)b)) >> ((mp_word)DIGIT_BIT);
        w -= (t << ((mp_word)1)) + t;
        while (w >= 3) {
           t += 1;
           w -= 3;
        }
      } else {
        t = 0;
      }
      q.dp[ix] = t;
  }
  if (d != NULL) {
     *d = w;
  }
  if (c != NULL) {
     mp_clamp(&q);
     mp_exch(&q, c);
  }
  mp_clear(&q);
  return res;
 }
--- a/bn_mp_div_d.c
+++ b/bn_mp_div_d.c
@ -14,31 +14,51 @@
 */
 #include <tommath.h>
-/* single digit division */
+/* single digit division (based on routine from MPI) */
 int
 mp_div_d (mp_int * a, mp_digit b, mp_int * c, mp_digit * d)
 {
-  mp_int  t, t2;
+  mp_int  q;
-  int     res;
+  mp_word w, t;
-
+  int     res, ix;
-  if ((res = mp_init (&t)) != MP_OKAY) {
+  
-    return res;
+  if (b == 0) {
     return MP_VAL;
  }
-
+  
-  if ((res = mp_init (&t2)) != MP_OKAY) {
+  if (b == 3) {
-    mp_clear (&t);
+     return mp_div_3(a, c, d);
    return res;
  }
-
+  
-  mp_set (&t, b);
+  if ((res = mp_init_size(&q, a->used)) != MP_OKAY) {
-  res = mp_div (a, &t, c, &t2);
+     return res;
-
+  }
-  /* set remainder if not null */
+  
  q.used = a->used;
  q.sign = a->sign;
  w = 0;
  for (ix = a->used - 1; ix >= 0; ix--) {
     w = (w << ((mp_word)DIGIT_BIT)) | ((mp_word)a->dp[ix]);
     if (w >= b) {
        t = w / b;
        w = w % b;
      } else {
        t = 0;
      }
      q.dp[ix] = t;
  }
  if (d != NULL) {
-    *d = t2.dp[0];
+     *d = w;
  }
-
+  
-  mp_clear (&t);
+  if (c != NULL) {
-  mp_clear (&t2);
+     mp_clamp(&q);
     mp_exch(&q, c);
  }
  mp_clear(&q);
  return res;
 }
--- a/bn_mp_dr_reduce.c
+++ b/bn_mp_dr_reduce.c
@ -14,7 +14,7 @@
 */
 #include <tommath.h>
-/* reduce "a" in place modulo "b" using the Diminished Radix algorithm.
+/* reduce "x" in place modulo "n" using the Diminished Radix algorithm.
 *
 * Based on algorithm from the paper
 *
@ -23,107 +23,64 @@
 *          POSTECH Information Research Laboratories
 *
 * The modulus must be of a special format [see manual]
 *
 * Has been modified to use algorithm 7.10 from the LTM book instead
 */
 int
-mp_dr_reduce (mp_int * a, mp_int * b, mp_digit mp)
+mp_dr_reduce (mp_int * x, mp_int * n, mp_digit k)
 {
-  int     err, i, j, k;
+  int      err, i, m;
-  mp_word r;
+  mp_word  r;
-  mp_digit mu, *tmpj, *tmpi;
+  mp_digit mu, *tmpx1, *tmpx2;
-
+  
-  /* k = digits in modulus */
+  /* m = digits in modulus */
-  k = b->used;
+  m = n->used;
-
+  
-  /* ensure that "a" has at least 2k digits */
+  /* ensure that "x" has at least 2m digits */
-  if (a->alloc < k + k) {
+  if (x->alloc < m + m) {
-    if ((err = mp_grow (a, k + k)) != MP_OKAY) {
+    if ((err = mp_grow (x, m + m)) != MP_OKAY) {
      return err;
    }
  }
-  /* alias for a->dp[i] */
+/* top of loop, this is where the code resumes if 
-  tmpi = a->dp + k + k - 1;
+ * another reduction pass is required.
-
+ */
-  /* for (i = 2k - 1; i >= k; i = i - 1)
+top:
-   *
+  /* aliases for digits */
-   * This is the main loop of the reduction.  Note that at the end
+  /* alias for lower half of x */
-   * the words above position k are not zeroed as expected.  The end
+  tmpx1 = x->dp;
-   * result is that the digits from 0 to k-1 are the residue.  So
+  
-   * we have to clear those afterwards.
+  /* alias for upper half of x, or x/B**m */
-   */
+  tmpx2 = x->dp + m;
-  for (i = k + k - 1; i >= k; i = i - 1) {
+  
-    /* x[i - 1 : i - k] += x[i]*mp */
+  /* set carry to zero */
-
+  mu = 0;
-    /* x[i] * mp */
+  
-    r = ((mp_word) *tmpi--) * ((mp_word) mp);
+  /* compute (x mod B**m) + mp * [x/B**m] inline and inplace */
-
+  for (i = 0; i < m; i++) {
-    /* now add r to x[i-1:i-k]
+      r         = ((mp_word)*tmpx2++) * ((mp_word)k) + *tmpx1 + mu;
-     *
+      *tmpx1++  = r & MP_MASK;
-     * First add it to the first digit x[i-k] then form the carry
+      mu        = r >> ((mp_word)DIGIT_BIT);
     * then enter the main loop
     */
    j = i - k;
    /* alias for a->dp[j] */
    tmpj = a->dp + j;
    /* add digit */
    *tmpj += (mp_digit)(r & MP_MASK);
    /* this is the carry */
    mu = (r >> ((mp_word) DIGIT_BIT)) + (*tmpj >> DIGIT_BIT);
    /* clear carry from a->dp[j]  */
    *tmpj++ &= MP_MASK;
    /* now add rest of the digits
     *
     * Note this is basically a simple single digit addition to
     * a larger multiple digit number.  This is optimized somewhat
     * because the propagation of carries is not likely to move
     * more than a few digits.
     *
     */
    for (++j; mu != 0 && j <= (i - 1); ++j) {
      *tmpj   += mu;
      mu       = *tmpj >> DIGIT_BIT;
      *tmpj++ &= MP_MASK;
    }
    /* if final carry */
    if (mu != 0) {
      /* add mp to this to correct */
      j = i - k;
      tmpj = a->dp + j;
      *tmpj += mp;
      mu = *tmpj >> DIGIT_BIT;
      *tmpj++ &= MP_MASK;
      /* now handle carries */
      for (++j; mu != 0 && j <= (i - 1); j++) {
          *tmpj   += mu;
          mu       = *tmpj >> DIGIT_BIT;
          *tmpj++ &= MP_MASK;
      }
    }
  }
-
+  
-  /* zero words above k */
+  /* set final carry */
-  tmpi = a->dp + k;
+  *tmpx1++ = mu;
-  for (i = k; i < a->used; i++) {
+  
-      *tmpi++ = 0;
+  /* zero words above m */
  for (i = m + 1; i < x->used; i++) {
      *tmpx1++ = 0;
  }
  /* clamp, sub and return */
-  mp_clamp (a);
+  mp_clamp (x);
-  /* if a >= b [b == modulus] then subtract the modulus to fix up */
+  /* if x >= n then subtract and reduce again 
-  if (mp_cmp_mag (a, b) != MP_LT) {
+   * Each successive "recursion" makes the input smaller and smaller.
-    return s_mp_sub (a, b, a);
+   */
  if (mp_cmp_mag (x, n) != MP_LT) {
    s_mp_sub(x, n, x);
    goto top;
  }
  return MP_OKAY;
 }
--- a/bn_mp_dr_setup.c
+++ b/bn_mp_dr_setup.c
@ -20,6 +20,7 @@ void mp_dr_setup(mp_int *a, mp_digit *d)
   /* the casts are required if DIGIT_BIT is one less than
    * the number of bits in a mp_digit [e.g. DIGIT_BIT==31]
    */
-   *d = (mp_digit)((((mp_word)1) << ((mp_word)DIGIT_BIT)) - ((mp_word)a->dp[0]));
+   *d = (mp_digit)((((mp_word)1) << ((mp_word)DIGIT_BIT)) - 
        ((mp_word)a->dp[0]));
 }
--- a/bn_mp_expt_d.c
+++ b/bn_mp_expt_d.c
@ -14,7 +14,7 @@
 */
 #include <tommath.h>
-/* calculate c = a^b  using a square-multiply algorithm */
+/* calculate c = a**b  using a square-multiply algorithm */
 int
 mp_expt_d (mp_int * a, mp_digit b, mp_int * c)
 {
--- a/bn_mp_exptmod.c
+++ b/bn_mp_exptmod.c
@ -14,7 +14,6 @@
 */
 #include <tommath.h>
 static int f_mp_exptmod (mp_int * G, mp_int * X, mp_int * P, mp_int * Y);
 /* this is a shell function that calls either the normal or Montgomery
 * exptmod functions.  Originally the call to the montgomery code was
@ -55,212 +54,22 @@ mp_exptmod (mp_int * G, mp_int * X, mp_int * P, mp_int * Y)
        return err;
     }
-     /* and now compute (1/G)^|X| instead of G^X [X < 0] */
+     /* and now compute (1/G)**|X| instead of G**X [X < 0] */
     err = mp_exptmod(&tmpG, &tmpX, P, Y);
     mp_clear_multi(&tmpG, &tmpX, NULL);
     return err;
  }
  dr = mp_dr_is_modulus(P);
  if (dr == 0) {
     dr = mp_reduce_is_2k(P) << 1;
  }
  /* if the modulus is odd use the fast method */
-  if ((mp_isodd (P) == 1 || dr == 1) && P->used > 4) {
+  if ((mp_isodd (P) == 1 || dr !=  0) && P->used > 4) {
    return mp_exptmod_fast (G, X, P, Y, dr);
  } else {
-    return f_mp_exptmod (G, X, P, Y);
+    return s_mp_exptmod (G, X, P, Y);
  }
 }
 static int
 f_mp_exptmod (mp_int * G, mp_int * X, mp_int * P, mp_int * Y)
 {
  mp_int  M[256], res, mu;
  mp_digit buf;
  int     err, bitbuf, bitcpy, bitcnt, mode, digidx, x, y, winsize;
  /* find window size */
  x = mp_count_bits (X);
  if (x <= 7) {
    winsize = 2;
  } else if (x <= 36) {
    winsize = 3;
  } else if (x <= 140) {
    winsize = 4;
  } else if (x <= 450) {
    winsize = 5;
  } else if (x <= 1303) {
    winsize = 6;
  } else if (x <= 3529) {
    winsize = 7;
  } else {
    winsize = 8;
  }
 #ifdef MP_LOW_MEM
    if (winsize > 5) {
       winsize = 5;
    }
 #endif
  /* init G array */
  for (x = 0; x < (1 << winsize); x++) {
    if ((err = mp_init_size (&M[x], 1)) != MP_OKAY) {
      for (y = 0; y < x; y++) {
        mp_clear (&M[y]);
      }
      return err;
    }
  }
  /* create mu, used for Barrett reduction */
  if ((err = mp_init (&mu)) != MP_OKAY) {
    goto __M;
  }
  if ((err = mp_reduce_setup (&mu, P)) != MP_OKAY) {
    goto __MU;
  }
  /* create M table
   *
   * The M table contains powers of the input base, e.g. M[x] = G^x mod P
   *
   * The first half of the table is not computed though accept for M[0] and M[1]
   */
  if ((err = mp_mod (G, P, &M[1])) != MP_OKAY) {
    goto __MU;
  }
  /* compute the value at M[1<<(winsize-1)] by squaring M[1] (winsize-1) times */
  if ((err = mp_copy (&M[1], &M[1 << (winsize - 1)])) != MP_OKAY) {
    goto __MU;
  }
  for (x = 0; x < (winsize - 1); x++) {
    if ((err = mp_sqr (&M[1 << (winsize - 1)], &M[1 << (winsize - 1)])) != MP_OKAY) {
      goto __MU;
    }
    if ((err = mp_reduce (&M[1 << (winsize - 1)], P, &mu)) != MP_OKAY) {
      goto __MU;
    }
  }
  /* create upper table */
  for (x = (1 << (winsize - 1)) + 1; x < (1 << winsize); x++) {
    if ((err = mp_mul (&M[x - 1], &M[1], &M[x])) != MP_OKAY) {
      goto __MU;
    }
    if ((err = mp_reduce (&M[x], P, &mu)) != MP_OKAY) {
      goto __MU;
    }
  }
  /* setup result */
  if ((err = mp_init (&res)) != MP_OKAY) {
    goto __MU;
  }
  mp_set (&res, 1);
  /* set initial mode and bit cnt */
  mode   = 0;
  bitcnt = 1;
  buf    = 0;
  digidx = X->used - 1;
  bitcpy = bitbuf = 0;
  for (;;) {
    /* grab next digit as required */
    if (--bitcnt == 0) {
      if (digidx == -1) {
        break;
      }
      buf = X->dp[digidx--];
      bitcnt = (int) DIGIT_BIT;
    }
    /* grab the next msb from the exponent */
    y = (buf >> (mp_digit)(DIGIT_BIT - 1)) & 1;
    buf <<= (mp_digit)1;
    /* if the bit is zero and mode == 0 then we ignore it
     * These represent the leading zero bits before the first 1 bit
     * in the exponent.  Technically this opt is not required but it
     * does lower the # of trivial squaring/reductions used
     */
    if (mode == 0 && y == 0)
      continue;
    /* if the bit is zero and mode == 1 then we square */
    if (mode == 1 && y == 0) {
      if ((err = mp_sqr (&res, &res)) != MP_OKAY) {
        goto __RES;
      }
      if ((err = mp_reduce (&res, P, &mu)) != MP_OKAY) {
        goto __RES;
      }
      continue;
    }
    /* else we add it to the window */
    bitbuf |= (y << (winsize - ++bitcpy));
    mode = 2;
    if (bitcpy == winsize) {
      /* ok window is filled so square as required and multiply  */
      /* square first */
      for (x = 0; x < winsize; x++) {
        if ((err = mp_sqr (&res, &res)) != MP_OKAY) {
          goto __RES;
        }
        if ((err = mp_reduce (&res, P, &mu)) != MP_OKAY) {
          goto __RES;
        }
      }
      /* then multiply */
      if ((err = mp_mul (&res, &M[bitbuf], &res)) != MP_OKAY) {
        goto __MU;
      }
      if ((err = mp_reduce (&res, P, &mu)) != MP_OKAY) {
        goto __MU;
      }
      /* empty window and reset */
      bitcpy = bitbuf = 0;
      mode = 1;
    }
  }
  /* if bits remain then square/multiply */
  if (mode == 2 && bitcpy > 0) {
    /* square then multiply if the bit is set */
    for (x = 0; x < bitcpy; x++) {
      if ((err = mp_sqr (&res, &res)) != MP_OKAY) {
        goto __RES;
      }
      if ((err = mp_reduce (&res, P, &mu)) != MP_OKAY) {
        goto __RES;
      }
      bitbuf <<= 1;
      if ((bitbuf & (1 << winsize)) != 0) {
        /* then multiply */
        if ((err = mp_mul (&res, &M[1], &res)) != MP_OKAY) {
          goto __RES;
        }
        if ((err = mp_reduce (&res, P, &mu)) != MP_OKAY) {
          goto __RES;
        }
      }
    }
  }
  mp_exch (&res, Y);
  err = MP_OKAY;
 __RES:mp_clear (&res);
 __MU:mp_clear (&mu);
 __M:
  for (x = 0; x < (1 << winsize); x++) {
    mp_clear (&M[x]);
  }
  return err;
 }
--- a/bn_mp_exptmod_fast.c
+++ b/bn_mp_exptmod_fast.c
@ -27,6 +27,11 @@ mp_exptmod_fast (mp_int * G, mp_int * X, mp_int * P, mp_int * Y, int redmode)
  mp_int  M[256], res;
  mp_digit buf, mp;
  int     err, bitbuf, bitcpy, bitcnt, mode, digidx, x, y, winsize;
  /* use a pointer to the reduction algorithm.  This allows us to use
   * one of many reduction algorithms without modding the guts of
   * the code with if statements everywhere.  
   */
  int     (*redux)(mp_int*,mp_int*,mp_digit);
  /* find window size */
@ -64,6 +69,7 @@ mp_exptmod_fast (mp_int * G, mp_int * X, mp_int * P, mp_int * Y, int redmode)
    }
  }
  /* determine and setup reduction code */
  if (redmode == 0) {
     /* now setup montgomery  */
     if ((err = mp_montgomery_setup (P, &mp)) != MP_OKAY) {
@ -71,17 +77,23 @@ mp_exptmod_fast (mp_int * G, mp_int * X, mp_int * P, mp_int * Y, int redmode)
     }
     /* automatically pick the comba one if available (saves quite a few calls/ifs) */
-     if ( ((P->used * 2 + 1) < MP_WARRAY) &&
+     if (((P->used * 2 + 1) < MP_WARRAY) &&
          P->used < (1 << ((CHAR_BIT * sizeof (mp_word)) - (2 * DIGIT_BIT)))) {
        redux = fast_mp_montgomery_reduce;
     } else {
        /* use slower baselien method */
        redux = mp_montgomery_reduce;
     }
-  } else {
+  } else if (redmode == 1) {
     /* setup DR reduction */
     mp_dr_setup(P, &mp);
     redux = mp_dr_reduce;
  } else {
     /* setup 2k reduction */
     if ((err = mp_reduce_2k_setup(P, &mp)) != MP_OKAY) {
        goto __M;
     }
     redux = mp_reduce_2k;
  }
  /* setup result */
@ -142,7 +154,8 @@ mp_exptmod_fast (mp_int * G, mp_int * X, mp_int * P, mp_int * Y, int redmode)
  bitcnt = 1;
  buf    = 0;
  digidx = X->used - 1;
-  bitcpy = bitbuf = 0;
+  bitcpy = 0;
  bitbuf = 0;
  for (;;) {
    /* grab next digit as required */
@ -203,7 +216,8 @@ mp_exptmod_fast (mp_int * G, mp_int * X, mp_int * P, mp_int * Y, int redmode)
      }
      /* empty window and reset */
-      bitcpy = bitbuf = 0;
+      bitcpy = 0;
      bitbuf = 0;
      mode = 1;
    }
  }
@ -233,7 +247,7 @@ mp_exptmod_fast (mp_int * G, mp_int * X, mp_int * P, mp_int * Y, int redmode)
  }
  if (redmode == 0) {
-     /* fixup result */
+     /* fixup result if Montgomery reduction is used */
     if ((err = mp_montgomery_reduce (&res, P, mp)) != MP_OKAY) {
       goto __RES;
     }
--- a/bn_mp_init.c
+++ b/bn_mp_init.c
@ -24,7 +24,7 @@ mp_init (mp_int * a)
    return MP_MEM;
  }
-  /* set the used to zero, allocated digit to the default precision
+  /* set the used to zero, allocated digits to the default precision
   * and sign to positive */
  a->used  = 0;
  a->alloc = MP_PREC;
--- a/bn_mp_karatsuba_mul.c
+++ b/bn_mp_karatsuba_mul.c
@ -14,24 +14,34 @@
 */
 #include <tommath.h>
-/* c = |a| * |b| using Karatsuba Multiplication using three half size multiplications
+/* c = |a| * |b| using Karatsuba Multiplication using 
 * three half size multiplications
 *
- * Let B represent the radix [e.g. 2**DIGIT_BIT] and let n represent half of the number of digits in the min(a,b)
+ * Let B represent the radix [e.g. 2**DIGIT_BIT] and 
 * let n represent half of the number of digits in 
 * the min(a,b)
 *
- * a = a1 * B^n + a0
+ * a = a1 * B**n + a0
- * b = b1 * B^n + b0
+ * b = b1 * B**n + b0
 *
- * Then, a * b => a1b1 * B^2n + ((a1 - b1)(a0 - b0) + a0b0 + a1b1) * B + a0b0
+ * Then, a * b => 
   a1b1 * B**2n + ((a1 - a0)(b1 - b0) + a0b0 + a1b1) * B + a0b0
 *
- * Note that a1b1 and a0b0 are used twice and only need to be computed once.  So in total
+ * Note that a1b1 and a0b0 are used twice and only need to be 
- * three half size (half # of digit) multiplications are performed, a0b0, a1b1 and (a1-b1)(a0-b0)
+ * computed once.  So in total three half size (half # of 
 * digit) multiplications are performed, a0b0, a1b1 and 
 * (a1-b1)(a0-b0)
 *
- * Note that a multiplication of half the digits requires 1/4th the number of single precision 
+ * Note that a multiplication of half the digits requires
- * multiplications so in total after one call 25% of the single precision multiplications are saved.
+ * 1/4th the number of single precision multiplications so in 
- * Note also that the call to mp_mul can end up back in this function if the a0, a1, b0, or b1 are above
+ * total after one call 25% of the single precision multiplications 
- * the threshold.  This is known as divide-and-conquer and leads to the famous O(N^lg(3)) or O(N^1.584) work which
+ * are saved.  Note also that the call to mp_mul can end up back 
- * is asymptopically lower than the standard O(N^2) that the baseline/comba methods use.  Generally though the 
+ * in this function if the a0, a1, b0, or b1 are above the threshold.  
- * overhead of this method doesn't pay off until a certain size (N ~ 80) is reached.
+ * This is known as divide-and-conquer and leads to the famous 
 * O(N**lg(3)) or O(N**1.584) work which is asymptopically lower than 
 * the standard O(N**2) that the baseline/comba methods use.  
 * Generally though the overhead of this method doesn't pay off 
 * until a certain size (N ~ 80) is reached.
 */
 int
 mp_karatsuba_mul (mp_int * a, mp_int * b, mp_int * c)
@ -101,14 +111,15 @@ mp_karatsuba_mul (mp_int * a, mp_int * b, mp_int * c)
    }
  }
-  /* only need to clamp the lower words since by definition the upper words x1/y1 must
+  /* only need to clamp the lower words since by definition the 
-   * have a known number of digits
+   * upper words x1/y1 must have a known number of digits
   */
  mp_clamp (&x0);
  mp_clamp (&y0);
  /* now calc the products x0y0 and x1y1 */
-  if (mp_mul (&x0, &y0, &x0y0) != MP_OKAY)  /* after this x0 is no longer required, free temp [x0==t2]! */
+  /* after this x0 is no longer required, free temp [x0==t2]! */
  if (mp_mul (&x0, &y0, &x0y0) != MP_OKAY)  
    goto X1Y1;          /* x0y0 = x0*y0 */
  if (mp_mul (&x1, &y1, &x1y1) != MP_OKAY)
    goto X1Y1;          /* x1y1 = x1*y1 */
--- a/bn_mp_karatsuba_sqr.c
+++ b/bn_mp_karatsuba_sqr.c
@ -14,10 +14,12 @@
 */
 #include <tommath.h>
-/* Karatsuba squaring, computes b = a*a using three half size squarings
+/* Karatsuba squaring, computes b = a*a using three 
 * half size squarings
 *
- * See comments of mp_karatsuba_mul for details.  It is essentially the same algorithm
+ * See comments of mp_karatsuba_mul for details.  It 
- * but merely tuned to perform recursive squarings.
+ * is essentially the same algorithm but merely 
 * tuned to perform recursive squarings.
 */
 int
 mp_karatsuba_sqr (mp_int * a, mp_int * b)
@ -74,32 +76,32 @@ mp_karatsuba_sqr (mp_int * a, mp_int * b)
  /* now calc the products x0*x0 and x1*x1 */
  if (mp_sqr (&x0, &x0x0) != MP_OKAY)
-    goto X1X1;                  /* x0x0 = x0*x0 */
+    goto X1X1;           /* x0x0 = x0*x0 */
  if (mp_sqr (&x1, &x1x1) != MP_OKAY)
-    goto X1X1;                  /* x1x1 = x1*x1 */
+    goto X1X1;           /* x1x1 = x1*x1 */
-  /* now calc (x1-x0)^2 */
+  /* now calc (x1-x0)**2 */
  if (mp_sub (&x1, &x0, &t1) != MP_OKAY)
-    goto X1X1;                  /* t1 = x1 - x0 */
+    goto X1X1;           /* t1 = x1 - x0 */
  if (mp_sqr (&t1, &t1) != MP_OKAY)
-    goto X1X1;                  /* t1 = (x1 - x0) * (x1 - x0) */
+    goto X1X1;           /* t1 = (x1 - x0) * (x1 - x0) */
  /* add x0y0 */
  if (s_mp_add (&x0x0, &x1x1, &t2) != MP_OKAY)
-    goto X1X1;                  /* t2 = x0y0 + x1y1 */
+    goto X1X1;           /* t2 = x0x0 + x1x1 */
  if (mp_sub (&t2, &t1, &t1) != MP_OKAY)
-    goto X1X1;                  /* t1 = x0y0 + x1y1 - (x1-x0)*(y1-y0) */
+    goto X1X1;           /* t1 = x0x0 + x1x1 - (x1-x0)*(x1-x0) */
  /* shift by B */
  if (mp_lshd (&t1, B) != MP_OKAY)
-    goto X1X1;                  /* t1 = (x0y0 + x1y1 - (x1-x0)*(y1-y0))<<B */
+    goto X1X1;           /* t1 = (x0x0 + x1x1 - (x1-x0)*(x1-x0))<<B */
  if (mp_lshd (&x1x1, B * 2) != MP_OKAY)
-    goto X1X1;                  /* x1y1 = x1y1 << 2*B */
+    goto X1X1;           /* x1x1 = x1x1 << 2*B */
  if (mp_add (&x0x0, &t1, &t1) != MP_OKAY)
-    goto X1X1;                  /* t1 = x0y0 + t1 */
+    goto X1X1;           /* t1 = x0x0 + t1 */
  if (mp_add (&t1, &x1x1, b) != MP_OKAY)
-    goto X1X1;                  /* t1 = x0y0 + t1 + x1y1 */
+    goto X1X1;           /* t1 = x0x0 + t1 + x1x1 */
  err = MP_OKAY;
--- a/bn_mp_lshd.c
+++ b/bn_mp_lshd.c
@ -33,29 +33,29 @@ mp_lshd (mp_int * a, int b)
  }
  {
-    register mp_digit *tmpa, *tmpaa;
+    register mp_digit *top, *bottom;
-    /* increment the used by the shift amount than copy upwards */
+    /* increment the used by the shift amount then copy upwards */
    a->used += b;
    /* top */
-    tmpa = a->dp + a->used - 1;
+    top = a->dp + a->used - 1;
    /* base */
-    tmpaa = a->dp + a->used - 1 - b;
+    bottom = a->dp + a->used - 1 - b;
    /* much like mp_rshd this is implemented using a sliding window
     * except the window goes the otherway around.  Copying from
     * the bottom to the top.  see bn_mp_rshd.c for more info.
     */
    for (x = a->used - 1; x >= b; x--) {
-      *tmpa-- = *tmpaa--;
+      *top-- = *bottom--;
    }
    /* zero the lower digits */
-    tmpa = a->dp;
+    top = a->dp;
    for (x = 0; x < b; x++) {
-      *tmpa++ = 0;
+      *top++ = 0;
    }
  }
  return MP_OKAY;
--- a/bn_mp_mod_d.c
+++ b/bn_mp_mod_d.c
@ -17,31 +17,5 @@
 int
 mp_mod_d (mp_int * a, mp_digit b, mp_digit * c)
 {
-  mp_int  t, t2;
+  return mp_div_d(a, b, NULL, c);
  int     res;
  if ((res = mp_init (&t)) != MP_OKAY) {
    return res;
  }
  if ((res = mp_init (&t2)) != MP_OKAY) {
    mp_clear (&t);
    return res;
  }
  mp_set (&t, b);
  mp_div (a, &t, NULL, &t2);
  if (t2.sign == MP_NEG) {
    if ((res = mp_add_d (&t2, b, &t2)) != MP_OKAY) {
      mp_clear (&t);
      mp_clear (&t2);
      return res;
    }
  }
  *c = t2.dp[0];
  mp_clear (&t);
  mp_clear (&t2);
  return MP_OKAY;
 }
--- a/bn_mp_montgomery_reduce.c
+++ b/bn_mp_montgomery_reduce.c
@ -14,12 +14,12 @@
 */
 #include <tommath.h>
-/* computes xR^-1 == x (mod N) via Montgomery Reduction */
+/* computes xR**-1 == x (mod N) via Montgomery Reduction */
 int
-mp_montgomery_reduce (mp_int * a, mp_int * m, mp_digit mp)
+mp_montgomery_reduce (mp_int * x, mp_int * n, mp_digit rho)
 {
  int     ix, res, digs;
-  mp_digit ui;
+  mp_digit mu;
  /* can the fast reduction [comba] method be used?
   *
@ -27,55 +27,60 @@ mp_montgomery_reduce (mp_int * a, mp_int * m, mp_digit mp)
   * than the available columns [255 per default] since carries
   * are fixed up in the inner loop.
   */
-  digs = m->used * 2 + 1;
+  digs = n->used * 2 + 1;
-  if ((digs < MP_WARRAY)
+  if ((digs < MP_WARRAY) && 
-      && m->used < (1 << ((CHAR_BIT * sizeof (mp_word)) - (2 * DIGIT_BIT)))) {
+      n->used < 
-    return fast_mp_montgomery_reduce (a, m, mp);
+      (1 << ((CHAR_BIT * sizeof (mp_word)) - (2 * DIGIT_BIT)))) {
    return fast_mp_montgomery_reduce (x, n, rho);
  }
  /* grow the input as required */
-  if (a->alloc < m->used * 2 + 1) {
+  if (x->alloc < digs) {
-    if ((res = mp_grow (a, m->used * 2 + 1)) != MP_OKAY) {
+    if ((res = mp_grow (x, digs)) != MP_OKAY) {
      return res;
    }
  }
-  a->used = m->used * 2 + 1;
+  x->used = digs;
-  for (ix = 0; ix < m->used; ix++) {
+  for (ix = 0; ix < n->used; ix++) {
-    /* ui = ai * m' mod b */
+    /* mu = ai * m' mod b */
-    ui = (a->dp[ix] * mp) & MP_MASK;
+    mu = (x->dp[ix] * rho) & MP_MASK;
-    /* a = a + ui * m * b^i */
+    /* a = a + mu * m * b**i */
    {
      register int iy;
-      register mp_digit *tmpx, *tmpy, mu;
+      register mp_digit *tmpn, *tmpx, u;
      register mp_word r;
      /* aliases */
-      tmpx = m->dp;
+      tmpn = n->dp;
-      tmpy = a->dp + ix;
+      tmpx = x->dp + ix;
-      mu = 0;
+      /* set the carry to zero */
-      for (iy = 0; iy < m->used; iy++) {
+      u = 0;
-        r = ((mp_word) ui) * ((mp_word) * tmpx++) + ((mp_word) mu) + ((mp_word) * tmpy);
+      
-        mu = (r >> ((mp_word) DIGIT_BIT));
+      /* Multiply and add in place */
-        *tmpy++ = (r & ((mp_word) MP_MASK));
+      for (iy = 0; iy < n->used; iy++) {
        r = ((mp_word) mu) * ((mp_word) * tmpn++) + 
            ((mp_word) u) + ((mp_word) * tmpx);
        u = (r >> ((mp_word) DIGIT_BIT));
        *tmpx++ = (r & ((mp_word) MP_MASK));
      }
      /* propagate carries */
-      while (mu) {
+      while (u) {
-        *tmpy += mu;
+        *tmpx += u;
-        mu = (*tmpy >> DIGIT_BIT) & 1;
+        u = *tmpx >> DIGIT_BIT;
-        *tmpy++ &= MP_MASK;
+        *tmpx++ &= MP_MASK;
      }
    }
  }
-  /* A = A/b^n */
+  /* x = x/b**n.used */
-  mp_rshd (a, m->used);
+  mp_rshd (x, n->used);
  /* if A >= m then A = A - m */
-  if (mp_cmp_mag (a, m) != MP_LT) {
+  if (mp_cmp_mag (x, n) != MP_LT) {
-    return s_mp_sub (a, m, a);
+    return s_mp_sub (x, n, x);
  }
  return MP_OKAY;
--- a/bn_mp_montgomery_setup.c
+++ b/bn_mp_montgomery_setup.c
@ -16,38 +16,38 @@
 /* setups the montgomery reduction stuff */
 int
-mp_montgomery_setup (mp_int * a, mp_digit * mp)
+mp_montgomery_setup (mp_int * n, mp_digit * rho)
 {
  mp_digit x, b;
-/* fast inversion mod 2^k
+/* fast inversion mod 2**k
 *
 * Based on the fact that
 *
- * XA = 1 (mod 2^n)  =>  (X(2-XA)) A = 1 (mod 2^2n)
+ * XA = 1 (mod 2**n)  =>  (X(2-XA)) A = 1 (mod 2**2n)
- *                   =>  2*X*A - X*X*A*A = 1
+ *                    =>  2*X*A - X*X*A*A = 1
- *                   =>  2*(1) - (1)     = 1
+ *                    =>  2*(1) - (1)     = 1
 */
-  b = a->dp[0];
+  b = n->dp[0];
  if ((b & 1) == 0) {
    return MP_VAL;
  }
-  x = (((b + 2) & 4) << 1) + b; /* here x*a==1 mod 2^4 */
+  x = (((b + 2) & 4) << 1) + b; /* here x*a==1 mod 2**4 */
-  x *= 2 - b * x;               /* here x*a==1 mod 2^8 */
+  x *= 2 - b * x;               /* here x*a==1 mod 2**8 */
 #if !defined(MP_8BIT)
-  x *= 2 - b * x;               /* here x*a==1 mod 2^16; each step doubles the nb of bits */
+  x *= 2 - b * x;               /* here x*a==1 mod 2**16 */
 #endif
 #if defined(MP_64BIT) || !(defined(MP_8BIT) || defined(MP_16BIT))
-  x *= 2 - b * x;               /* here x*a==1 mod 2^32 */
+  x *= 2 - b * x;               /* here x*a==1 mod 2**32 */
 #endif
 #ifdef MP_64BIT
-  x *= 2 - b * x;               /* here x*a==1 mod 2^64 */
+  x *= 2 - b * x;               /* here x*a==1 mod 2**64 */
 #endif
-  /* t = -1/m mod b */
+  /* rho = -1/m mod b */
-  *mp = (((mp_digit) 1 << ((mp_digit) DIGIT_BIT)) - x) & MP_MASK;
+  *rho = (((mp_digit) 1 << ((mp_digit) DIGIT_BIT)) - x) & MP_MASK;
  return MP_OKAY;
 }
--- a/bn_mp_mul.c
+++ b/bn_mp_mul.c
@ -20,19 +20,24 @@ mp_mul (mp_int * a, mp_int * b, mp_int * c)
 {
  int     res, neg;
  neg = (a->sign == b->sign) ? MP_ZPOS : MP_NEG;
-  if (MIN (a->used, b->used) > KARATSUBA_MUL_CUTOFF) {
+  
  if (MIN (a->used, b->used) >= TOOM_MUL_CUTOFF) {
    res = mp_toom_mul(a, b, c);
  } else if (MIN (a->used, b->used) >= KARATSUBA_MUL_CUTOFF) {
    res = mp_karatsuba_mul (a, b, c);
  } else {
    /* can we use the fast multiplier?
     *
-     * The fast multiplier can be used if the output will have less than
+     * The fast multiplier can be used if the output will 
-     * MP_WARRAY digits and the number of digits won't affect carry propagation
+     * have less than MP_WARRAY digits and the number of 
     * digits won't affect carry propagation
     */
    int     digs = a->used + b->used + 1;
-    if ((digs < MP_WARRAY)
+    if ((digs < MP_WARRAY) &&
-        && MIN(a->used, b->used) <= (1 << ((CHAR_BIT * sizeof (mp_word)) - (2 * DIGIT_BIT)))) {
+        MIN(a->used, b->used) <= 
        (1 << ((CHAR_BIT * sizeof (mp_word)) - (2 * DIGIT_BIT)))) {
      res = fast_s_mp_mul_digs (a, b, c, digs);
    } else {
      res = s_mp_mul (a, b, c);
--- a/bn_mp_reduce.c
+++ b/bn_mp_reduce.c
@ -14,22 +14,8 @@
 */
 #include <tommath.h>
-/* pre-calculate the value required for Barrett reduction
+/* reduces x mod m, assumes 0 < x < m**2, mu is 
- * For a given modulus "b" it calulates the value required in "a"
+ * precomputed via mp_reduce_setup.
 */
 int
 mp_reduce_setup (mp_int * a, mp_int * b)
 {
  int     res;
  if ((res = mp_2expt (a, b->used * 2 * DIGIT_BIT)) != MP_OKAY) {
    return res;
  }
  res = mp_div (a, b, a, NULL);
  return res;
 }
 /* reduces x mod m, assumes 0 < x < m^2, mu is precomputed via mp_reduce_setup
 * From HAC pp.604 Algorithm 14.42
 */
 int
@ -38,11 +24,12 @@ mp_reduce (mp_int * x, mp_int * m, mp_int * mu)
  mp_int  q;
  int     res, um = m->used;
  /* q = x */
  if ((res = mp_init_copy (&q, x)) != MP_OKAY) {
    return res;
  }
-  /* q1 = x / b^(k-1)  */
+  /* q1 = x / b**(k-1)  */
  mp_rshd (&q, um - 1);         
  /* according to HAC this is optimization is ok */
@ -56,15 +43,15 @@ mp_reduce (mp_int * x, mp_int * m, mp_int * mu)
    }
  }
-  /* q3 = q2 / b^(k+1) */
+  /* q3 = q2 / b**(k+1) */
  mp_rshd (&q, um + 1);         
-  /* x = x mod b^(k+1), quick (no division) */
+  /* x = x mod b**(k+1), quick (no division) */
  if ((res = mp_mod_2d (x, DIGIT_BIT * (um + 1), x)) != MP_OKAY) {
    goto CLEANUP;
  }
-  /* q = q * m mod b^(k+1), quick (no division) */
+  /* q = q * m mod b**(k+1), quick (no division) */
  if ((res = s_mp_mul_digs (&q, m, &q, um + 1)) != MP_OKAY) {
    goto CLEANUP;
  }
@ -74,7 +61,7 @@ mp_reduce (mp_int * x, mp_int * m, mp_int * mu)
    goto CLEANUP;
  }
-  /* If x < 0, add b^(k+1) to it */
+  /* If x < 0, add b**(k+1) to it */
  if (mp_cmp_d (x, 0) == MP_LT) {
    mp_set (&q, 1);
    if ((res = mp_lshd (&q, um + 1)) != MP_OKAY)
@ -89,7 +76,7 @@ mp_reduce (mp_int * x, mp_int * m, mp_int * mu)
      break;
    }
  }
-
+  
 CLEANUP:
  mp_clear (&q);
--- a/bn_mp_reduce_2k.c
+++ b/bn_mp_reduce_2k.c
@ -0,0 +1,56 @@
 /* LibTomMath, multiple-precision integer library -- Tom St Denis
 *
 * LibTomMath is library that provides for multiple-precision
 * integer arithmetic as well as number theoretic functionality.
 *
 * The library is designed directly after the MPI library by
 * Michael Fromberger but has been written from scratch with
 * additional optimizations in place.
 *
 * The library is free for all purposes without any express
 * guarantee it works.
 *
 * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
 */
 #include <tommath.h>
 /* reduces a modulo n where n is of the form 2**p - k */
 int
 mp_reduce_2k(mp_int *a, mp_int *n, mp_digit k)
 {
   mp_int q;
   int    p, res;
   if ((res = mp_init(&q)) != MP_OKAY) {
      return res;
   }
   p = mp_count_bits(n);    
 top:
   /* q = a/2**p, a = a mod 2**p */
   if ((res = mp_div_2d(a, p, &q, a)) != MP_OKAY) {
      goto ERR;
   }
   if (k != 1) {
      /* q = q * k */
      if ((res = mp_mul_d(&q, k, &q)) != MP_OKAY) { 
         goto ERR;
      }
   }
   /* a = a + q */
   if ((res = s_mp_add(a, &q, a)) != MP_OKAY) {
      goto ERR;
   }
   if (mp_cmp_mag(a, n) != MP_LT) {
      s_mp_sub(a, n, a);
      goto top;
   }
 ERR:
   mp_clear(&q);
   return res;
 }
--- a/bn_mp_reduce_2k_setup.c
+++ b/bn_mp_reduce_2k_setup.c
@ -0,0 +1,42 @@
 /* LibTomMath, multiple-precision integer library -- Tom St Denis
 *
 * LibTomMath is library that provides for multiple-precision
 * integer arithmetic as well as number theoretic functionality.
 *
 * The library is designed directly after the MPI library by
 * Michael Fromberger but has been written from scratch with
 * additional optimizations in place.
 *
 * The library is free for all purposes without any express
 * guarantee it works.
 *
 * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
 */
 #include <tommath.h>
 /* determines the setup value */
 int 
 mp_reduce_2k_setup(mp_int *a, mp_digit *d)
 {
   int res, p;
   mp_int tmp;
   if ((res = mp_init(&tmp)) != MP_OKAY) {
      return res;
   }
   p = mp_count_bits(a);
   if ((res = mp_2expt(&tmp, p)) != MP_OKAY) {
      mp_clear(&tmp);
      return res;
   }
   if ((res = s_mp_sub(&tmp, a, &tmp)) != MP_OKAY) {
      mp_clear(&tmp);
      return res;
   }
   *d = tmp.dp[0];
   mp_clear(&tmp);
   return MP_OKAY;
 }
--- a/bn_mp_reduce_is_2k.c
+++ b/bn_mp_reduce_is_2k.c
@ -0,0 +1,37 @@
 /* LibTomMath, multiple-precision integer library -- Tom St Denis
 *
 * LibTomMath is library that provides for multiple-precision
 * integer arithmetic as well as number theoretic functionality.
 *
 * The library is designed directly after the MPI library by
 * Michael Fromberger but has been written from scratch with
 * additional optimizations in place.
 *
 * The library is free for all purposes without any express
 * guarantee it works.
 *
 * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
 */
 #include <tommath.h>
 /* determines if mp_reduce_2k can be used */
 int 
 mp_reduce_is_2k(mp_int *a)
 {
   int ix, iy;
   if (a->used == 0) {
      return 0;
   } else if (a->used == 1) {
      return 1;
   } else if (a->used > 1) {
      iy = mp_count_bits(a);
      for (ix = DIGIT_BIT; ix < iy; ix++) {
          if ((a->dp[ix/DIGIT_BIT] & ((mp_digit)1 << (mp_digit)(ix % DIGIT_BIT))) == 0) {
             return 0;
          }
      }
   }
   return 1;
 }
--- a/bn_mp_reduce_setup.c
+++ b/bn_mp_reduce_setup.c
@ -0,0 +1,29 @@
 /* LibTomMath, multiple-precision integer library -- Tom St Denis
 *
 * LibTomMath is library that provides for multiple-precision
 * integer arithmetic as well as number theoretic functionality.
 *
 * The library is designed directly after the MPI library by
 * Michael Fromberger but has been written from scratch with
 * additional optimizations in place.
 *
 * The library is free for all purposes without any express
 * guarantee it works.
 *
 * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
 */
 #include <tommath.h>
 /* pre-calculate the value required for Barrett reduction
 * For a given modulus "b" it calulates the value required in "a"
 */
 int
 mp_reduce_setup (mp_int * a, mp_int * b)
 {
  int     res;
  if ((res = mp_2expt (a, b->used * 2 * DIGIT_BIT)) != MP_OKAY) {
    return res;
  }
  return mp_div (a, b, a, NULL);
 }
--- a/bn_mp_rshd.c
+++ b/bn_mp_rshd.c
@ -32,15 +32,15 @@ mp_rshd (mp_int * a, int b)
  }
  {
-    register mp_digit *tmpa, *tmpaa;
+    register mp_digit *bottom, *top;
    /* shift the digits down */
-    /* base */
+    /* bottom */
-    tmpa = a->dp;
+    bottom = a->dp;
-    /* offset into digits */
+    /* top [offset into digits] */
-    tmpaa = a->dp + b;
+    top = a->dp + b;
    /* this is implemented as a sliding window where 
     * the window is b-digits long and digits from 
@ -53,13 +53,15 @@ mp_rshd (mp_int * a, int b)
                  \-------------------/      ---->
     */
    for (x = 0; x < (a->used - b); x++) {
-      *tmpa++ = *tmpaa++;
+      *bottom++ = *top++;
    }
    /* zero the top digits */
    for (; x < a->used; x++) {
-      *tmpa++ = 0;
+      *bottom++ = 0;
    }
  }
-  mp_clamp (a);
+  
  /* remove excess digits */
  a->used -= b;
 }
--- a/bn_mp_set_int.c
+++ b/bn_mp_set_int.c
@ -35,7 +35,7 @@ mp_set_int (mp_int * a, unsigned int b)
    b <<= 4;
    /* ensure that digits are not clamped off */
-    a->used += 32 / DIGIT_BIT + 2;
+    a->used += 1;
  }
  mp_clamp (a);
  return MP_OKAY;
--- a/bn_mp_sqr.c
+++ b/bn_mp_sqr.c
@ -19,12 +19,16 @@ int
 mp_sqr (mp_int * a, mp_int * b)
 {
  int     res;
-  if (a->used > KARATSUBA_SQR_CUTOFF) {
+  if (a->used >= TOOM_SQR_CUTOFF) {
    res = mp_toom_sqr(a, b);
  } else if (a->used >= KARATSUBA_SQR_CUTOFF) {
    res = mp_karatsuba_sqr (a, b);
  } else {
    /* can we use the fast multiplier? */
-    if ((a->used * 2 + 1) < 512 && a->used < (1 << (sizeof(mp_word) * CHAR_BIT - 2*DIGIT_BIT - 1))) {
+    if ((a->used * 2 + 1) < MP_WARRAY && 
         a->used < 
         (1 << (sizeof(mp_word) * CHAR_BIT - 2*DIGIT_BIT - 1))) {
      res = fast_s_mp_sqr (a, b);
    } else {
      res = s_mp_sqr (a, b);
--- a/bn_mp_toom_mul.c
+++ b/bn_mp_toom_mul.c
@ -0,0 +1,268 @@
 /* LibTomMath, multiple-precision integer library -- Tom St Denis
 *
 * LibTomMath is library that provides for multiple-precision
 * integer arithmetic as well as number theoretic functionality.
 *
 * The library is designed directly after the MPI library by
 * Michael Fromberger but has been written from scratch with
 * additional optimizations in place.
 *
 * The library is free for all purposes without any express
 * guarantee it works.
 *
 * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
 */
 #include <tommath.h>
 /* multiplication using Toom-Cook 3-way algorithm */
 int 
 mp_toom_mul(mp_int *a, mp_int *b, mp_int *c)
 {
    mp_int w0, w1, w2, w3, w4, tmp1, tmp2, a0, a1, a2, b0, b1, b2;
    int res, B;
    /* init temps */
    if ((res = mp_init_multi(&w0, &w1, &w2, &w3, &w4, &a0, &a1, &a2, &b0, &b1, &b2, &tmp1, &tmp2, NULL)) != MP_OKAY) {
       return res;
    }
    /* B */
    B = MIN(a->used, b->used) / 3;
    /* a = a2 * B^2 + a1 * B + a0 */
    if ((res = mp_mod_2d(a, DIGIT_BIT * B, &a0)) != MP_OKAY) {
       goto ERR;
    }
    if ((res = mp_copy(a, &a1)) != MP_OKAY) {
       goto ERR;
    }
    mp_rshd(&a1, B);
    mp_mod_2d(&a1, DIGIT_BIT * B, &a1);
    if ((res = mp_copy(a, &a2)) != MP_OKAY) {
       goto ERR;
    }
    mp_rshd(&a2, B*2);
    /* b = b2 * B^2 + b1 * B + b0 */
    if ((res = mp_mod_2d(b, DIGIT_BIT * B, &b0)) != MP_OKAY) {
       goto ERR;
    }
    if ((res = mp_copy(b, &b1)) != MP_OKAY) {
       goto ERR;
    }
    mp_rshd(&b1, B);
    mp_mod_2d(&b1, DIGIT_BIT * B, &b1);
    if ((res = mp_copy(b, &b2)) != MP_OKAY) {
       goto ERR;
    }
    mp_rshd(&b2, B*2);
    /* w0 = a0*b0 */
    if ((res = mp_mul(&a0, &b0, &w0)) != MP_OKAY) {
       goto ERR;
    }
    /* w4 = a2 * b2 */
    if ((res = mp_mul(&a2, &b2, &w4)) != MP_OKAY) {
       goto ERR;
    }
    /* w1 = (a2 + 2(a1 + 2a0))(b2 + 2(b1 + 2b0)) */
    if ((res = mp_mul_2(&a0, &tmp1)) != MP_OKAY) {
       goto ERR;
    }
    if ((res = mp_add(&tmp1, &a1, &tmp1)) != MP_OKAY) {
       goto ERR;
    }
    if ((res = mp_mul_2(&tmp1, &tmp1)) != MP_OKAY) {
       goto ERR;
    }
    if ((res = mp_add(&tmp1, &a2, &tmp1)) != MP_OKAY) {
       goto ERR;
    }
    if ((res = mp_mul_2(&b0, &tmp2)) != MP_OKAY) {
       goto ERR;
    }
    if ((res = mp_add(&tmp2, &b1, &tmp2)) != MP_OKAY) {
       goto ERR;
    }
    if ((res = mp_mul_2(&tmp2, &tmp2)) != MP_OKAY) {
       goto ERR;
    }
    if ((res = mp_add(&tmp2, &b2, &tmp2)) != MP_OKAY) {
       goto ERR;
    }
    if ((res = mp_mul(&tmp1, &tmp2, &w1)) != MP_OKAY) {
       goto ERR;
    }
    /* w3 = (a0 + 2(a1 + 2a2))(b0 + 2(b1 + 2b2)) */
    if ((res = mp_mul_2(&a2, &tmp1)) != MP_OKAY) {
       goto ERR;
    }
    if ((res = mp_add(&tmp1, &a1, &tmp1)) != MP_OKAY) {
       goto ERR;
    }
    if ((res = mp_mul_2(&tmp1, &tmp1)) != MP_OKAY) {
       goto ERR;
    }
    if ((res = mp_add(&tmp1, &a0, &tmp1)) != MP_OKAY) {
       goto ERR;
    }
    if ((res = mp_mul_2(&b2, &tmp2)) != MP_OKAY) {
       goto ERR;
    }
    if ((res = mp_add(&tmp2, &b1, &tmp2)) != MP_OKAY) {
       goto ERR;
    }
    if ((res = mp_mul_2(&tmp2, &tmp2)) != MP_OKAY) {
       goto ERR;
    }
    if ((res = mp_add(&tmp2, &b0, &tmp2)) != MP_OKAY) {
       goto ERR;
    }
    if ((res = mp_mul(&tmp1, &tmp2, &w3)) != MP_OKAY) {
       goto ERR;
    }
    /* w2 = (a2 + a1 + a0)(b2 + b1 + b0) */
    if ((res = mp_add(&a2, &a1, &tmp1)) != MP_OKAY) {
       goto ERR;
    }
    if ((res = mp_add(&tmp1, &a0, &tmp1)) != MP_OKAY) {
       goto ERR;
    }
    if ((res = mp_add(&b2, &b1, &tmp2)) != MP_OKAY) {
       goto ERR;
    }
    if ((res = mp_add(&tmp2, &b0, &tmp2)) != MP_OKAY) {
       goto ERR;
    }
    if ((res = mp_mul(&tmp1, &tmp2, &w2)) != MP_OKAY) {
       goto ERR;
    }
    /* now solve the matrix 
       0  0  0  0  1
       1  2  4  8  16
       1  1  1  1  1
       16 8  4  2  1
       1  0  0  0  0
       using 12 subtractions, 4 shifts, 2 small divisions and 1 small multiplication 
     */
     /* r1 - r4 */
     if ((res = mp_sub(&w1, &w4, &w1)) != MP_OKAY) {
        goto ERR;
     }
     /* r3 - r0 */
     if ((res = mp_sub(&w3, &w0, &w3)) != MP_OKAY) {
        goto ERR;
     }
     /* r1/2 */
     if ((res = mp_div_2(&w1, &w1)) != MP_OKAY) {
        goto ERR;
     }
     /* r3/2 */
     if ((res = mp_div_2(&w3, &w3)) != MP_OKAY) {
        goto ERR;
     }
     /* r2 - r0 - r4 */
     if ((res = mp_sub(&w2, &w0, &w2)) != MP_OKAY) {
        goto ERR;
     }
     if ((res = mp_sub(&w2, &w4, &w2)) != MP_OKAY) {
        goto ERR;
     }
     /* r1 - r2 */
     if ((res = mp_sub(&w1, &w2, &w1)) != MP_OKAY) {
        goto ERR;
     }
     /* r3 - r2 */
     if ((res = mp_sub(&w3, &w2, &w3)) != MP_OKAY) {
        goto ERR;
     }
     /* r1 - 8r0 */
     if ((res = mp_mul_2d(&w0, 3, &tmp1)) != MP_OKAY) {
        goto ERR;
     }
     if ((res = mp_sub(&w1, &tmp1, &w1)) != MP_OKAY) {
        goto ERR;
     }
     /* r3 - 8r4 */
     if ((res = mp_mul_2d(&w4, 3, &tmp1)) != MP_OKAY) {
        goto ERR;
     }
     if ((res = mp_sub(&w3, &tmp1, &w3)) != MP_OKAY) {
        goto ERR;
     }
     /* 3r2 - r1 - r3 */
     if ((res = mp_mul_d(&w2, 3, &w2)) != MP_OKAY) {
        goto ERR;
     }
     if ((res = mp_sub(&w2, &w1, &w2)) != MP_OKAY) {
        goto ERR;
     }
     if ((res = mp_sub(&w2, &w3, &w2)) != MP_OKAY) {
        goto ERR;
     }
     /* r1 - r2 */
     if ((res = mp_sub(&w1, &w2, &w1)) != MP_OKAY) {
        goto ERR;
     }
     /* r3 - r2 */
     if ((res = mp_sub(&w3, &w2, &w3)) != MP_OKAY) {
        goto ERR;
     }
     /* r1/3 */
     if ((res = mp_div_3(&w1, &w1, NULL)) != MP_OKAY) {
        goto ERR;
     }
     /* r3/3 */
     if ((res = mp_div_3(&w3, &w3, NULL)) != MP_OKAY) {
        goto ERR;
     }
     /* at this point shift W[n] by B*n */
     if ((res = mp_lshd(&w1, 1*B)) != MP_OKAY) {
        goto ERR;
     }
     if ((res = mp_lshd(&w2, 2*B)) != MP_OKAY) {
        goto ERR;
     }
     if ((res = mp_lshd(&w3, 3*B)) != MP_OKAY) {
        goto ERR;
     }
     if ((res = mp_lshd(&w4, 4*B)) != MP_OKAY) {
        goto ERR;
     }     
     if ((res = mp_add(&w0, &w1, c)) != MP_OKAY) {
        goto ERR;
     }
     if ((res = mp_add(&w2, &w3, &tmp1)) != MP_OKAY) {
        goto ERR;
     }
     if ((res = mp_add(&w4, &tmp1, &tmp1)) != MP_OKAY) {
        goto ERR;
     }
     if ((res = mp_add(&tmp1, c, c)) != MP_OKAY) {
        goto ERR;
     }     
 ERR:
     mp_clear_multi(&w0, &w1, &w2, &w3, &w4, &a0, &a1, &a2, &b0, &b1, &b2, &tmp1, &tmp2, NULL);
     return res;
 }     
--- a/bn_mp_toom_sqr.c
+++ b/bn_mp_toom_sqr.c
@ -0,0 +1,220 @@
 /* LibTomMath, multiple-precision integer library -- Tom St Denis
 *
 * LibTomMath is library that provides for multiple-precision
 * integer arithmetic as well as number theoretic functionality.
 *
 * The library is designed directly after the MPI library by
 * Michael Fromberger but has been written from scratch with
 * additional optimizations in place.
 *
 * The library is free for all purposes without any express
 * guarantee it works.
 *
 * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
 */
 #include <tommath.h>
 /* squaring using Toom-Cook 3-way algorithm */
 int 
 mp_toom_sqr(mp_int *a, mp_int *b)
 {
    mp_int w0, w1, w2, w3, w4, tmp1, a0, a1, a2;
    int res, B;
    /* init temps */
    if ((res = mp_init_multi(&w0, &w1, &w2, &w3, &w4, &a0, &a1, &a2, &tmp1, NULL)) != MP_OKAY) {
       return res;
    }
    /* B */
    B = a->used / 3;
    /* a = a2 * B^2 + a1 * B + a0 */
    if ((res = mp_mod_2d(a, DIGIT_BIT * B, &a0)) != MP_OKAY) {
       goto ERR;
    }
    if ((res = mp_copy(a, &a1)) != MP_OKAY) {
       goto ERR;
    }
    mp_rshd(&a1, B);
    mp_mod_2d(&a1, DIGIT_BIT * B, &a1);
    if ((res = mp_copy(a, &a2)) != MP_OKAY) {
       goto ERR;
    }
    mp_rshd(&a2, B*2);
    /* w0 = a0*a0 */
    if ((res = mp_sqr(&a0, &w0)) != MP_OKAY) {
       goto ERR;
    }
    /* w4 = a2 * a2 */
    if ((res = mp_sqr(&a2, &w4)) != MP_OKAY) {
       goto ERR;
    }
    /* w1 = (a2 + 2(a1 + 2a0))**2 */
    if ((res = mp_mul_2(&a0, &tmp1)) != MP_OKAY) {
       goto ERR;
    }
    if ((res = mp_add(&tmp1, &a1, &tmp1)) != MP_OKAY) {
       goto ERR;
    }
    if ((res = mp_mul_2(&tmp1, &tmp1)) != MP_OKAY) {
       goto ERR;
    }
    if ((res = mp_add(&tmp1, &a2, &tmp1)) != MP_OKAY) {
       goto ERR;
    }
    if ((res = mp_sqr(&tmp1, &w1)) != MP_OKAY) {
       goto ERR;
    }
    /* w3 = (a0 + 2(a1 + 2a2))**2 */
    if ((res = mp_mul_2(&a2, &tmp1)) != MP_OKAY) {
       goto ERR;
    }
    if ((res = mp_add(&tmp1, &a1, &tmp1)) != MP_OKAY) {
       goto ERR;
    }
    if ((res = mp_mul_2(&tmp1, &tmp1)) != MP_OKAY) {
       goto ERR;
    }
    if ((res = mp_add(&tmp1, &a0, &tmp1)) != MP_OKAY) {
       goto ERR;
    }
    if ((res = mp_sqr(&tmp1, &w3)) != MP_OKAY) {
       goto ERR;
    }
    /* w2 = (a2 + a1 + a0)**2 */
    if ((res = mp_add(&a2, &a1, &tmp1)) != MP_OKAY) {
       goto ERR;
    }
    if ((res = mp_add(&tmp1, &a0, &tmp1)) != MP_OKAY) {
       goto ERR;
    }
    if ((res = mp_sqr(&tmp1, &w2)) != MP_OKAY) {
       goto ERR;
    }
    /* now solve the matrix 
       0  0  0  0  1
       1  2  4  8  16
       1  1  1  1  1
       16 8  4  2  1
       1  0  0  0  0
       using 12 subtractions, 4 shifts, 2 small divisions and 1 small multiplication.
     */
     /* r1 - r4 */
     if ((res = mp_sub(&w1, &w4, &w1)) != MP_OKAY) {
        goto ERR;
     }
     /* r3 - r0 */
     if ((res = mp_sub(&w3, &w0, &w3)) != MP_OKAY) {
        goto ERR;
     }
     /* r1/2 */
     if ((res = mp_div_2(&w1, &w1)) != MP_OKAY) {
        goto ERR;
     }
     /* r3/2 */
     if ((res = mp_div_2(&w3, &w3)) != MP_OKAY) {
        goto ERR;
     }
     /* r2 - r0 - r4 */
     if ((res = mp_sub(&w2, &w0, &w2)) != MP_OKAY) {
        goto ERR;
     }
     if ((res = mp_sub(&w2, &w4, &w2)) != MP_OKAY) {
        goto ERR;
     }
     /* r1 - r2 */
     if ((res = mp_sub(&w1, &w2, &w1)) != MP_OKAY) {
        goto ERR;
     }
     /* r3 - r2 */
     if ((res = mp_sub(&w3, &w2, &w3)) != MP_OKAY) {
        goto ERR;
     }
     /* r1 - 8r0 */
     if ((res = mp_mul_2d(&w0, 3, &tmp1)) != MP_OKAY) {
        goto ERR;
     }
     if ((res = mp_sub(&w1, &tmp1, &w1)) != MP_OKAY) {
        goto ERR;
     }
     /* r3 - 8r4 */
     if ((res = mp_mul_2d(&w4, 3, &tmp1)) != MP_OKAY) {
        goto ERR;
     }
     if ((res = mp_sub(&w3, &tmp1, &w3)) != MP_OKAY) {
        goto ERR;
     }
     /* 3r2 - r1 - r3 */
     if ((res = mp_mul_d(&w2, 3, &w2)) != MP_OKAY) {
        goto ERR;
     }
     if ((res = mp_sub(&w2, &w1, &w2)) != MP_OKAY) {
        goto ERR;
     }
     if ((res = mp_sub(&w2, &w3, &w2)) != MP_OKAY) {
        goto ERR;
     }
     /* r1 - r2 */
     if ((res = mp_sub(&w1, &w2, &w1)) != MP_OKAY) {
        goto ERR;
     }
     /* r3 - r2 */
     if ((res = mp_sub(&w3, &w2, &w3)) != MP_OKAY) {
        goto ERR;
     }
     /* r1/3 */
     if ((res = mp_div_3(&w1, &w1, NULL)) != MP_OKAY) {
        goto ERR;
     }
     /* r3/3 */
     if ((res = mp_div_3(&w3, &w3, NULL)) != MP_OKAY) {
        goto ERR;
     }
     /* at this point shift W[n] by B*n */
     if ((res = mp_lshd(&w1, 1*B)) != MP_OKAY) {
        goto ERR;
     }
     if ((res = mp_lshd(&w2, 2*B)) != MP_OKAY) {
        goto ERR;
     }
     if ((res = mp_lshd(&w3, 3*B)) != MP_OKAY) {
        goto ERR;
     }
     if ((res = mp_lshd(&w4, 4*B)) != MP_OKAY) {
        goto ERR;
     }     
     if ((res = mp_add(&w0, &w1, b)) != MP_OKAY) {
        goto ERR;
     }
     if ((res = mp_add(&w2, &w3, &tmp1)) != MP_OKAY) {
        goto ERR;
     }
     if ((res = mp_add(&w4, &tmp1, &tmp1)) != MP_OKAY) {
        goto ERR;
     }
     if ((res = mp_add(&tmp1, b, b)) != MP_OKAY) {
        goto ERR;
     }     
 ERR:
     mp_clear_multi(&w0, &w1, &w2, &w3, &w4, &a0, &a1, &a2, &tmp1, NULL);
     return res;
 }     
--- a/bn_s_mp_add.c
+++ b/bn_s_mp_add.c
@ -45,7 +45,6 @@ s_mp_add (mp_int * a, mp_int * b, mp_int * c)
  olduse = c->used;
  c->used = max + 1;
  /* set the carry to zero */
  {
    register mp_digit u, *tmpa, *tmpb, *tmpc;
    register int i;
--- a/bn_s_mp_exptmod.c
+++ b/bn_s_mp_exptmod.c
@ -0,0 +1,211 @@
 /* LibTomMath, multiple-precision integer library -- Tom St Denis
 *
 * LibTomMath is library that provides for multiple-precision
 * integer arithmetic as well as number theoretic functionality.
 *
 * The library is designed directly after the MPI library by
 * Michael Fromberger but has been written from scratch with
 * additional optimizations in place.
 *
 * The library is free for all purposes without any express
 * guarantee it works.
 *
 * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
 */
 #include <tommath.h>
 int
 s_mp_exptmod (mp_int * G, mp_int * X, mp_int * P, mp_int * Y)
 {
  mp_int  M[256], res, mu;
  mp_digit buf;
  int     err, bitbuf, bitcpy, bitcnt, mode, digidx, x, y, winsize;
  /* find window size */
  x = mp_count_bits (X);
  if (x <= 7) {
    winsize = 2;
  } else if (x <= 36) {
    winsize = 3;
  } else if (x <= 140) {
    winsize = 4;
  } else if (x <= 450) {
    winsize = 5;
  } else if (x <= 1303) {
    winsize = 6;
  } else if (x <= 3529) {
    winsize = 7;
  } else {
    winsize = 8;
  }
 #ifdef MP_LOW_MEM
    if (winsize > 5) {
       winsize = 5;
    }
 #endif
  /* init M array */
  for (x = 0; x < (1 << winsize); x++) {
    if ((err = mp_init_size (&M[x], 1)) != MP_OKAY) {
      for (y = 0; y < x; y++) {
        mp_clear (&M[y]);
      }
      return err;
    }
  }
  /* create mu, used for Barrett reduction */
  if ((err = mp_init (&mu)) != MP_OKAY) {
    goto __M;
  }
  if ((err = mp_reduce_setup (&mu, P)) != MP_OKAY) {
    goto __MU;
  }
  /* create M table
   *
   * The M table contains powers of the input base, e.g. M[x] = G**x mod P
   *
   * The first half of the table is not computed though accept for M[0] and M[1]
   */
  if ((err = mp_mod (G, P, &M[1])) != MP_OKAY) {
    goto __MU;
  }
  /* compute the value at M[1<<(winsize-1)] by squaring M[1] (winsize-1) times */
  if ((err = mp_copy (&M[1], &M[1 << (winsize - 1)])) != MP_OKAY) {
    goto __MU;
  }
  for (x = 0; x < (winsize - 1); x++) {
    if ((err = mp_sqr (&M[1 << (winsize - 1)], &M[1 << (winsize - 1)])) != MP_OKAY) {
      goto __MU;
    }
    if ((err = mp_reduce (&M[1 << (winsize - 1)], P, &mu)) != MP_OKAY) {
      goto __MU;
    }
  }
  /* create upper table */
  for (x = (1 << (winsize - 1)) + 1; x < (1 << winsize); x++) {
    if ((err = mp_mul (&M[x - 1], &M[1], &M[x])) != MP_OKAY) {
      goto __MU;
    }
    if ((err = mp_reduce (&M[x], P, &mu)) != MP_OKAY) {
      goto __MU;
    }
  }
  /* setup result */
  if ((err = mp_init (&res)) != MP_OKAY) {
    goto __MU;
  }
  mp_set (&res, 1);
  /* set initial mode and bit cnt */
  mode   = 0;
  bitcnt = 1;
  buf    = 0;
  digidx = X->used - 1;
  bitcpy = 0;
  bitbuf = 0;
  for (;;) {
    /* grab next digit as required */
    if (--bitcnt == 0) {
      if (digidx == -1) {
        break;
      }
      buf = X->dp[digidx--];
      bitcnt = (int) DIGIT_BIT;
    }
    /* grab the next msb from the exponent */
    y = (buf >> (mp_digit)(DIGIT_BIT - 1)) & 1;
    buf <<= (mp_digit)1;
    /* if the bit is zero and mode == 0 then we ignore it
     * These represent the leading zero bits before the first 1 bit
     * in the exponent.  Technically this opt is not required but it
     * does lower the # of trivial squaring/reductions used
     */
    if (mode == 0 && y == 0)
      continue;
    /* if the bit is zero and mode == 1 then we square */
    if (mode == 1 && y == 0) {
      if ((err = mp_sqr (&res, &res)) != MP_OKAY) {
        goto __RES;
      }
      if ((err = mp_reduce (&res, P, &mu)) != MP_OKAY) {
        goto __RES;
      }
      continue;
    }
    /* else we add it to the window */
    bitbuf |= (y << (winsize - ++bitcpy));
    mode = 2;
    if (bitcpy == winsize) {
      /* ok window is filled so square as required and multiply  */
      /* square first */
      for (x = 0; x < winsize; x++) {
        if ((err = mp_sqr (&res, &res)) != MP_OKAY) {
          goto __RES;
        }
        if ((err = mp_reduce (&res, P, &mu)) != MP_OKAY) {
          goto __RES;
        }
      }
      /* then multiply */
      if ((err = mp_mul (&res, &M[bitbuf], &res)) != MP_OKAY) {
        goto __MU;
      }
      if ((err = mp_reduce (&res, P, &mu)) != MP_OKAY) {
        goto __MU;
      }
      /* empty window and reset */
      bitcpy = 0;
      bitbuf = 0;
      mode = 1;
    }
  }
  /* if bits remain then square/multiply */
  if (mode == 2 && bitcpy > 0) {
    /* square then multiply if the bit is set */
    for (x = 0; x < bitcpy; x++) {
      if ((err = mp_sqr (&res, &res)) != MP_OKAY) {
        goto __RES;
      }
      if ((err = mp_reduce (&res, P, &mu)) != MP_OKAY) {
        goto __RES;
      }
      bitbuf <<= 1;
      if ((bitbuf & (1 << winsize)) != 0) {
        /* then multiply */
        if ((err = mp_mul (&res, &M[1], &res)) != MP_OKAY) {
          goto __RES;
        }
        if ((err = mp_reduce (&res, P, &mu)) != MP_OKAY) {
          goto __RES;
        }
      }
    }
  }
  mp_exch (&res, Y);
  err = MP_OKAY;
 __RES:mp_clear (&res);
 __MU:mp_clear (&mu);
 __M:
  for (x = 0; x < (1 << winsize); x++) {
    mp_clear (&M[x]);
  }
  return err;
 }
--- a/bn_s_mp_sqr.c
+++ b/bn_s_mp_sqr.c
@ -20,8 +20,8 @@ s_mp_sqr (mp_int * a, mp_int * b)
 {
  mp_int  t;
  int     res, ix, iy, pa;
-  mp_word r, u;
+  mp_word r;
-  mp_digit tmpx, *tmpt;
+  mp_digit u, tmpx, *tmpt;
  pa = a->used;
  if ((res = mp_init_size (&t, pa + pa + 1)) != MP_OKAY) {
@ -32,7 +32,8 @@ s_mp_sqr (mp_int * a, mp_int * b)
  for (ix = 0; ix < pa; ix++) {
    /* first calculate the digit at 2*ix */
    /* calculate double precision result */
-    r = ((mp_word) t.dp[ix + ix]) + ((mp_word) a->dp[ix]) * ((mp_word) a->dp[ix]);
+    r = ((mp_word) t.dp[ix + ix]) + 
        ((mp_word) a->dp[ix]) * ((mp_word) a->dp[ix]);
    /* store lower part in result */
    t.dp[ix + ix] = (mp_digit) (r & ((mp_word) MP_MASK));
@ -44,7 +45,8 @@ s_mp_sqr (mp_int * a, mp_int * b)
    tmpx = a->dp[ix];
    /* alias for where to store the results */
-    tmpt = &(t.dp[ix + ix + 1]);
+    tmpt = t.dp + (ix + ix + 1);
    for (iy = ix + 1; iy < pa; iy++) {
      /* first calculate the product */
      r = ((mp_word) tmpx) * ((mp_word) a->dp[iy]);
@ -60,13 +62,9 @@ s_mp_sqr (mp_int * a, mp_int * b)
      /* get carry */
      u = (r >> ((mp_word) DIGIT_BIT));
    }
    r = ((mp_word) * tmpt) + u;
    *tmpt = (mp_digit) (r & ((mp_word) MP_MASK));
    u = (r >> ((mp_word) DIGIT_BIT));
    /* propagate upwards */
-    ++tmpt;
+    while (u != ((mp_digit) 0)) {
-    while (u != ((mp_word) 0)) {
+      r = ((mp_word) * tmpt) + ((mp_word) u);
      r = ((mp_word) * tmpt) + ((mp_word) 1);
      *tmpt++ = (mp_digit) (r & ((mp_word) MP_MASK));
      u = (r >> ((mp_word) DIGIT_BIT));
    }
--- a/bn_s_mp_sub.c
+++ b/bn_s_mp_sub.c
@ -33,7 +33,6 @@ s_mp_sub (mp_int * a, mp_int * b, mp_int * c)
  olduse = c->used;
  c->used = max;
  /* sub digits from lower part */
  {
    register mp_digit u, *tmpa, *tmpb, *tmpc;
    register int i;
@ -52,7 +51,7 @@ s_mp_sub (mp_int * a, mp_int * b, mp_int * c)
      /* U = carry bit of T[i]
       * Note this saves performing an AND operation since
       * if a carry does occur it will propagate all the way to the
-       * MSB.  As a result a single shift is required to get the carry
+       * MSB.  As a result a single shift is enough to get the carry
       */
      u = *tmpc >> ((mp_digit)(CHAR_BIT * sizeof (mp_digit) - 1));
@ -81,3 +80,4 @@ s_mp_sub (mp_int * a, mp_int * b, mp_int * c)
  mp_clamp (c);
  return MP_OKAY;
 }
--- a/bncore.c
+++ b/bncore.c
@ -18,11 +18,14 @@
 CPU                    /Compiler     /MUL CUTOFF/SQR CUTOFF
 -------------------------------------------------------------
- Intel P4               /GCC v3.2     /        81/       110
+ Intel P4               /GCC v3.2     /        70/       108
 AMD Athlon XP          /GCC v3.2     /       109/       127
 */
 /* configured for a AMD XP Thoroughbred core with etc/tune.c */
 int     KARATSUBA_MUL_CUTOFF = 109,      /* Min. number of digits before Karatsuba multiplication is used. */
-        KARATSUBA_SQR_CUTOFF = 127;      /* Min. number of digits before Karatsuba squaring is used. */
+        KARATSUBA_SQR_CUTOFF = 127,      /* Min. number of digits before Karatsuba squaring is used. */
        TOOM_MUL_CUTOFF      = 350,      /* no optimal values of these are known yet so set em high */
        TOOM_SQR_CUTOFF      = 400; 
--- a/changes.txt
+++ b/changes.txt
@ -1,3 +1,15 @@
 May 29th, 2003
 v0.18  -- Fixed a bug in s_mp_sqr which would handle carries properly just not very elegantly.
          (e.g. correct result, just bad looking code)
       -- Fixed bug in mp_sqr which still had a 512 constant instead of MP_WARRAY
       -- Added Toom-Cook multipliers [needs tuning!]
       -- Added efficient divide by 3 algorithm mp_div_3
       -- Re-wrote mp_div_d to be faster than calling mp_div 
       -- Added in a donated BCC makefile and a single page LTM poster (ahalhabsi@sbcglobal.net)
       -- Added mp_reduce_2k which reduces an input modulo n = 2**p - k for any single digit k
       -- Made the exptmod system be aware of the 2k reduction algorithms.
       -- Rewrote mp_dr_reduce to be smaller, simpler and easier to understand.
 May 17th, 2003
 v0.17  -- Benjamin Goldberg submitted optimized mp_add and mp_sub routines.  A new gen.pl as well
          as several smaller suggestions.  Thanks!
--- a/demo/demo.c
+++ b/demo/demo.c
@ -53,7 +53,7 @@ int main(void)
 #ifdef TIMER
   int n;
   ulong64 tt;
-   FILE *log, *logb;
+   FILE *log, *logb, *logc;
 #endif
   mp_init(&a);
@ -62,11 +62,54 @@ int main(void)
   mp_init(&d);
   mp_init(&e);
   mp_init(&f);   
   srand(time(NULL));
 /* test mp_reduce_2k */
 #if 0
   for (cnt = 3; cnt <= 4096; ++cnt) {
       mp_digit tmp;
       mp_2expt(&a, cnt);
       mp_sub_d(&a, 1, &a);  /* a = 2**cnt - 1 */
       printf("\nTesting %4d bits", cnt);
       printf("(%d)", mp_reduce_is_2k(&a));
       mp_reduce_2k_setup(&a, &tmp);
       printf("(%d)", tmp);
       for (ix = 0; ix < 100000; ix++) {
           if (!(ix & 1023)) {printf("."); fflush(stdout); }
           mp_rand(&b, (cnt/DIGIT_BIT  + 1) * 2);
           mp_copy(&c, &b);
           mp_mod(&c, &a, &c);
           mp_reduce_2k(&b, &a, 1);
           if (mp_cmp(&c, &b)) {
              printf("FAILED\n");
              exit(0);
           }
        }
    }
 #endif
 /* test mp_div_3  */
 #if 0
   for (cnt = 0; cnt < 1000000; ) {
      mp_digit r1, r2;
      if (!(++cnt & 127)) printf("%9d\r", cnt);
      mp_rand(&a, abs(rand()) % 32 + 1);
      mp_div_d(&a, 3, &b, &r1);
      mp_div_3(&a, &c, &r2);
      if (mp_cmp(&b, &c) || r1 != r2) {
         printf("Failure\n");
      }
   }
 #endif     
 /* test the DR reduction */
 #if 0
   srand(time(NULL));
   for (cnt = 2; cnt < 32; cnt++) {
       printf("%d digit modulus\n", cnt);
       mp_grow(&a, cnt);
@ -91,6 +134,7 @@ int main(void)
         if (mp_cmp(&b, &c) != MP_EQ) {
            printf("Failed on trial %lu\n", rr); exit(-1);
         }
      } while (++rr < 1000000);
      printf("Passed DR test for %d digits\n", cnt);
@ -98,6 +142,9 @@ int main(void)
 #endif
 #ifdef TIMER
      /* temp. turn off TOOM */
      TOOM_MUL_CUTOFF = TOOM_SQR_CUTOFF = 100000;
      printf("CLOCKS_PER_SEC == %lu\n", CLOCKS_PER_SEC);
      log = fopen("logs/add.log", "w");
@ -172,9 +219,16 @@ int main(void)
      }
      fclose(log);
   }
-
+  {
   {
      char *primes[] = {
         /* 2K moduli mersenne primes */
         "6864797660130609714981900799081393217269435300143305409394463459185543183397656052122559640661454554977296311391480858037121987999716643812574028291115057151",
         "531137992816767098689588206552468627329593117727031923199444138200403559860852242739162502265229285668889329486246501015346579337652707239409519978766587351943831270835393219031728127",
         "10407932194664399081925240327364085538615262247266704805319112350403608059673360298012239441732324184842421613954281007791383566248323464908139906605677320762924129509389220345773183349661583550472959420547689811211693677147548478866962501384438260291732348885311160828538416585028255604666224831890918801847068222203140521026698435488732958028878050869736186900714720710555703168729087",
         "1475979915214180235084898622737381736312066145333169775147771216478570297878078949377407337049389289382748507531496480477281264838760259191814463365330269540496961201113430156902396093989090226259326935025281409614983499388222831448598601834318536230923772641390209490231836446899608210795482963763094236630945410832793769905399982457186322944729636418890623372171723742105636440368218459649632948538696905872650486914434637457507280441823676813517852099348660847172579408422316678097670224011990280170474894487426924742108823536808485072502240519452587542875349976558572670229633962575212637477897785501552646522609988869914013540483809865681250419497686697771007",
         "259117086013202627776246767922441530941818887553125427303974923161874019266586362086201209516800483406550695241733194177441689509238807017410377709597512042313066624082916353517952311186154862265604547691127595848775610568757931191017711408826252153849035830401185072116424747461823031471398340229288074545677907941037288235820705892351068433882986888616658650280927692080339605869308790500409503709875902119018371991620994002568935113136548829739112656797303241986517250116412703509705427773477972349821676443446668383119322540099648994051790241624056519054483690809616061625743042361721863339415852426431208737266591962061753535748892894599629195183082621860853400937932839420261866586142503251450773096274235376822938649407127700846077124211823080804139298087057504713825264571448379371125032081826126566649084251699453951887789613650248405739378594599444335231188280123660406262468609212150349937584782292237144339628858485938215738821232393687046160677362909315071",
         "190797007524439073807468042969529173669356994749940177394741882673528979787005053706368049835514900244303495954950709725762186311224148828811920216904542206960744666169364221195289538436845390250168663932838805192055137154390912666527533007309292687539092257043362517857366624699975402375462954490293259233303137330643531556539739921926201438606439020075174723029056838272505051571967594608350063404495977660656269020823960825567012344189908927956646011998057988548630107637380993519826582389781888135705408653045219655801758081251164080554609057468028203308718724654081055323215860189611391296030471108443146745671967766308925858547271507311563765171008318248647110097614890313562856541784154881743146033909602737947385055355960331855614540900081456378659068370317267696980001187750995491090350108417050917991562167972281070161305972518044872048331306383715094854938415738549894606070722584737978176686422134354526989443028353644037187375385397838259511833166416134323695660367676897722287918773420968982326089026150031515424165462111337527431154890666327374921446276833564519776797633875503548665093914556482031482248883127023777039667707976559857333357013727342079099064400455741830654320379350833236245819348824064783585692924881021978332974949906122664421376034687815350484991",
         /* DR moduli */
         "14059105607947488696282932836518693308967803494693489478439861164411992439598399594747002144074658928593502845729752797260025831423419686528151609940203368612079",
         "101745825697019260773923519755878567461315282017759829107608914364075275235254395622580447400994175578963163918967182013639660669771108475957692810857098847138903161308502419410142185759152435680068435915159402496058513611411688900243039",
@ -196,6 +250,7 @@ int main(void)
      };
   log = fopen("logs/expt.log", "w");
   logb = fopen("logs/expt_dr.log", "w");
   logc = fopen("logs/expt_2k.log", "w");
   for (n = 0; primes[n]; n++) {
      mp_read_radix(&a, primes[n], 10);
      mp_zero(&b);
@ -224,11 +279,12 @@ int main(void)
         exit(0);
      }
      printf("Exponentiating\t%4d-bit => %9llu/sec, %9llu ticks\n", mp_count_bits(&a), (((unsigned long long)rr)*CLOCKS_PER_SEC)/tt, tt);
-      fprintf((n < 7) ? logb : log, "%d %9llu\n", mp_count_bits(&a), (((unsigned long long)rr)*CLOCKS_PER_SEC)/tt);
+      fprintf((n < 6) ? logc : (n < 13) ? logb : log, "%d %9llu\n", mp_count_bits(&a), (((unsigned long long)rr)*CLOCKS_PER_SEC)/tt);
   }
   }
   fclose(log);
   fclose(logb);
   fclose(logc);
   log = fopen("logs/invmod.log", "w");
   for (cnt = 4; cnt <= 128; cnt += 4) {
@ -263,6 +319,12 @@ int main(void)
   div2_n = mul2_n = inv_n = expt_n = lcm_n = gcd_n = add_n =
   sub_n = mul_n = div_n = sqr_n = mul2d_n = div2d_n = cnt = 0;
   /* force KARA and TOOM to enable despite cutoffs */
   KARATSUBA_SQR_CUTOFF = KARATSUBA_MUL_CUTOFF = 110;
   TOOM_SQR_CUTOFF      = TOOM_MUL_CUTOFF      = 150;
   for (;;) {
       /* randomly clear and re-init one variable, this has the affect of triming the alloc space */
--- a/etc/2kprime.1
+++ b/etc/2kprime.1
@ -0,0 +1,2 @@
 256-bits (k = 36113) = 115792089237316195423570985008687907853269984665640564039457584007913129603823
 512-bits (k = 38117) = 13407807929942597099574024998205846127479365820592393377723561443721764030073546976801874298166903427690031858186486050853753882811946569946433649006045979
--- a/etc/2kprime.c
+++ b/etc/2kprime.c
@ -0,0 +1,80 @@
 /* Makes safe primes of a 2k nature */
 #include <tommath.h>
 #include <time.h>
 int sizes[] = {256, 512, 768, 1024, 1536, 2048, 3072, 4096};
 int main(void)
 {
   char buf[2000];
   int x, y, t;
   mp_int q, p;
   FILE *out;
   clock_t t1;
   mp_digit z;
   mp_init_multi(&q, &p, NULL);
   out = fopen("2kprime.1", "w");
   for (x = 0; x < (int)(sizeof(sizes) / sizeof(sizes[0])); x++) {
   top:
       mp_2expt(&q, sizes[x]);
       mp_add_d(&q, 3, &q);
       z = -3;
       t1 = clock();
       for(;;) {
         mp_sub_d(&q, 4, &q);
         z += 4;
         if (z > MP_MASK) {
            printf("No primes of size %d found\n", sizes[x]);
            break;
         }
         if (clock() - t1 > CLOCKS_PER_SEC) { 
            printf("."); fflush(stdout);
 //            sleep((clock() - t1 + CLOCKS_PER_SEC/2)/CLOCKS_PER_SEC);
            t1 = clock();
         }
         /* quick test on q */
         mp_prime_is_prime(&q, 1, &y);
         if (y == 0) {
            continue;
         }
         /* find (q-1)/2 */
         mp_sub_d(&q, 1, &p);
         mp_div_2(&p, &p);
         mp_prime_is_prime(&p, 3, &y);
         if (y == 0) {
            continue;
         }
         /* test on q */
         mp_prime_is_prime(&q, 3, &y);
         if (y == 0) {
            continue;
         }
         break;
       }
       if (y == 0) {
          ++sizes[x];
          goto top;
       }
       mp_toradix(&q, buf, 10);
       printf("\n\n%d-bits (k = %lu) = %s\n", sizes[x], z, buf);
       fprintf(out, "%d-bits (k = %lu) = %s\n", sizes[x], z, buf); fflush(out);
   }
   return 0;
 }   
--- a/etc/makefile
+++ b/etc/makefile
@ -32,9 +32,13 @@ mersenne: mersenne.o
 drprime: drprime.o
 	$(CC) drprime.o $(LIBNAME) -o drprime
 # fines 2k safe primes for the given config
 2kprime: 2kprime.o
 	$(CC) 2kprime.o $(LIBNAME) -o 2kprime
 mont: mont.o
 	$(CC) mont.o $(LIBNAME) -o mont
 clean:
-	rm -f *.log *.o *.obj *.exe pprime tune mersenne drprime tune86 tune86l mont
+	rm -f *.log *.o *.obj *.exe pprime tune mersenne drprime tune86 tune86l mont 2kprime
--- a/etc/makefile.msvc
+++ b/etc/makefile.msvc
@ -14,4 +14,7 @@ tune: tune.obj
 	cl tune.obj ../tommath.lib
 drprime: drprime.obj
-	cl drprime.obj ../tommath.lib
+	cl drprime.obj ../tommath.lib
 2kprime: 2kprime.obj
 	cl 2kprime.obj ../tommath.lib
--- a/etc/mersenne.c
+++ b/etc/mersenne.c
@ -8,10 +8,9 @@
 int
 is_mersenne (long s, int *pp)
 {
-  mp_int  n, u, mu;
+  mp_int  n, u;
  int     res, k;
-  long    ss;
+  
  *pp = 0;
  if ((res = mp_init (&n)) != MP_OKAY) {
@ -22,27 +21,14 @@ is_mersenne (long s, int *pp)
    goto __N;
  }
  if ((res = mp_init (&mu)) != MP_OKAY) {
    goto __U;
  }
  /* n = 2^s - 1 */
-  mp_set (&n, 1);
+  if ((res = mp_2expt(&n, s)) != MP_OKAY) {
-  ss = s;
+     goto __MU;
  while (ss--) {
    if ((res = mp_mul_2 (&n, &n)) != MP_OKAY) {
      goto __MU;
    }
  }
  if ((res = mp_sub_d (&n, 1, &n)) != MP_OKAY) {
    goto __MU;
  }
  /* setup mu */
  if ((res = mp_reduce_setup (&mu, &n)) != MP_OKAY) {
    goto __MU;
  }
  /* set u=4 */
  mp_set (&u, 4);
@ -57,26 +43,26 @@ is_mersenne (long s, int *pp)
    }
    /* make sure u is positive */
-    if (u.sign == MP_NEG) {
+    while (u.sign == MP_NEG) {
      if ((res = mp_add (&u, &n, &u)) != MP_OKAY) {
-	goto __MU;
+         goto __MU;
      }
    }
    /* reduce */
-    if ((res = mp_reduce (&u, &n, &mu)) != MP_OKAY) {
+    if ((res = mp_reduce_2k (&u, &n, 1)) != MP_OKAY) {
      goto __MU;
    }
  }
  /* if u == 0 then its prime */
  if (mp_iszero (&u) == 1) {
-    *pp = 1;
+    mp_prime_is_prime(&n, 3, pp);
  if (*pp != 1) printf("FAILURE\n");
  }
  res = MP_OKAY;
-__MU:mp_clear (&mu);
+__MU:mp_clear (&u);
 __U:mp_clear (&u);
 __N:mp_clear (&n);
  return res;
 }
--- a/etc/mont.c
+++ b/etc/mont.c
@ -7,10 +7,11 @@ int main(void)
   mp_digit mp;
   long x, y;
   srand(time(NULL));
   mp_init_multi(&modulus, &R, &p, &pp, NULL);
   /* loop through various sizes */
-   for (x = 4; x < 128; x++) {
+   for (x = 4; x < 256; x++) {
       printf("DIGITS == %3ld...", x); fflush(stdout);
       /* make up the odd modulus */
@ -22,7 +23,7 @@ int main(void)
       mp_montgomery_setup(&modulus, &mp);
       /* now run through a bunch tests */
-       for (y = 0; y < 100000; y++) {
+       for (y = 0; y < 1000; y++) {
           mp_rand(&p, x/2);        /* p = random */
           mp_mul(&p, &R, &pp);     /* pp = R * p */
           mp_montgomery_reduce(&pp, &modulus, mp);
--- a/etc/tune.c
+++ b/etc/tune.c
@ -8,17 +8,17 @@
 #ifndef X86_TIMER
 /* generic ISO C timer */
-unsigned long long __T;
+ulong64 __T;
 void t_start(void) { __T = clock(); }
-unsigned long long t_read(void) { return clock() - __T; }
+ulong64 t_read(void) { return clock() - __T; }
 #else
 extern void t_start(void);
-extern unsigned long long t_read(void);
+extern ulong64 t_read(void);
 #endif
-unsigned long long
+ulong64
-time_mult (void)
+time_mult (int max)
 {
  int     x, y;
  mp_int  a, b, c;
@ -28,7 +28,7 @@ time_mult (void)
  mp_init (&c);
  t_start();
-  for (x = 32; x <= 288; x += 4) {
+  for (x = 32; x <= max; x += 4) {
    mp_rand (&a, x);
    mp_rand (&b, x);
    for (y = 0; y < 100; y++) {
@ -41,8 +41,8 @@ time_mult (void)
  return t_read();
 }
-unsigned long long
+ulong64
-time_sqr (void)
+time_sqr (int max)
 {
  int     x, y;
  mp_int  a, b;
@ -51,7 +51,7 @@ time_sqr (void)
  mp_init (&b);
  t_start();
-  for (x = 32; x <= 288; x += 4) {
+  for (x = 32; x <= max; x += 4) {
    mp_rand (&a, x);
    for (y = 0; y < 100; y++) {
      mp_sqr (&a, &b);
@ -65,45 +65,85 @@ time_sqr (void)
 int
 main (void)
 {
-  int     best_mult, best_square;
+  int     best_kmult, best_tmult, best_ksquare, best_tsquare;
-  unsigned long long best, ti;
+  ulong64 best, ti;
  FILE   *log;
-  best_mult = best_square = 0;
+  best_kmult = best_ksquare = best_tmult = best_tsquare = 0;
  /* tune multiplication first */
  /* effectively turn TOOM off */
  TOOM_SQR_CUTOFF = TOOM_MUL_CUTOFF = 100000;
  log = fopen ("mult.log", "w");
  best = -1;
  for (KARATSUBA_MUL_CUTOFF = 8; KARATSUBA_MUL_CUTOFF <= 200; KARATSUBA_MUL_CUTOFF++) {
-    ti = time_mult ();
+    ti = time_mult (300);
    printf ("%4d : %9llu\r", KARATSUBA_MUL_CUTOFF, ti);
    fprintf (log, "%d, %llu\n", KARATSUBA_MUL_CUTOFF, ti);
    fflush (stdout);
    if (ti < best) {
      printf ("New best: %llu, %d         \n", ti, KARATSUBA_MUL_CUTOFF);
      best = ti;
-      best_mult = KARATSUBA_MUL_CUTOFF;
+      best_kmult = KARATSUBA_MUL_CUTOFF;
    }
  }
  fclose (log);
  /* tune squaring */
  log = fopen ("sqr.log", "w");
  best = -1;
  for (KARATSUBA_SQR_CUTOFF = 8; KARATSUBA_SQR_CUTOFF <= 200; KARATSUBA_SQR_CUTOFF++) {
-    ti = time_sqr ();
+    ti = time_sqr (300);
    printf ("%4d : %9llu\r", KARATSUBA_SQR_CUTOFF, ti);
    fprintf (log, "%d, %llu\n", KARATSUBA_SQR_CUTOFF, ti);
    fflush (stdout);
    if (ti < best) {
      printf ("New best: %llu, %d         \n", ti, KARATSUBA_SQR_CUTOFF);
      best = ti;
-      best_square = KARATSUBA_SQR_CUTOFF;
+      best_ksquare = KARATSUBA_SQR_CUTOFF;
    }
  }
  fclose (log);
  KARATSUBA_MUL_CUTOFF = best_kmult;
  KARATSUBA_SQR_CUTOFF = best_ksquare;
  /* tune TOOM mult */
  log = fopen ("tmult.log", "w");
  best = -1;
  for (TOOM_MUL_CUTOFF = best_kmult*5; TOOM_MUL_CUTOFF <= 800; TOOM_MUL_CUTOFF++) {
    ti = time_mult (1200);
    printf ("%4d : %9llu\r", TOOM_MUL_CUTOFF, ti);
    fprintf (log, "%d, %llu\n", TOOM_MUL_CUTOFF, ti);
    fflush (stdout);
    if (ti < best) {
      printf ("New best: %llu, %d         \n", ti, TOOM_MUL_CUTOFF);
      best = ti;
      best_tmult = TOOM_MUL_CUTOFF;
    }
  }
  fclose (log);   
  /* tune TOOM sqr */
  log = fopen ("tsqr.log", "w");
  best = -1;
  for (TOOM_SQR_CUTOFF = best_ksquare*3; TOOM_SQR_CUTOFF <= 800; TOOM_SQR_CUTOFF++) {
    ti = time_sqr (1200);
    printf ("%4d : %9llu\r", TOOM_SQR_CUTOFF, ti);
    fprintf (log, "%d, %llu\n", TOOM_SQR_CUTOFF, ti);
    fflush (stdout);
    if (ti < best) {
      printf ("New best: %llu, %d         \n", ti, TOOM_SQR_CUTOFF);
      best = ti;
      best_tsquare = TOOM_SQR_CUTOFF;
    }
  }
  fclose (log);   
  printf
-    ("\n\n\nKaratsuba Multiplier Cutoff: %d\nKaratsuba Squaring Cutoff: %d\n",
+    ("\n\n\nKaratsuba Multiplier Cutoff: %d\nKaratsuba Squaring Cutoff: %d\nToom Multiplier Cutoff: %d\nToom Squaring Cutoff: %d\n",
-     best_mult, best_square);
+     best_kmult, best_ksquare, best_tmult, best_tsquare);
  return 0;
 }
--- a/gen.pl
+++ b/gen.pl
@ -6,7 +6,7 @@
 use strict;
 open( OUT, ">mpi.c" ) or die "Couldn't open mpi.c for writing: $!";
-foreach my $filename (glob "bn_*.c") {
+foreach my $filename (glob "bn*.c") {
   open( SRC, "<$filename" ) or die "Couldn't open $filename for reading: $!";
   print OUT "/* Start: $filename */\n";
   print OUT qq[#line 0 "$filename"\n];
@ -14,5 +14,5 @@ foreach my $filename (glob "bn_*.c") {
   print OUT "\n/* End: $filename */\n\n";
   close SRC or die "Error closing $filename after reading: $!";
 }
-print OUT "\b/* EOF */\n";
+print OUT "\n/* EOF */\n";
 close OUT or die "Error closing mpi.c after writing: $!";
--- a/logs/add.log
+++ b/logs/add.log
@ -1,16 +1,16 @@
-224  11039864
+224  11069160
-448   9206336
+448   9156136
-672   8178200
+672   8089755
-896   7432176
+896   7399424
-1120   6433264
+1120   6389352
-1344   5847056
+1344   5818648
-1568   5270184
+1568   5257112
-1792   4943416
+1792   4982160
-2016   4520016
+2016   4527856
-2240   4256168
+2240   4325312
-2464   3999224
+2464   4051760
-2688   3714896
+2688   3767640
-2912   3572720
+2912   3612520
-3136   3340176
+3136   3415208
-3360   3222584
+3360   3258656
-3584   3036336
+3584   3113360
--- a/logs/addsub.png
+++ b/logs/addsub.png
--- a/logs/expt.log
+++ b/logs/expt.log
@ -1,7 +1,7 @@
-14364       666
+513       680
-21532       253
+769       257
-28700       117
+1025       117
-57372        17
+2049        17
-71708         9
+2561         9
-86044         5
+3073         5
-114716         2
+4097         2
--- a/logs/expt.png
+++ b/logs/expt.png
--- a/logs/expt_2k.log
+++ b/logs/expt_2k.log
@ -0,0 +1,6 @@
 521       736
 607       552
 1279       112
 2203        33
 3217        13
 4253         6
--- a/logs/expt_dr.log
+++ b/logs/expt_dr.log
@ -1,7 +1,7 @@
-14896      1088
+532      1064
-21952       468
+784       460
-29008       244
+1036       240
-43120        91
+1540        91
-58016        43
+2072        43
-86240        15
+3080        15
-115248         6
+4116         6
--- a/logs/graphs.dem
+++ b/logs/graphs.dem
@ -1,5 +1,5 @@
 set terminal png color
-set size 1.5
+set size 1.75
 set ylabel "Operations per Second"
 set xlabel "Operand size (bits)"
@ -10,7 +10,7 @@ set output "mult.png"
 plot 'sqr.log' smooth bezier title "Squaring (without Karatsuba)", 'sqr_kara.log' smooth bezier title "Squaring (Karatsuba)", 'mult.log' smooth bezier title "Multiplication (without Karatsuba)", 'mult_kara.log' smooth bezier title "Multiplication (Karatsuba)"
 set output "expt.png"
-plot 'expt.log' smooth bezier title "Exptmod (Montgomery)", 'expt_dr.log' smooth bezier title "Exptmod (Dimminished Radix)"
+plot 'expt.log' smooth bezier title "Exptmod (Montgomery)", 'expt_dr.log' smooth bezier title "Exptmod (Dimminished Radix)", 'expt_2k.log' smooth bezier title "Exptmod (2k Reduction)"
 set output "invmod.png"
 plot 'invmod.log' smooth bezier title "Modular Inverse"
--- a/logs/invmod.log
+++ b/logs/invmod.log
@ -1,32 +1,32 @@
-112     15608
+112     16248
-224      7840
+224      8192
-336      5104
+336      5320
-448      3376
+448      3560
-560      2616
+560      2728
-672      1984
+672      2064
-784      1640
+784      1704
-896      2056
+896      2176
-1008      1136
+1008      1184
-1120       936
+1120       976
-1232      1240
+1232      1280
-1344      1112
+1344      1176
-1456       608
+1456       624
-1568       873
+1568       912
-1680       492
+1680       504
-1792       444
+1792       452
-1904       640
+1904       658
-2016       584
+2016       608
-2128       328
+2128       336
-2240       307
+2240       312
-2352       283
+2352       288
-2464       256
+2464       264
-2576       393
+2576       408
-2688       365
+2688       376
-2800       344
+2800       354
-2912       196
+2912       198
-3024       301
+3024       307
-3136       170
+3136       173
-3248       160
+3248       162
-3360       250
+3360       256
-3472       144
+3472       145
-3584       224
+3584       226
--- a/logs/invmod.png
+++ b/logs/invmod.png
--- a/logs/k7/README
+++ b/logs/k7/README
@ -0,0 +1,13 @@
 To use the pretty graphs you have to first build/run the ltmtest from the root directory of the package.  
 Todo this type 
 make timing ; ltmtest
 in the root.  It will run for a while [about ten minutes on most PCs] and produce a series of .log files in logs/.
 After doing that run "gnuplot graphs.dem" to make the PNGs.  If you managed todo that all so far just open index.html to view
 them all :-)
 Have fun
 Tom
--- a/logs/k7/add.log
+++ b/logs/k7/add.log
@ -0,0 +1,16 @@
 224  11069160
 448   9156136
 672   8089755
 896   7399424
 1120   6389352
 1344   5818648
 1568   5257112
 1792   4982160
 2016   4527856
 2240   4325312
 2464   4051760
 2688   3767640
 2912   3612520
 3136   3415208
 3360   3258656
 3584   3113360
--- a/logs/k7/addsub.png
+++ b/logs/k7/addsub.png
--- a/logs/k7/expt.log
+++ b/logs/k7/expt.log
@ -0,0 +1,7 @@
 513       664
 769       256
 1025       117
 2049        17
 2561         9
 3073         5
 4097         2
--- a/logs/k7/expt.png
+++ b/logs/k7/expt.png
--- a/logs/k7/expt_dr.log
+++ b/logs/k7/expt_dr.log
@ -0,0 +1,7 @@
 532      1088
 784       460
 1036       240
 1540        92
 2072        43
 3080        15
 4116         6
--- a/logs/k7/graphs.dem
+++ b/logs/k7/graphs.dem
@ -0,0 +1,17 @@
 set terminal png color
 set size 1.75
 set ylabel "Operations per Second"
 set xlabel "Operand size (bits)"
 set output "addsub.png"
 plot 'add.log' smooth bezier title "Addition", 'sub.log' smooth bezier title "Subtraction"
 set output "mult.png"
 plot 'sqr.log' smooth bezier title "Squaring (without Karatsuba)", 'sqr_kara.log' smooth bezier title "Squaring (Karatsuba)", 'mult.log' smooth bezier title "Multiplication (without Karatsuba)", 'mult_kara.log' smooth bezier title "Multiplication (Karatsuba)"
 set output "expt.png"
 plot 'expt.log' smooth bezier title "Exptmod (Montgomery)", 'expt_dr.log' smooth bezier title "Exptmod (Dimminished Radix)"
 set output "invmod.png"
 plot 'invmod.log' smooth bezier title "Modular Inverse"
--- a/logs/k7/index.html
+++ b/logs/k7/index.html
@ -0,0 +1,24 @@
 <html>
 <head>
 <title>LibTomMath Log Plots</title>
 </head>
 <body>
 <h1>Addition and Subtraction</h1>
 <center><img src=addsub.png></center>
 <hr>
 <h1>Multipliers</h1>
 <center><img src=mult.png></center>
 <hr>
 <h1>Exptmod</h1>
 <center><img src=expt.png></center>
 <hr>
 <h1>Modular Inverse</h1>
 <center><img src=invmod.png></center>
 <hr>
 </body>
 </html>
--- a/logs/k7/invmod.log
+++ b/logs/k7/invmod.log
@ -0,0 +1,32 @@
 112     16248
 224      8192
 336      5320
 448      3560
 560      2728
 672      2064
 784      1704
 896      2176
 1008      1184
 1120       976
 1232      1280
 1344      1176
 1456       624
 1568       912
 1680       504
 1792       452
 1904       658
 2016       608
 2128       336
 2240       312
 2352       288
 2464       264
 2576       408
 2688       376
 2800       354
 2912       198
 3024       307
 3136       173
 3248       162
 3360       256
 3472       145
 3584       226
--- a/logs/k7/invmod.png
+++ b/logs/k7/invmod.png
--- a/logs/k7/mult.log
+++ b/logs/k7/mult.log
@ -0,0 +1,17 @@
 896    322904
 1344    151592
 1792     90472
 2240     59984
 2688     42624
 3136     31872
 3584     24704
 4032     19704
 4480     16096
 4928     13376
 5376     11272
 5824      9616
 6272      8360
 6720      7304
 7168      1664
 7616      1472
 8064      1328
--- a/logs/k7/mult.png
+++ b/logs/k7/mult.png
--- a/logs/k7/mult_kara.log
+++ b/logs/k7/mult_kara.log
@ -0,0 +1,17 @@
 896    322872
 1344    151688
 1792     90480
 2240     59984
 2688     42656
 3136     32144
 3584     25840
 4032     21328
 4480     17856
 4928     14928
 5376     12856
 5824     11256
 6272      9880
 6720      8984
 7168      7928
 7616      7200
 8064      6576
--- a/logs/k7/sqr.log
+++ b/logs/k7/sqr.log
@ -0,0 +1,17 @@
 896    415472
 1344    223736
 1792    141232
 2240     97624
 2688     71400
 3136     54800
 3584     16904
 4032     13528
 4480     10968
 4928      9128
 5376      7784
 5824      6672
 6272      5760
 6720      5056
 7168      4440
 7616      3952
 8064      3512
--- a/logs/k7/sqr_kara.log
+++ b/logs/k7/sqr_kara.log
@ -0,0 +1,17 @@
 896    420464
 1344    224800
 1792    142808
 2240     97704
 2688     71416
 3136     54504
 3584     38320
 4032     32360
 4480     27576
 4928     23840
 5376     20688
 5824     18264
 6272     16176
 6720     14440
 7168     11688
 7616     10752
 8064      9936
--- a/logs/k7/sub.log
+++ b/logs/k7/sub.log
@ -0,0 +1,16 @@
 224   9728504
 448   8573648
 672   7488096
 896   6714064
 1120   5950472
 1344   5457400
 1568   5038896
 1792   4683632
 2016   4384656
 2240   4105976
 2464   3871608
 2688   3650680
 2912   3463552
 3136   3290016
 3360   3135272
 3584   2993848
--- a/logs/mult.log
+++ b/logs/mult.log
@ -1,17 +1,17 @@
-896    321504
+896    322904
-1344    150784
+1344    151592
-1792     90288
+1792     90472
-2240     59760
+2240     59984
-2688     42480
+2688     42624
-3136     32056
+3136     31872
-3584     24600
+3584     24704
-4032     19656
+4032     19704
-4480     16024
+4480     16096
-4928     13328
+4928     13376
-5376     11280
+5376     11272
-5824      9624
+5824      9616
-6272      8336
+6272      8360
-6720      7280
+6720      7304
-7168      1648
+7168      1664
-7616      1464
+7616      1472
-8064      1296
+8064      1328
--- a/logs/mult.png
+++ b/logs/mult.png
--- a/logs/mult_kara.log
+++ b/logs/mult_kara.log
@ -1,17 +1,17 @@
-896    321928
+896    322872
-1344    150752
+1344    151688
-1792     90136
+1792     90480
-2240     59888
+2240     59984
-2688     42480
+2688     42656
-3136     32080
+3136     32144
-3584     25744
+3584     25840
-4032     21216
+4032     21328
-4480     17912
+4480     17856
-4928     14896
+4928     14928
-5376     12936
+5376     12856
-5824     11216
+5824     11256
-6272      9848
+6272      9880
-6720      8896
+6720      8984
-7168      7968
+7168      7928
-7616      7248
+7616      7200
-8064      6600
+8064      6576
--- a/logs/p4/README
+++ b/logs/p4/README
@ -0,0 +1,13 @@
 To use the pretty graphs you have to first build/run the ltmtest from the root directory of the package.  
 Todo this type 
 make timing ; ltmtest
 in the root.  It will run for a while [about ten minutes on most PCs] and produce a series of .log files in logs/.
 After doing that run "gnuplot graphs.dem" to make the PNGs.  If you managed todo that all so far just open index.html to view
 them all :-)
 Have fun
 Tom
--- a/logs/p4/add.log
+++ b/logs/p4/add.log
@ -0,0 +1,16 @@
 224   8113248
 448   6585584
 672   5687678
 896   4761144
 1120   4111592
 1344   3995154
 1568   3532387
 1792   3225400
 2016   2963960
 2240   2720112
 2464   2533952
 2688   2307168
 2912   2287064
 3136   2150160
 3360   2035992
 3584   1936304
--- a/logs/p4/addsub.png
+++ b/logs/p4/addsub.png
--- a/logs/p4/expt.log
+++ b/logs/p4/expt.log
@ -0,0 +1,7 @@
 513       195
 769        68
 1025        31
 2049         4
 2561         2
 3073         1
 4097         0
--- a/logs/p4/expt.png
+++ b/logs/p4/expt.png
--- a/logs/p4/expt_dr.log
+++ b/logs/p4/expt_dr.log
@ -0,0 +1,7 @@
 532       393
 784       158
 1036        79
 1540        27
 2072        12
 3080         4
 4116         1
--- a/logs/p4/graphs.dem
+++ b/logs/p4/graphs.dem
@ -0,0 +1,17 @@
 set terminal png color
 set size 1.75
 set ylabel "Operations per Second"
 set xlabel "Operand size (bits)"
 set output "addsub.png"
 plot 'add.log' smooth bezier title "Addition", 'sub.log' smooth bezier title "Subtraction"
 set output "mult.png"
 plot 'sqr.log' smooth bezier title "Squaring (without Karatsuba)", 'sqr_kara.log' smooth bezier title "Squaring (Karatsuba)", 'mult.log' smooth bezier title "Multiplication (without Karatsuba)", 'mult_kara.log' smooth bezier title "Multiplication (Karatsuba)"
 set output "expt.png"
 plot 'expt.log' smooth bezier title "Exptmod (Montgomery)", 'expt_dr.log' smooth bezier title "Exptmod (Dimminished Radix)"
 set output "invmod.png"
 plot 'invmod.log' smooth bezier title "Modular Inverse"
--- a/logs/p4/index.html
+++ b/logs/p4/index.html
@ -0,0 +1,24 @@
 <html>
 <head>
 <title>LibTomMath Log Plots</title>
 </head>
 <body>
 <h1>Addition and Subtraction</h1>
 <center><img src=addsub.png></center>
 <hr>
 <h1>Multipliers</h1>
 <center><img src=mult.png></center>
 <hr>
 <h1>Exptmod</h1>
 <center><img src=expt.png></center>
 <hr>
 <h1>Modular Inverse</h1>
 <center><img src=invmod.png></center>
 <hr>
 </body>
 </html>
--- a/logs/p4/invmod.log
+++ b/logs/p4/invmod.log
@ -0,0 +1,32 @@
 112     13608
 224      6872
 336      4264
 448      2792
 560      2144
 672      1560
 784      1296
 896      1672
 1008       896
 1120       736
 1232      1024
 1344       888
 1456       472
 1568       680
 1680       373
 1792       328
 1904       484
 2016       436
 2128       232
 2240       211
 2352       200
 2464       177
 2576       293
 2688       262
 2800       251
 2912       137
 3024       216
 3136       117
 3248       113
 3360       181
 3472        98
 3584       158
--- a/logs/p4/invmod.png
+++ b/logs/p4/invmod.png
--- a/logs/p4/mult.log
+++ b/logs/p4/mult.log
@ -0,0 +1,17 @@
 896     77600
 1344     35776
 1792     19688
 2240     13248
 2688      9424
 3136      7056
 3584      5464
 4032      4368
 4480      3568
 4928      2976
 5376      2520
 5824      2152
 6272      1872
 6720      1632
 7168       650
 7616       576
 8064       515
--- a/logs/p4/mult.png
+++ b/logs/p4/mult.png
--- a/logs/p4/mult_kara.log
+++ b/logs/p4/mult_kara.log
@ -0,0 +1,17 @@
 896     77752
 1344     35832
 1792     19688
 2240     14704
 2688     10832
 3136      8336
 3584      6600
 4032      5424
 4480      4648
 4928      3976
 5376      3448
 5824      3016
 6272      2664
 6720      2384
 7168      2120
 7616      1912
 8064      1752
--- a/logs/p4/sqr.log
+++ b/logs/p4/sqr.log
@ -0,0 +1,17 @@
 896    128088
 1344     63640
 1792     37968
 2240     25488
 2688     18176
 3136     13672
 3584      4920
 4032      3912
 4480      3160
 4928      2616
 5376      2216
 5824      1896
 6272      1624
 6720      1408
 7168      1240
 7616      1096
 8064       984
--- a/logs/p4/sqr_kara.log
+++ b/logs/p4/sqr_kara.log
@ -0,0 +1,17 @@
 896    127456
 1344     63752
 1792     37920
 2240     25440
 2688     18200
 3136     13728
 3584     10968
 4032      9072
 4480      7608
 4928      6440
 5376      5528
 5824      4768
 6272      4328
 6720      3888
 7168      3504
 7616      3176
 8064      2896
--- a/logs/p4/sub.log
+++ b/logs/p4/sub.log
@ -0,0 +1,16 @@
 224   7355896
 448   6162880
 672   5218984
 896   4622776
 1120   3999320
 1344   3629480
 1568   3290384
 1792   2954752
 2016   2737056
 2240   2563320
 2464   2451928
 2688   2310920
 2912   2139048
 3136   2034080
 3360   1890800
 3584   1808624
--- a/logs/sqr.log
+++ b/logs/sqr.log
@ -1,17 +1,17 @@
-896    416968
+896    415472
-1344    223672
+1344    223736
-1792    141552
+1792    141232
-2240     97280
+2240     97624
-2688     71304
+2688     71400
-3136     54648
+3136     54800
-3584     16264
+3584     16904
-4032     13000
+4032     13528
-4480     10528
+4480     10968
-4928      8776
+4928      9128
-5376      7464
+5376      7784
-5824      6440
+5824      6672
-6272      5520
+6272      5760
-6720      4808
+6720      5056
-7168      4264
+7168      4440
-7616      3784
+7616      3952
-8064      3368
+8064      3512
--- a/logs/sqr_kara.log
+++ b/logs/sqr_kara.log
@ -1,17 +1,17 @@
-896    416656
+896    420464
-1344    223728
+1344    224800
-1792    141288
+1792    142808
-2240     97456
+2240     97704
-2688     71152
+2688     71416
-3136     54392
+3136     54504
-3584     38552
+3584     38320
-4032     32216
+4032     32360
-4480     27384
+4480     27576
-4928     23792
+4928     23840
-5376     20728
+5376     20688
-5824     18232
+5824     18264
-6272     16160
+6272     16176
-6720     14408
+6720     14440
-7168     11696
+7168     11688
-7616     10768
+7616     10752
-8064      9920
+8064      9936
--- a/logs/sub.log
+++ b/logs/sub.log
@ -1,16 +1,16 @@
-224   9862520
+224   9728504
-448   8562344
+448   8573648
-672   7661400
+672   7488096
-896   6838128
+896   6714064
-1120   5911144
+1120   5950472
-1344   5394040
+1344   5457400
-1568   4993760
+1568   5038896
-1792   4624240
+1792   4683632
-2016   4332024
+2016   4384656
-2240   4029312
+2240   4105976
-2464   3790784
+2464   3871608
-2688   3587216
+2688   3650680
-2912   3397952
+2912   3463552
-3136   3239736
+3136   3290016
-3360   3080616
+3360   3135272
-3584   2933104
+3584   2993848
--- a/16
+++ b/16
@ -1,6 +1,6 @@
 CFLAGS  +=  -I./ -Wall -W -Wshadow -O3 -fomit-frame-pointer -funroll-loops
-VERSION=0.17
+VERSION=0.18
 default: libtommath.a
@ -33,7 +33,9 @@ bn_mp_to_signed_bin.o bn_mp_unsigned_bin_size.o bn_mp_signed_bin_size.o bn_radix
 bn_mp_xor.o bn_mp_and.o bn_mp_or.o bn_mp_rand.o bn_mp_montgomery_calc_normalization.o \
 bn_mp_prime_is_divisible.o bn_prime_tab.o bn_mp_prime_fermat.o bn_mp_prime_miller_rabin.o \
 bn_mp_prime_is_prime.o bn_mp_prime_next_prime.o bn_mp_dr_reduce.o bn_mp_multi.o \
-bn_mp_dr_is_modulus.o bn_mp_dr_setup.o
+bn_mp_dr_is_modulus.o bn_mp_dr_setup.o bn_mp_reduce_setup.o \
 bn_mp_toom_mul.o bn_mp_toom_sqr.o bn_mp_div_3.o bn_s_mp_exptmod.o \
 bn_mp_reduce_2k.o bn_mp_reduce_is_2k.o bn_mp_reduce_2k_setup.o
 libtommath.a:  $(OBJECTS)
 	$(AR) $(ARFLAGS) libtommath.a $(OBJECTS)
@ -63,6 +65,11 @@ docdvi: tommath.src
 	makeindex tommath
 	latex tommath > /dev/null
 # poster, makes the single page PDF poster
 poster: poster.tex
 	pdflatex poster
 	rm -f poster.aux poster.log 
 # makes the LTM book PS/PDF file, requires tetex, cleans up the LaTeX temp files
 docs:	
 	cd pics ; make pdfes
@ -88,11 +95,12 @@ manual:
 clean:
 	rm -f *.pdf *.o *.a *.obj *.lib *.exe etclib/*.o demo/demo.o test ltmtest mpitest mtest/mtest mtest/mtest.exe \
-        tommath.idx tommath.toc tommath.log tommath.aux tommath.dvi tommath.lof tommath.ind tommath.ilg *.ps *.pdf *.log *.s mpi.c 
+        tommath.idx tommath.toc tommath.log tommath.aux tommath.dvi tommath.lof tommath.ind tommath.ilg *.ps *.pdf *.log *.s mpi.c \
        poster.aux poster.dvi poster.log
 	cd etc ; make clean
 	cd pics ; make clean
-zipup: clean manual
+zipup: clean manual poster
 	perl gen.pl ; mv mpi.c pre_gen/ ; \
 	cd .. ; rm -rf ltm* libtommath-$(VERSION) ; mkdir libtommath-$(VERSION) ; \
 	cp -R ./libtommath/* ./libtommath-$(VERSION)/ ; tar -c libtommath-$(VERSION)/* > ltm-$(VERSION).tar ; \
--- a/makefile.bcc
+++ b/makefile.bcc
@ -0,0 +1,37 @@
 #
 # Borland C++Builder Makefile (makefile.bcc)
 #
 LIB = tlib
 CC = bcc32
 CFLAGS = -c -O2 -I.
 OBJECTS=bncore.obj bn_mp_init.obj bn_mp_clear.obj bn_mp_exch.obj bn_mp_grow.obj bn_mp_shrink.obj \
 bn_mp_clamp.obj bn_mp_zero.obj  bn_mp_set.obj bn_mp_set_int.obj bn_mp_init_size.obj bn_mp_copy.obj \
 bn_mp_init_copy.obj bn_mp_abs.obj bn_mp_neg.obj bn_mp_cmp_mag.obj bn_mp_cmp.obj bn_mp_cmp_d.obj \
 bn_mp_rshd.obj bn_mp_lshd.obj bn_mp_mod_2d.obj bn_mp_div_2d.obj bn_mp_mul_2d.obj bn_mp_div_2.obj \
 bn_mp_mul_2.obj bn_s_mp_add.obj bn_s_mp_sub.obj bn_fast_s_mp_mul_digs.obj bn_s_mp_mul_digs.obj \
 bn_fast_s_mp_mul_high_digs.obj bn_s_mp_mul_high_digs.obj bn_fast_s_mp_sqr.obj bn_s_mp_sqr.obj \
 bn_mp_add.obj bn_mp_sub.obj bn_mp_karatsuba_mul.obj bn_mp_mul.obj bn_mp_karatsuba_sqr.obj \
 bn_mp_sqr.obj bn_mp_div.obj bn_mp_mod.obj bn_mp_add_d.obj bn_mp_sub_d.obj bn_mp_mul_d.obj \
 bn_mp_div_d.obj bn_mp_mod_d.obj bn_mp_expt_d.obj bn_mp_addmod.obj bn_mp_submod.obj \
 bn_mp_mulmod.obj bn_mp_sqrmod.obj bn_mp_gcd.obj bn_mp_lcm.obj bn_fast_mp_invmod.obj bn_mp_invmod.obj \
 bn_mp_reduce.obj bn_mp_montgomery_setup.obj bn_fast_mp_montgomery_reduce.obj bn_mp_montgomery_reduce.obj \
 bn_mp_exptmod_fast.obj bn_mp_exptmod.obj bn_mp_2expt.obj bn_mp_n_root.obj bn_mp_jacobi.obj bn_reverse.obj \
 bn_mp_count_bits.obj bn_mp_read_unsigned_bin.obj bn_mp_read_signed_bin.obj bn_mp_to_unsigned_bin.obj \
 bn_mp_to_signed_bin.obj bn_mp_unsigned_bin_size.obj bn_mp_signed_bin_size.obj bn_radix.obj \
 bn_mp_xor.obj bn_mp_and.obj bn_mp_or.obj bn_mp_rand.obj bn_mp_montgomery_calc_normalization.obj \
 bn_mp_prime_is_divisible.obj bn_prime_tab.obj bn_mp_prime_fermat.obj bn_mp_prime_miller_rabin.obj \
 bn_mp_prime_is_prime.obj bn_mp_prime_next_prime.obj bn_mp_dr_reduce.obj bn_mp_multi.obj \
 bn_mp_dr_is_modulus.obj bn_mp_dr_setup.obj bn_mp_reduce_setup.obj \
 bn_mp_toom_mul.obj bn_mp_toom_sqr.obj bn_mp_div_3.obj bn_s_mp_exptmod.obj \
 bn_mp_reduce_2k.obj bn_mp_reduce_is_2k.obj bn_mp_reduce_2k_setup.obj
 TARGET = libtommath.lib
 $(TARGET): $(OBJECTS)
 .c.objbj:
 	$(CC) $(CFLAGS) $<
 	$(LIB) $(TARGET) -+$@
--- a/makefile.msvc
+++ b/makefile.msvc
@ -23,7 +23,10 @@ bn_mp_to_signed_bin.obj bn_mp_unsigned_bin_size.obj bn_mp_signed_bin_size.obj bn
 bn_mp_xor.obj bn_mp_and.obj bn_mp_or.obj bn_mp_rand.obj bn_mp_montgomery_calc_normalization.obj \
 bn_mp_prime_is_divisible.obj bn_prime_tab.obj bn_mp_prime_fermat.obj bn_mp_prime_miller_rabin.obj \
 bn_mp_prime_is_prime.obj bn_mp_prime_next_prime.obj bn_mp_dr_reduce.obj bn_mp_multi.obj \
-bn_mp_dr_is_modulus.obj bn_mp_dr_setup.obj
+bn_mp_dr_is_modulus.obj bn_mp_dr_setup.obj bn_mp_reduce_setup.obj \
 bn_mp_toom_mul.obj bn_mp_toom_sqr.obj bn_mp_div_3.obj bn_s_mp_exptmod.obj \
 bn_mp_reduce_2k.obj bn_mp_reduce_is_2k.obj bn_mp_reduce_2k_setup.obj
 library: $(OBJECTS)
--- a/mtest/mtest.c
+++ b/mtest/mtest.c
@ -40,14 +40,10 @@ void rand_num(mp_int *a)
   int n, size;
   unsigned char buf[2048];
-top:
+   size = 1 + ((fgetc(rng)<<8) + fgetc(rng)) % 1031;
   size = 1 + ((fgetc(rng)*fgetc(rng)) % 1024);
   buf[0] = (fgetc(rng)&1)?1:0;
   fread(buf+1, 1, size, rng);
-   for (n = 0; n < size; n++) {
+   while (buf[1] == 0) buf[1] = fgetc(rng);
       if (buf[n+1]) break;
   }
   if (n == size) goto top;
   mp_read_raw(a, buf, 1+size);
 }
@ -56,14 +52,10 @@ void rand_num2(mp_int *a)
   int n, size;
   unsigned char buf[2048];
-top:
+   size = 1 + ((fgetc(rng)<<8) + fgetc(rng)) % 97;
   size = 1 + ((fgetc(rng)*fgetc(rng)) % 96);
   buf[0] = (fgetc(rng)&1)?1:0;
   fread(buf+1, 1, size, rng);
-   for (n = 0; n < size; n++) {
+   while (buf[1] == 0) buf[1] = fgetc(rng);
       if (buf[n+1]) break;
   }
   if (n == size) goto top;
   mp_read_raw(a, buf, 1+size);
 }
@ -73,6 +65,7 @@ int main(void)
 {
   int n;
   mp_int a, b, c, d, e;
   clock_t t1;
   char buf[4096];
   mp_init(&a);
@ -108,8 +101,14 @@ int main(void)
      }
   }
   t1 = clock();
   for (;;) {
-       n =  fgetc(rng) % 13;
+      if (clock() - t1 > CLOCKS_PER_SEC) {
         sleep(1);
         t1 = clock();
      }
       n = fgetc(rng) % 13;
   if (n == 0) {
       /* add tests */
@ -227,6 +226,7 @@ int main(void)
      rand_num2(&a);
      rand_num2(&b);
      rand_num2(&c);
 //      if (c.dp[0]&1) mp_add_d(&c, 1, &c);
      a.sign = b.sign = c.sign = 0;
      mp_exptmod(&a, &b, &c, &d);
      printf("expt\n");
--- a/pics/expt_state.sxd
+++ b/pics/expt_state.sxd
--- a/Show More
+++ b/Show More
		`@ -0,0 +1,2 @@`
							`256-bits (k = 36113) = 115792089237316195423570985008687907853269984665640564039457584007913129603823`
							`512-bits (k = 38117) = 13407807929942597099574024998205846127479365820592393377723561443721764030073546976801874298166903427690031858186486050853753882811946569946433649006045979`
-666
+680
-253
+257
 117
 17
 9
 5
 2
+736
+552
+112
+33
+13
+6
-1088
+1064
-468
+460
-244
+240
 91
 43
 15
 6
-15608
+16248
-7840
+8192
-5104
+5320
-3376
+3560
-2616
+2728
-1984
+2064
-1640
+1704
-2056
+2176
-1136
+1184
-936
+976
-1240
+1280
-1112
+1176
-608
+624
-873
+912
-492
+504
-444
+452
-640
+658
-584
+608
-328
+336
-307
+312
-283
+288
-256
+264
-393
+408
-365
+376
-344
+354
-196
+198
-301
+307
-170
+173
-160
+162
-250
+256
-144
+145
-224
+226
+664
+256
+117
+17
+9
+5
+2
+1088
+460
+240
+92
+43
+15
+6
+16248
+8192
+5320
+3560
+2728
+2064
+1704
+2176
+1184
+976
+1280
+1176
+624
+912
+504
+452
+658
+608
+336
+312
+288
+264
+408
+376
+354
+198
+307
+173
+162
+256
+145
+226
+322904
+151592
+90472
+59984
+42624
+31872
+24704
+19704
+16096
+13376
+11272
+9616
+8360
+7304
+1664
+1472
+1328
+322872
+151688
+90480
+59984
+42656
+32144
+25840
+21328
+17856
+14928
+12856
+11256
+9880
+8984
+7928
+7200
+6576