added libtommath-0.18

2003-05-29 13:35:26 +00:00 · 2003-05-29 13:35:26 +00:00 · 0ef44cea9b
commit 0ef44cea9b
parent fd181cc841
108 changed files with 9777 additions and 2427 deletions
--- a/bn.pdf
+++ b/bn.pdf
--- a/bn.tex
+++ b/bn.tex
@ -1,7 +1,7 @@
 \documentclass[]{article}
 \begin{document}

-\title{LibTomMath v0.17 \\ A Free Multiple Precision Integer Library \\ http://math.libtomcrypt.org }
+\title{LibTomMath v0.18 \\ A Free Multiple Precision Integer Library \\ http://math.libtomcrypt.org }
 \author{Tom St Denis \\ tomstdenis@iahu.ca}
 \maketitle
 \newpage
--- a/bn_fast_mp_montgomery_reduce.c
+++ b/bn_fast_mp_montgomery_reduce.c
@ -14,7 +14,7 @@
 */
 #include <tommath.h>

-/* computes xR^-1 == x (mod N) via Montgomery Reduction 
+/* computes xR**-1 == x (mod N) via Montgomery Reduction 
 * 
 * This is an optimized implementation of mp_montgomery_reduce 
 * which uses the comba method to quickly calculate the columns of the
@ -23,76 +23,77 @@
 * Based on Algorithm 14.32 on pp.601 of HAC.
 */
 int
-fast_mp_montgomery_reduce (mp_int * a, mp_int * m, mp_digit mp)
+fast_mp_montgomery_reduce (mp_int * x, mp_int * n, mp_digit rho)
 {
  int     ix, res, olduse;
  mp_word W[MP_WARRAY];

  /* get old used count */
-  olduse = a->used;
+  olduse = x->used;

  /* grow a as required */
-  if (a->alloc < m->used + 1) {
-    if ((res = mp_grow (a, m->used + 1)) != MP_OKAY) {
+  if (x->alloc < n->used + 1) {
+    if ((res = mp_grow (x, n->used + 1)) != MP_OKAY) {
      return res;
    }
  }

  {
    register mp_word *_W;
-    register mp_digit *tmpa;
+    register mp_digit *tmpx;

    _W = W;
-    tmpa = a->dp;
+    tmpx = x->dp;

    /* copy the digits of a into W[0..a->used-1] */
-    for (ix = 0; ix < a->used; ix++) {
-      *_W++ = *tmpa++;
+    for (ix = 0; ix < x->used; ix++) {
+      *_W++ = *tmpx++;
    }

    /* zero the high words of W[a->used..m->used*2] */
-    for (; ix < m->used * 2 + 1; ix++) {
+    for (; ix < n->used * 2 + 1; ix++) {
      *_W++ = 0;
    }
  }

-  for (ix = 0; ix < m->used; ix++) {
-    /* ui = ai * m' mod b
+  for (ix = 0; ix < n->used; ix++) {
+    /* mu = ai * m' mod b
     *
     * We avoid a double precision multiplication (which isn't required)
-     * by casting the value down to a mp_digit.  Note this requires that W[ix-1] have
-     * the carry cleared (see after the inner loop)
+     * by casting the value down to a mp_digit.  Note this requires 
+     * that W[ix-1] have  the carry cleared (see after the inner loop)
     */
-    register mp_digit ui;
-    ui = (((mp_digit) (W[ix] & MP_MASK)) * mp) & MP_MASK;
+    register mp_digit mu;
+    mu = (((mp_digit) (W[ix] & MP_MASK)) * rho) & MP_MASK;

-    /* a = a + ui * m * b^i
+    /* a = a + mu * m * b**i
     *
     * This is computed in place and on the fly.  The multiplication
-     * by b^i is handled by offseting which columns the results
+     * by b**i is handled by offseting which columns the results
     * are added to.
     *
-     * Note the comba method normally doesn't handle carries in the inner loop
-     * In this case we fix the carry from the previous column since the Montgomery
-     * reduction requires digits of the result (so far) [see above] to work.  This is
-     * handled by fixing up one carry after the inner loop.  The carry fixups are done
-     * in order so after these loops the first m->used words of W[] have the carries
-     * fixed
+     * Note the comba method normally doesn't handle carries in the 
+     * inner loop In this case we fix the carry from the previous 
+     * column since the Montgomery reduction requires digits of the 
+     * result (so far) [see above] to work.  This is
+     * handled by fixing up one carry after the inner loop.  The 
+     * carry fixups are done in order so after these loops the 
+     * first m->used words of W[] have the carries fixed
     */
    {
      register int iy;
-      register mp_digit *tmpx;
+      register mp_digit *tmpn;
      register mp_word *_W;

      /* alias for the digits of the modulus */
-      tmpx = m->dp;
+      tmpn = n->dp;

      /* Alias for the columns set by an offset of ix */
      _W = W + ix;

      /* inner loop */
-      for (iy = 0; iy < m->used; iy++) {
-    *_W++ += ((mp_word) ui) * ((mp_word) * tmpx++);
+      for (iy = 0; iy < n->used; iy++) {
+          *_W++ += ((mp_word) mu) * ((mp_word) * tmpn++);
      }
    }

@ -102,44 +103,44 @@ fast_mp_montgomery_reduce (mp_int * a, mp_int * m, mp_digit mp)


  {
-    register mp_digit *tmpa;
+    register mp_digit *tmpx;
    register mp_word *_W, *_W1;

    /* nox fix rest of carries */
    _W1 = W + ix;
    _W = W + ++ix;

-    for (; ix <= m->used * 2 + 1; ix++) {
+    for (; ix <= n->used * 2 + 1; ix++) {
      *_W++ += *_W1++ >> ((mp_word) DIGIT_BIT);
    }

-    /* copy out, A = A/b^n
+    /* copy out, A = A/b**n
     *
-     * The result is A/b^n but instead of converting from an array of mp_word
-     * to mp_digit than calling mp_rshd we just copy them in the right
-     * order
+     * The result is A/b**n but instead of converting from an 
+     * array of mp_word to mp_digit than calling mp_rshd 
+     * we just copy them in the right order
     */
-    tmpa = a->dp;
-    _W = W + m->used;
+    tmpx = x->dp;
+    _W = W + n->used;

-    for (ix = 0; ix < m->used + 1; ix++) {
-      *tmpa++ = *_W++ & ((mp_word) MP_MASK);
+    for (ix = 0; ix < n->used + 1; ix++) {
+      *tmpx++ = *_W++ & ((mp_word) MP_MASK);
    }

    /* zero oldused digits, if the input a was larger than
     * m->used+1 we'll have to clear the digits */
    for (; ix < olduse; ix++) {
-      *tmpa++ = 0;
+      *tmpx++ = 0;
    }
  }

  /* set the max used and clamp */
-  a->used = m->used + 1;
-  mp_clamp (a);
+  x->used = n->used + 1;
+  mp_clamp (x);

  /* if A >= m then A = A - m */
-  if (mp_cmp_mag (a, m) != MP_LT) {
-    return s_mp_sub (a, m, a);
+  if (mp_cmp_mag (x, n) != MP_LT) {
+    return s_mp_sub (x, n, x);
  }
  return MP_OKAY;
 }
--- a/bn_fast_s_mp_sqr.c
+++ b/bn_fast_s_mp_sqr.c
@ -16,15 +16,17 @@

 /* fast squaring
 *
- * This is the comba method where the columns of the product are computed first
- * then the carries are computed.  This has the effect of making a very simple
- * inner loop that is executed the most
+ * This is the comba method where the columns of the product 
+ * are computed first then the carries are computed.  This 
+ * has the effect of making a very simple inner loop that 
+ * is executed the most
 *
 * W2 represents the outer products and W the inner.
 *
- * A further optimizations is made because the inner products are of the form
- * "A * B * 2".  The *2 part does not need to be computed until the end which is
- * good because 64-bit shifts are slow!
+ * A further optimizations is made because the inner 
+ * products are of the form "A * B * 2".  The *2 part does 
+ * not need to be computed until the end which is good 
+ * because 64-bit shifts are slow!
 *
 * Based on Algorithm 14.16 on pp.597 of HAC.
 *
@ -48,26 +50,15 @@ fast_s_mp_sqr (mp_int * a, mp_int * b)
   * Note that there are two buffers.  Since squaring requires
   * a outter and inner product and the inner product requires
   * computing a product and doubling it (a relatively expensive
-   * op to perform n^2 times if you don't have to) the inner and
+   * op to perform n**2 times if you don't have to) the inner and
   * outer products are computed in different buffers.  This way
   * the inner product can be doubled using n doublings instead of
-   * n^2
+   * n**2
   */
  memset (W, 0, newused * sizeof (mp_word));
  memset (W2, 0, newused * sizeof (mp_word));

-/* note optimization
- * values in W2 are only written in even locations which means
- * we can collapse the array to 256 words [and fixup the memset above]
- * provided we also fix up the summations below.  Ideally
- * the fixup loop should be unrolled twice to handle the even/odd
- * cases, and then a final step to handle odd cases [e.g. newused == odd]
- *
- * This will not only save ~8*256 = 2KB of stack but lower the number of
- * operations required to finally fix up the columns
- */
-
-  /* This computes the inner product.  To simplify the inner N^2 loop
+  /* This computes the inner product.  To simplify the inner N**2 loop
   * the multiplication by two is done afterwards in the N loop.
   */
  for (ix = 0; ix < pa; ix++) {
@ -101,18 +92,19 @@ fast_s_mp_sqr (mp_int * a, mp_int * b)
  }

  /* setup dest */
-  olduse = b->used;
+  olduse  = b->used;
  b->used = newused;

-  /* double first value, since the inner products are half of what they should be */
-  W[0] += W[0] + W2[0];
-
  /* now compute digits */
  {
    register mp_digit *tmpb;

-    tmpb = b->dp;
+    /* double first value, since the inner products are 
+     * half of what they should be 
+     */
+    W[0] += W[0] + W2[0];

+    tmpb = b->dp;
    for (ix = 1; ix < newused; ix++) {
      /* double/add next digit */
      W[ix] += W[ix] + W2[ix];
@ -120,9 +112,13 @@ fast_s_mp_sqr (mp_int * a, mp_int * b)
      W[ix] = W[ix] + (W[ix - 1] >> ((mp_word) DIGIT_BIT));
      *tmpb++ = (mp_digit) (W[ix - 1] & ((mp_word) MP_MASK));
    }
+    /* set the last value.  Note even if the carry is zero 
+     * this is required since the next step will not zero 
+     * it if b originally had a value at b->dp[2*a.used]
+     */
    *tmpb++ = (mp_digit) (W[(newused) - 1] & ((mp_word) MP_MASK));

-    /* clear high */
+    /* clear high digits */
    for (; ix < olduse; ix++) {
      *tmpb++ = 0;
    }
--- a/bn_mp_2expt.c
+++ b/bn_mp_2expt.c
@ -14,7 +14,7 @@
 */
 #include <tommath.h>

-/* computes a = 2^b 
+/* computes a = 2**b 
 *
 * Simple algorithm which zeroes the int, grows it then just sets one bit
 * as required.
--- a/bn_mp_copy.c
+++ b/bn_mp_copy.c
@ -21,7 +21,7 @@ mp_copy (mp_int * a, mp_int * b)
  int     res, n;

  /* if dst == src do nothing */
-  if (a == b || a->dp == b->dp) {
+  if (a == b) {
    return MP_OKAY;
  }

--- a/bn_mp_count_bits.c
+++ b/bn_mp_count_bits.c
@ -21,11 +21,15 @@ mp_count_bits (mp_int * a)
  int     r;
  mp_digit q;

+  /* shortcut */
  if (a->used == 0) {
    return 0;
  }

+  /* get number of digits and add that */
  r = (a->used - 1) * DIGIT_BIT;
+  
+  /* take the last digit and count the bits in it */
  q = a->dp[a->used - 1];
  while (q > ((mp_digit) 0)) {
    ++r;
--- a/bn_mp_div_2d.c
+++ b/bn_mp_div_2d.c
@ -14,7 +14,7 @@
 */
 #include <tommath.h>

-/* shift right by a certain bit count (store quotient in c, remainder in d) */
+/* shift right by a certain bit count (store quotient in c, optional remainder in d) */
 int
 mp_div_2d (mp_int * a, int b, mp_int * c, mp_int * d)
 {
@ -81,7 +81,6 @@ mp_div_2d (mp_int * a, int b, mp_int * c, mp_int * d)
    }
  }
  mp_clamp (c);
-  res = MP_OKAY;
  if (d != NULL) {
    mp_exch (&t, d);
  }
--- a/bn_mp_div_3.c
+++ b/bn_mp_div_3.c
@ -0,0 +1,64 @@
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is library that provides for multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library is designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+#include <tommath.h>
+
+/* divide by three (based on routine from MPI and the GMP manual) */
+int
+mp_div_3 (mp_int * a, mp_int *c, mp_digit * d)
+{
+  mp_int   q;
+  mp_word  w, t;
+  mp_digit b;
+  int      res, ix;
+  
+  /* b = 2**DIGIT_BIT / 3 */
+  b = (((mp_word)1) << ((mp_word)DIGIT_BIT)) / ((mp_word)3);
+
+  if ((res = mp_init_size(&q, a->used)) != MP_OKAY) {
+     return res;
+  }
+  
+  q.used = a->used;
+  q.sign = a->sign;
+  w = 0;
+  for (ix = a->used - 1; ix >= 0; ix--) {
+     w = (w << ((mp_word)DIGIT_BIT)) | ((mp_word)a->dp[ix]);
+     
+     if (w >= 3) {
+        t = (w * ((mp_word)b)) >> ((mp_word)DIGIT_BIT);
+        w -= (t << ((mp_word)1)) + t;
+        while (w >= 3) {
+           t += 1;
+           w -= 3;
+        }
+      } else {
+        t = 0;
+      }
+      q.dp[ix] = t;
+  }
+  
+  if (d != NULL) {
+     *d = w;
+  }
+  
+  if (c != NULL) {
+     mp_clamp(&q);
+     mp_exch(&q, c);
+  }
+  mp_clear(&q);
+  
+  return res;
+}
+
--- a/bn_mp_div_d.c
+++ b/bn_mp_div_d.c
@ -14,31 +14,51 @@
 */
 #include <tommath.h>

-/* single digit division */
+/* single digit division (based on routine from MPI) */
 int
 mp_div_d (mp_int * a, mp_digit b, mp_int * c, mp_digit * d)
 {
-  mp_int  t, t2;
-  int     res;
-
-  if ((res = mp_init (&t)) != MP_OKAY) {
-    return res;
+  mp_int  q;
+  mp_word w, t;
+  int     res, ix;
+  
+  if (b == 0) {
+     return MP_VAL;
  }
-
-  if ((res = mp_init (&t2)) != MP_OKAY) {
-    mp_clear (&t);
-    return res;
+  
+  if (b == 3) {
+     return mp_div_3(a, c, d);
  }
-
-  mp_set (&t, b);
-  res = mp_div (a, &t, c, &t2);
-
-  /* set remainder if not null */
+  
+  if ((res = mp_init_size(&q, a->used)) != MP_OKAY) {
+     return res;
+  }
+  
+  q.used = a->used;
+  q.sign = a->sign;
+  w = 0;
+  for (ix = a->used - 1; ix >= 0; ix--) {
+     w = (w << ((mp_word)DIGIT_BIT)) | ((mp_word)a->dp[ix]);
+     
+     if (w >= b) {
+        t = w / b;
+        w = w % b;
+      } else {
+        t = 0;
+      }
+      q.dp[ix] = t;
+  }
+  
  if (d != NULL) {
-    *d = t2.dp[0];
+     *d = w;
  }
-
-  mp_clear (&t);
-  mp_clear (&t2);
+  
+  if (c != NULL) {
+     mp_clamp(&q);
+     mp_exch(&q, c);
+  }
+  mp_clear(&q);
+  
  return res;
 }
+
--- a/bn_mp_dr_reduce.c
+++ b/bn_mp_dr_reduce.c
@ -14,7 +14,7 @@
 */
 #include <tommath.h>

-/* reduce "a" in place modulo "b" using the Diminished Radix algorithm.
+/* reduce "x" in place modulo "n" using the Diminished Radix algorithm.
 *
 * Based on algorithm from the paper
 *
@ -23,107 +23,64 @@
 *          POSTECH Information Research Laboratories
 *
 * The modulus must be of a special format [see manual]
+ *
+ * Has been modified to use algorithm 7.10 from the LTM book instead
 */
 int
-mp_dr_reduce (mp_int * a, mp_int * b, mp_digit mp)
+mp_dr_reduce (mp_int * x, mp_int * n, mp_digit k)
 {
-  int     err, i, j, k;
-  mp_word r;
-  mp_digit mu, *tmpj, *tmpi;
-
-  /* k = digits in modulus */
-  k = b->used;
-
-  /* ensure that "a" has at least 2k digits */
-  if (a->alloc < k + k) {
-    if ((err = mp_grow (a, k + k)) != MP_OKAY) {
+  int      err, i, m;
+  mp_word  r;
+  mp_digit mu, *tmpx1, *tmpx2;
+  
+  /* m = digits in modulus */
+  m = n->used;
+  
+  /* ensure that "x" has at least 2m digits */
+  if (x->alloc < m + m) {
+    if ((err = mp_grow (x, m + m)) != MP_OKAY) {
      return err;
    }
  }

-  /* alias for a->dp[i] */
-  tmpi = a->dp + k + k - 1;
-
-  /* for (i = 2k - 1; i >= k; i = i - 1)
-   *
-   * This is the main loop of the reduction.  Note that at the end
-   * the words above position k are not zeroed as expected.  The end
-   * result is that the digits from 0 to k-1 are the residue.  So
-   * we have to clear those afterwards.
-   */
-  for (i = k + k - 1; i >= k; i = i - 1) {
-    /* x[i - 1 : i - k] += x[i]*mp */
-
-    /* x[i] * mp */
-    r = ((mp_word) *tmpi--) * ((mp_word) mp);
-
-    /* now add r to x[i-1:i-k]
-     *
-     * First add it to the first digit x[i-k] then form the carry
-     * then enter the main loop
-     */
-    j = i - k;
-
-    /* alias for a->dp[j] */
-    tmpj = a->dp + j;
-
-    /* add digit */
-    *tmpj += (mp_digit)(r & MP_MASK);
-
-    /* this is the carry */
-    mu = (r >> ((mp_word) DIGIT_BIT)) + (*tmpj >> DIGIT_BIT);
-
-    /* clear carry from a->dp[j]  */
-    *tmpj++ &= MP_MASK;
-
-    /* now add rest of the digits
-     *
-     * Note this is basically a simple single digit addition to
-     * a larger multiple digit number.  This is optimized somewhat
-     * because the propagation of carries is not likely to move
-     * more than a few digits.
-     *
-     */
-    for (++j; mu != 0 && j <= (i - 1); ++j) {
-      *tmpj   += mu;
-      mu       = *tmpj >> DIGIT_BIT;
-      *tmpj++ &= MP_MASK;
-    }
-
-    /* if final carry */
-    if (mu != 0) {
-      /* add mp to this to correct */
-      j = i - k;
-      tmpj = a->dp + j;
-
-      *tmpj += mp;
-      mu = *tmpj >> DIGIT_BIT;
-      *tmpj++ &= MP_MASK;
-
-      /* now handle carries */
-      for (++j; mu != 0 && j <= (i - 1); j++) {
-          *tmpj   += mu;
-          mu       = *tmpj >> DIGIT_BIT;
-          *tmpj++ &= MP_MASK;
-      }
-    }
+/* top of loop, this is where the code resumes if 
+ * another reduction pass is required.
+ */
+top:
+  /* aliases for digits */
+  /* alias for lower half of x */
+  tmpx1 = x->dp;
+  
+  /* alias for upper half of x, or x/B**m */
+  tmpx2 = x->dp + m;
+  
+  /* set carry to zero */
+  mu = 0;
+  
+  /* compute (x mod B**m) + mp * [x/B**m] inline and inplace */
+  for (i = 0; i < m; i++) {
+      r         = ((mp_word)*tmpx2++) * ((mp_word)k) + *tmpx1 + mu;
+      *tmpx1++  = r & MP_MASK;
+      mu        = r >> ((mp_word)DIGIT_BIT);
  }
-
-  /* zero words above k */
-  tmpi = a->dp + k;
-  for (i = k; i < a->used; i++) {
-      *tmpi++ = 0;
+  
+  /* set final carry */
+  *tmpx1++ = mu;
+  
+  /* zero words above m */
+  for (i = m + 1; i < x->used; i++) {
+      *tmpx1++ = 0;
  }

  /* clamp, sub and return */
-  mp_clamp (a);
+  mp_clamp (x);

-  /* if a >= b [b == modulus] then subtract the modulus to fix up */
-  if (mp_cmp_mag (a, b) != MP_LT) {
-    return s_mp_sub (a, b, a);
+  /* if x >= n then subtract and reduce again 
+   * Each successive "recursion" makes the input smaller and smaller.
+   */
+  if (mp_cmp_mag (x, n) != MP_LT) {
+    s_mp_sub(x, n, x);
+    goto top;
  }
  return MP_OKAY;
 }
-
-
-
--- a/bn_mp_dr_setup.c
+++ b/bn_mp_dr_setup.c
@ -20,6 +20,7 @@ void mp_dr_setup(mp_int *a, mp_digit *d)
   /* the casts are required if DIGIT_BIT is one less than
    * the number of bits in a mp_digit [e.g. DIGIT_BIT==31]
    */
-   *d = (mp_digit)((((mp_word)1) << ((mp_word)DIGIT_BIT)) - ((mp_word)a->dp[0]));
+   *d = (mp_digit)((((mp_word)1) << ((mp_word)DIGIT_BIT)) - 
+        ((mp_word)a->dp[0]));
 }

--- a/bn_mp_expt_d.c
+++ b/bn_mp_expt_d.c
@ -14,7 +14,7 @@
 */
 #include <tommath.h>

-/* calculate c = a^b  using a square-multiply algorithm */
+/* calculate c = a**b  using a square-multiply algorithm */
 int
 mp_expt_d (mp_int * a, mp_digit b, mp_int * c)
 {
--- a/bn_mp_exptmod.c
+++ b/bn_mp_exptmod.c
@ -14,7 +14,6 @@
 */
 #include <tommath.h>

-static int f_mp_exptmod (mp_int * G, mp_int * X, mp_int * P, mp_int * Y);

 /* this is a shell function that calls either the normal or Montgomery
 * exptmod functions.  Originally the call to the montgomery code was
@ -55,212 +54,22 @@ mp_exptmod (mp_int * G, mp_int * X, mp_int * P, mp_int * Y)
        return err;
     }

-     /* and now compute (1/G)^|X| instead of G^X [X < 0] */
+     /* and now compute (1/G)**|X| instead of G**X [X < 0] */
     err = mp_exptmod(&tmpG, &tmpX, P, Y);
     mp_clear_multi(&tmpG, &tmpX, NULL);
     return err;
  }

-
  dr = mp_dr_is_modulus(P);
+  if (dr == 0) {
+     dr = mp_reduce_is_2k(P) << 1;
+  }
+  
  /* if the modulus is odd use the fast method */
-  if ((mp_isodd (P) == 1 || dr == 1) && P->used > 4) {
+  if ((mp_isodd (P) == 1 || dr !=  0) && P->used > 4) {
    return mp_exptmod_fast (G, X, P, Y, dr);
  } else {
-    return f_mp_exptmod (G, X, P, Y);
+    return s_mp_exptmod (G, X, P, Y);
  }
 }

-static int
-f_mp_exptmod (mp_int * G, mp_int * X, mp_int * P, mp_int * Y)
-{
-  mp_int  M[256], res, mu;
-  mp_digit buf;
-  int     err, bitbuf, bitcpy, bitcnt, mode, digidx, x, y, winsize;
-
-  /* find window size */
-  x = mp_count_bits (X);
-  if (x <= 7) {
-    winsize = 2;
-  } else if (x <= 36) {
-    winsize = 3;
-  } else if (x <= 140) {
-    winsize = 4;
-  } else if (x <= 450) {
-    winsize = 5;
-  } else if (x <= 1303) {
-    winsize = 6;
-  } else if (x <= 3529) {
-    winsize = 7;
-  } else {
-    winsize = 8;
-  }
-
-#ifdef MP_LOW_MEM
-    if (winsize > 5) {
-       winsize = 5;
-    }
-#endif
-
-  /* init G array */
-  for (x = 0; x < (1 << winsize); x++) {
-    if ((err = mp_init_size (&M[x], 1)) != MP_OKAY) {
-      for (y = 0; y < x; y++) {
-        mp_clear (&M[y]);
-      }
-      return err;
-    }
-  }
-
-  /* create mu, used for Barrett reduction */
-  if ((err = mp_init (&mu)) != MP_OKAY) {
-    goto __M;
-  }
-  if ((err = mp_reduce_setup (&mu, P)) != MP_OKAY) {
-    goto __MU;
-  }
-
-  /* create M table
-   *
-   * The M table contains powers of the input base, e.g. M[x] = G^x mod P
-   *
-   * The first half of the table is not computed though accept for M[0] and M[1]
-   */
-  if ((err = mp_mod (G, P, &M[1])) != MP_OKAY) {
-    goto __MU;
-  }
-
-  /* compute the value at M[1<<(winsize-1)] by squaring M[1] (winsize-1) times */
-  if ((err = mp_copy (&M[1], &M[1 << (winsize - 1)])) != MP_OKAY) {
-    goto __MU;
-  }
-
-  for (x = 0; x < (winsize - 1); x++) {
-    if ((err = mp_sqr (&M[1 << (winsize - 1)], &M[1 << (winsize - 1)])) != MP_OKAY) {
-      goto __MU;
-    }
-    if ((err = mp_reduce (&M[1 << (winsize - 1)], P, &mu)) != MP_OKAY) {
-      goto __MU;
-    }
-  }
-
-  /* create upper table */
-  for (x = (1 << (winsize - 1)) + 1; x < (1 << winsize); x++) {
-    if ((err = mp_mul (&M[x - 1], &M[1], &M[x])) != MP_OKAY) {
-      goto __MU;
-    }
-    if ((err = mp_reduce (&M[x], P, &mu)) != MP_OKAY) {
-      goto __MU;
-    }
-  }
-
-  /* setup result */
-  if ((err = mp_init (&res)) != MP_OKAY) {
-    goto __MU;
-  }
-  mp_set (&res, 1);
-
-  /* set initial mode and bit cnt */
-  mode   = 0;
-  bitcnt = 1;
-  buf    = 0;
-  digidx = X->used - 1;
-  bitcpy = bitbuf = 0;
-
-  for (;;) {
-    /* grab next digit as required */
-    if (--bitcnt == 0) {
-      if (digidx == -1) {
-        break;
-      }
-      buf = X->dp[digidx--];
-      bitcnt = (int) DIGIT_BIT;
-    }
-
-    /* grab the next msb from the exponent */
-    y = (buf >> (mp_digit)(DIGIT_BIT - 1)) & 1;
-    buf <<= (mp_digit)1;
-
-    /* if the bit is zero and mode == 0 then we ignore it
-     * These represent the leading zero bits before the first 1 bit
-     * in the exponent.  Technically this opt is not required but it
-     * does lower the # of trivial squaring/reductions used
-     */
-    if (mode == 0 && y == 0)
-      continue;
-
-    /* if the bit is zero and mode == 1 then we square */
-    if (mode == 1 && y == 0) {
-      if ((err = mp_sqr (&res, &res)) != MP_OKAY) {
-        goto __RES;
-      }
-      if ((err = mp_reduce (&res, P, &mu)) != MP_OKAY) {
-        goto __RES;
-      }
-      continue;
-    }
-
-    /* else we add it to the window */
-    bitbuf |= (y << (winsize - ++bitcpy));
-    mode = 2;
-
-    if (bitcpy == winsize) {
-      /* ok window is filled so square as required and multiply  */
-      /* square first */
-      for (x = 0; x < winsize; x++) {
-        if ((err = mp_sqr (&res, &res)) != MP_OKAY) {
-          goto __RES;
-        }
-        if ((err = mp_reduce (&res, P, &mu)) != MP_OKAY) {
-          goto __RES;
-        }
-      }
-
-      /* then multiply */
-      if ((err = mp_mul (&res, &M[bitbuf], &res)) != MP_OKAY) {
-        goto __MU;
-      }
-      if ((err = mp_reduce (&res, P, &mu)) != MP_OKAY) {
-        goto __MU;
-      }
-
-      /* empty window and reset */
-      bitcpy = bitbuf = 0;
-      mode = 1;
-    }
-  }
-
-  /* if bits remain then square/multiply */
-  if (mode == 2 && bitcpy > 0) {
-    /* square then multiply if the bit is set */
-    for (x = 0; x < bitcpy; x++) {
-      if ((err = mp_sqr (&res, &res)) != MP_OKAY) {
-        goto __RES;
-      }
-      if ((err = mp_reduce (&res, P, &mu)) != MP_OKAY) {
-        goto __RES;
-      }
-
-      bitbuf <<= 1;
-      if ((bitbuf & (1 << winsize)) != 0) {
-        /* then multiply */
-        if ((err = mp_mul (&res, &M[1], &res)) != MP_OKAY) {
-          goto __RES;
-        }
-        if ((err = mp_reduce (&res, P, &mu)) != MP_OKAY) {
-          goto __RES;
-        }
-      }
-    }
-  }
-
-  mp_exch (&res, Y);
-  err = MP_OKAY;
-__RES:mp_clear (&res);
-__MU:mp_clear (&mu);
-__M:
-  for (x = 0; x < (1 << winsize); x++) {
-    mp_clear (&M[x]);
-  }
-  return err;
-}
--- a/bn_mp_exptmod_fast.c
+++ b/bn_mp_exptmod_fast.c
@ -27,6 +27,11 @@ mp_exptmod_fast (mp_int * G, mp_int * X, mp_int * P, mp_int * Y, int redmode)
  mp_int  M[256], res;
  mp_digit buf, mp;
  int     err, bitbuf, bitcpy, bitcnt, mode, digidx, x, y, winsize;
+  
+  /* use a pointer to the reduction algorithm.  This allows us to use
+   * one of many reduction algorithms without modding the guts of
+   * the code with if statements everywhere.  
+   */
  int     (*redux)(mp_int*,mp_int*,mp_digit);

  /* find window size */
@ -64,6 +69,7 @@ mp_exptmod_fast (mp_int * G, mp_int * X, mp_int * P, mp_int * Y, int redmode)
    }
  }

+  /* determine and setup reduction code */
  if (redmode == 0) {
     /* now setup montgomery  */
     if ((err = mp_montgomery_setup (P, &mp)) != MP_OKAY) {
@ -71,17 +77,23 @@ mp_exptmod_fast (mp_int * G, mp_int * X, mp_int * P, mp_int * Y, int redmode)
     }
     
     /* automatically pick the comba one if available (saves quite a few calls/ifs) */
-     if ( ((P->used * 2 + 1) < MP_WARRAY) &&
+     if (((P->used * 2 + 1) < MP_WARRAY) &&
          P->used < (1 << ((CHAR_BIT * sizeof (mp_word)) - (2 * DIGIT_BIT)))) {
        redux = fast_mp_montgomery_reduce;
     } else {
        /* use slower baselien method */
        redux = mp_montgomery_reduce;
     }
-  } else {
+  } else if (redmode == 1) {
     /* setup DR reduction */
     mp_dr_setup(P, &mp);
     redux = mp_dr_reduce;
+  } else {
+     /* setup 2k reduction */
+     if ((err = mp_reduce_2k_setup(P, &mp)) != MP_OKAY) {
+        goto __M;
+     }
+     redux = mp_reduce_2k;
  }

  /* setup result */
@ -142,7 +154,8 @@ mp_exptmod_fast (mp_int * G, mp_int * X, mp_int * P, mp_int * Y, int redmode)
  bitcnt = 1;
  buf    = 0;
  digidx = X->used - 1;
-  bitcpy = bitbuf = 0;
+  bitcpy = 0;
+  bitbuf = 0;

  for (;;) {
    /* grab next digit as required */
@ -203,7 +216,8 @@ mp_exptmod_fast (mp_int * G, mp_int * X, mp_int * P, mp_int * Y, int redmode)
      }

      /* empty window and reset */
-      bitcpy = bitbuf = 0;
+      bitcpy = 0;
+      bitbuf = 0;
      mode = 1;
    }
  }
@ -233,7 +247,7 @@ mp_exptmod_fast (mp_int * G, mp_int * X, mp_int * P, mp_int * Y, int redmode)
  }

  if (redmode == 0) {
-     /* fixup result */
+     /* fixup result if Montgomery reduction is used */
     if ((err = mp_montgomery_reduce (&res, P, mp)) != MP_OKAY) {
       goto __RES;
     }
--- a/bn_mp_init.c
+++ b/bn_mp_init.c
@ -24,7 +24,7 @@ mp_init (mp_int * a)
    return MP_MEM;
  }

-  /* set the used to zero, allocated digit to the default precision
+  /* set the used to zero, allocated digits to the default precision
   * and sign to positive */
  a->used  = 0;
  a->alloc = MP_PREC;
--- a/bn_mp_karatsuba_mul.c
+++ b/bn_mp_karatsuba_mul.c
@ -14,24 +14,34 @@
 */
 #include <tommath.h>

-/* c = |a| * |b| using Karatsuba Multiplication using three half size multiplications
+/* c = |a| * |b| using Karatsuba Multiplication using 
+ * three half size multiplications
 *
- * Let B represent the radix [e.g. 2**DIGIT_BIT] and let n represent half of the number of digits in the min(a,b)
+ * Let B represent the radix [e.g. 2**DIGIT_BIT] and 
+ * let n represent half of the number of digits in 
+ * the min(a,b)
 *
- * a = a1 * B^n + a0
- * b = b1 * B^n + b0
+ * a = a1 * B**n + a0
+ * b = b1 * B**n + b0
 *
- * Then, a * b => a1b1 * B^2n + ((a1 - b1)(a0 - b0) + a0b0 + a1b1) * B + a0b0
+ * Then, a * b => 
+   a1b1 * B**2n + ((a1 - a0)(b1 - b0) + a0b0 + a1b1) * B + a0b0
 *
- * Note that a1b1 and a0b0 are used twice and only need to be computed once.  So in total
- * three half size (half # of digit) multiplications are performed, a0b0, a1b1 and (a1-b1)(a0-b0)
+ * Note that a1b1 and a0b0 are used twice and only need to be 
+ * computed once.  So in total three half size (half # of 
+ * digit) multiplications are performed, a0b0, a1b1 and 
+ * (a1-b1)(a0-b0)
 *
- * Note that a multiplication of half the digits requires 1/4th the number of single precision 
- * multiplications so in total after one call 25% of the single precision multiplications are saved.
- * Note also that the call to mp_mul can end up back in this function if the a0, a1, b0, or b1 are above
- * the threshold.  This is known as divide-and-conquer and leads to the famous O(N^lg(3)) or O(N^1.584) work which
- * is asymptopically lower than the standard O(N^2) that the baseline/comba methods use.  Generally though the 
- * overhead of this method doesn't pay off until a certain size (N ~ 80) is reached.
+ * Note that a multiplication of half the digits requires
+ * 1/4th the number of single precision multiplications so in 
+ * total after one call 25% of the single precision multiplications 
+ * are saved.  Note also that the call to mp_mul can end up back 
+ * in this function if the a0, a1, b0, or b1 are above the threshold.  
+ * This is known as divide-and-conquer and leads to the famous 
+ * O(N**lg(3)) or O(N**1.584) work which is asymptopically lower than 
+ * the standard O(N**2) that the baseline/comba methods use.  
+ * Generally though the overhead of this method doesn't pay off 
+ * until a certain size (N ~ 80) is reached.
 */
 int
 mp_karatsuba_mul (mp_int * a, mp_int * b, mp_int * c)
@ -101,14 +111,15 @@ mp_karatsuba_mul (mp_int * a, mp_int * b, mp_int * c)
    }
  }

-  /* only need to clamp the lower words since by definition the upper words x1/y1 must
-   * have a known number of digits
+  /* only need to clamp the lower words since by definition the 
+   * upper words x1/y1 must have a known number of digits
   */
  mp_clamp (&x0);
  mp_clamp (&y0);

  /* now calc the products x0y0 and x1y1 */
-  if (mp_mul (&x0, &y0, &x0y0) != MP_OKAY)  /* after this x0 is no longer required, free temp [x0==t2]! */
+  /* after this x0 is no longer required, free temp [x0==t2]! */
+  if (mp_mul (&x0, &y0, &x0y0) != MP_OKAY)  
    goto X1Y1;          /* x0y0 = x0*y0 */
  if (mp_mul (&x1, &y1, &x1y1) != MP_OKAY)
    goto X1Y1;          /* x1y1 = x1*y1 */
--- a/bn_mp_karatsuba_sqr.c
+++ b/bn_mp_karatsuba_sqr.c
@ -14,10 +14,12 @@
 */
 #include <tommath.h>

-/* Karatsuba squaring, computes b = a*a using three half size squarings
+/* Karatsuba squaring, computes b = a*a using three 
+ * half size squarings
 *
- * See comments of mp_karatsuba_mul for details.  It is essentially the same algorithm
- * but merely tuned to perform recursive squarings.
+ * See comments of mp_karatsuba_mul for details.  It 
+ * is essentially the same algorithm but merely 
+ * tuned to perform recursive squarings.
 */
 int
 mp_karatsuba_sqr (mp_int * a, mp_int * b)
@ -74,32 +76,32 @@ mp_karatsuba_sqr (mp_int * a, mp_int * b)

  /* now calc the products x0*x0 and x1*x1 */
  if (mp_sqr (&x0, &x0x0) != MP_OKAY)
-    goto X1X1;                  /* x0x0 = x0*x0 */
+    goto X1X1;           /* x0x0 = x0*x0 */
  if (mp_sqr (&x1, &x1x1) != MP_OKAY)
-    goto X1X1;                  /* x1x1 = x1*x1 */
+    goto X1X1;           /* x1x1 = x1*x1 */

-  /* now calc (x1-x0)^2 */
+  /* now calc (x1-x0)**2 */
  if (mp_sub (&x1, &x0, &t1) != MP_OKAY)
-    goto X1X1;                  /* t1 = x1 - x0 */
+    goto X1X1;           /* t1 = x1 - x0 */
  if (mp_sqr (&t1, &t1) != MP_OKAY)
-    goto X1X1;                  /* t1 = (x1 - x0) * (x1 - x0) */
+    goto X1X1;           /* t1 = (x1 - x0) * (x1 - x0) */

  /* add x0y0 */
  if (s_mp_add (&x0x0, &x1x1, &t2) != MP_OKAY)
-    goto X1X1;                  /* t2 = x0y0 + x1y1 */
+    goto X1X1;           /* t2 = x0x0 + x1x1 */
  if (mp_sub (&t2, &t1, &t1) != MP_OKAY)
-    goto X1X1;                  /* t1 = x0y0 + x1y1 - (x1-x0)*(y1-y0) */
+    goto X1X1;           /* t1 = x0x0 + x1x1 - (x1-x0)*(x1-x0) */

  /* shift by B */
  if (mp_lshd (&t1, B) != MP_OKAY)
-    goto X1X1;                  /* t1 = (x0y0 + x1y1 - (x1-x0)*(y1-y0))<<B */
+    goto X1X1;           /* t1 = (x0x0 + x1x1 - (x1-x0)*(x1-x0))<<B */
  if (mp_lshd (&x1x1, B * 2) != MP_OKAY)
-    goto X1X1;                  /* x1y1 = x1y1 << 2*B */
+    goto X1X1;           /* x1x1 = x1x1 << 2*B */

  if (mp_add (&x0x0, &t1, &t1) != MP_OKAY)
-    goto X1X1;                  /* t1 = x0y0 + t1 */
+    goto X1X1;           /* t1 = x0x0 + t1 */
  if (mp_add (&t1, &x1x1, b) != MP_OKAY)
-    goto X1X1;                  /* t1 = x0y0 + t1 + x1y1 */
+    goto X1X1;           /* t1 = x0x0 + t1 + x1x1 */

  err = MP_OKAY;

--- a/bn_mp_lshd.c
+++ b/bn_mp_lshd.c
@ -33,29 +33,29 @@ mp_lshd (mp_int * a, int b)
  }

  {
-    register mp_digit *tmpa, *tmpaa;
+    register mp_digit *top, *bottom;

-    /* increment the used by the shift amount than copy upwards */
+    /* increment the used by the shift amount then copy upwards */
    a->used += b;

    /* top */
-    tmpa = a->dp + a->used - 1;
+    top = a->dp + a->used - 1;

    /* base */
-    tmpaa = a->dp + a->used - 1 - b;
+    bottom = a->dp + a->used - 1 - b;

    /* much like mp_rshd this is implemented using a sliding window
     * except the window goes the otherway around.  Copying from
     * the bottom to the top.  see bn_mp_rshd.c for more info.
     */
    for (x = a->used - 1; x >= b; x--) {
-      *tmpa-- = *tmpaa--;
+      *top-- = *bottom--;
    }

    /* zero the lower digits */
-    tmpa = a->dp;
+    top = a->dp;
    for (x = 0; x < b; x++) {
-      *tmpa++ = 0;
+      *top++ = 0;
    }
  }
  return MP_OKAY;
--- a/bn_mp_mod_d.c
+++ b/bn_mp_mod_d.c
@ -17,31 +17,5 @@
 int
 mp_mod_d (mp_int * a, mp_digit b, mp_digit * c)
 {
-  mp_int  t, t2;
-  int     res;
-
-
-  if ((res = mp_init (&t)) != MP_OKAY) {
-    return res;
-  }
-
-  if ((res = mp_init (&t2)) != MP_OKAY) {
-    mp_clear (&t);
-    return res;
-  }
-
-  mp_set (&t, b);
-  mp_div (a, &t, NULL, &t2);
-
-  if (t2.sign == MP_NEG) {
-    if ((res = mp_add_d (&t2, b, &t2)) != MP_OKAY) {
-      mp_clear (&t);
-      mp_clear (&t2);
-      return res;
-    }
-  }
-  *c = t2.dp[0];
-  mp_clear (&t);
-  mp_clear (&t2);
-  return MP_OKAY;
+  return mp_div_d(a, b, NULL, c);
 }
--- a/bn_mp_montgomery_reduce.c
+++ b/bn_mp_montgomery_reduce.c
@ -14,12 +14,12 @@
 */
 #include <tommath.h>

-/* computes xR^-1 == x (mod N) via Montgomery Reduction */
+/* computes xR**-1 == x (mod N) via Montgomery Reduction */
 int
-mp_montgomery_reduce (mp_int * a, mp_int * m, mp_digit mp)
+mp_montgomery_reduce (mp_int * x, mp_int * n, mp_digit rho)
 {
  int     ix, res, digs;
-  mp_digit ui;
+  mp_digit mu;

  /* can the fast reduction [comba] method be used?
   *
@ -27,55 +27,60 @@ mp_montgomery_reduce (mp_int * a, mp_int * m, mp_digit mp)
   * than the available columns [255 per default] since carries
   * are fixed up in the inner loop.
   */
-  digs = m->used * 2 + 1;
-  if ((digs < MP_WARRAY)
-      && m->used < (1 << ((CHAR_BIT * sizeof (mp_word)) - (2 * DIGIT_BIT)))) {
-    return fast_mp_montgomery_reduce (a, m, mp);
+  digs = n->used * 2 + 1;
+  if ((digs < MP_WARRAY) && 
+      n->used < 
+      (1 << ((CHAR_BIT * sizeof (mp_word)) - (2 * DIGIT_BIT)))) {
+    return fast_mp_montgomery_reduce (x, n, rho);
  }

  /* grow the input as required */
-  if (a->alloc < m->used * 2 + 1) {
-    if ((res = mp_grow (a, m->used * 2 + 1)) != MP_OKAY) {
+  if (x->alloc < digs) {
+    if ((res = mp_grow (x, digs)) != MP_OKAY) {
      return res;
    }
  }
-  a->used = m->used * 2 + 1;
+  x->used = digs;

-  for (ix = 0; ix < m->used; ix++) {
-    /* ui = ai * m' mod b */
-    ui = (a->dp[ix] * mp) & MP_MASK;
+  for (ix = 0; ix < n->used; ix++) {
+    /* mu = ai * m' mod b */
+    mu = (x->dp[ix] * rho) & MP_MASK;

-    /* a = a + ui * m * b^i */
+    /* a = a + mu * m * b**i */
    {
      register int iy;
-      register mp_digit *tmpx, *tmpy, mu;
+      register mp_digit *tmpn, *tmpx, u;
      register mp_word r;

      /* aliases */
-      tmpx = m->dp;
-      tmpy = a->dp + ix;
+      tmpn = n->dp;
+      tmpx = x->dp + ix;

-      mu = 0;
-      for (iy = 0; iy < m->used; iy++) {
-        r = ((mp_word) ui) * ((mp_word) * tmpx++) + ((mp_word) mu) + ((mp_word) * tmpy);
-        mu = (r >> ((mp_word) DIGIT_BIT));
-        *tmpy++ = (r & ((mp_word) MP_MASK));
+      /* set the carry to zero */
+      u = 0;
+      
+      /* Multiply and add in place */
+      for (iy = 0; iy < n->used; iy++) {
+        r = ((mp_word) mu) * ((mp_word) * tmpn++) + 
+            ((mp_word) u) + ((mp_word) * tmpx);
+        u = (r >> ((mp_word) DIGIT_BIT));
+        *tmpx++ = (r & ((mp_word) MP_MASK));
      }
      /* propagate carries */
-      while (mu) {
-        *tmpy += mu;
-        mu = (*tmpy >> DIGIT_BIT) & 1;
-        *tmpy++ &= MP_MASK;
+      while (u) {
+        *tmpx += u;
+        u = *tmpx >> DIGIT_BIT;
+        *tmpx++ &= MP_MASK;
      }
    }
  }

-  /* A = A/b^n */
-  mp_rshd (a, m->used);
+  /* x = x/b**n.used */
+  mp_rshd (x, n->used);

  /* if A >= m then A = A - m */
-  if (mp_cmp_mag (a, m) != MP_LT) {
-    return s_mp_sub (a, m, a);
+  if (mp_cmp_mag (x, n) != MP_LT) {
+    return s_mp_sub (x, n, x);
  }

  return MP_OKAY;
--- a/bn_mp_montgomery_setup.c
+++ b/bn_mp_montgomery_setup.c
@ -16,38 +16,38 @@

 /* setups the montgomery reduction stuff */
 int
-mp_montgomery_setup (mp_int * a, mp_digit * mp)
+mp_montgomery_setup (mp_int * n, mp_digit * rho)
 {
  mp_digit x, b;

-/* fast inversion mod 2^k
+/* fast inversion mod 2**k
 *
 * Based on the fact that
 *
- * XA = 1 (mod 2^n)  =>  (X(2-XA)) A = 1 (mod 2^2n)
- *                   =>  2*X*A - X*X*A*A = 1
- *                   =>  2*(1) - (1)     = 1
+ * XA = 1 (mod 2**n)  =>  (X(2-XA)) A = 1 (mod 2**2n)
+ *                    =>  2*X*A - X*X*A*A = 1
+ *                    =>  2*(1) - (1)     = 1
 */
-  b = a->dp[0];
+  b = n->dp[0];

  if ((b & 1) == 0) {
    return MP_VAL;
  }

-  x = (((b + 2) & 4) << 1) + b; /* here x*a==1 mod 2^4 */
-  x *= 2 - b * x;               /* here x*a==1 mod 2^8 */
+  x = (((b + 2) & 4) << 1) + b; /* here x*a==1 mod 2**4 */
+  x *= 2 - b * x;               /* here x*a==1 mod 2**8 */
 #if !defined(MP_8BIT)
-  x *= 2 - b * x;               /* here x*a==1 mod 2^16; each step doubles the nb of bits */
+  x *= 2 - b * x;               /* here x*a==1 mod 2**16 */
 #endif
 #if defined(MP_64BIT) || !(defined(MP_8BIT) || defined(MP_16BIT))
-  x *= 2 - b * x;               /* here x*a==1 mod 2^32 */
+  x *= 2 - b * x;               /* here x*a==1 mod 2**32 */
 #endif
 #ifdef MP_64BIT
-  x *= 2 - b * x;               /* here x*a==1 mod 2^64 */
+  x *= 2 - b * x;               /* here x*a==1 mod 2**64 */
 #endif

-  /* t = -1/m mod b */
-  *mp = (((mp_digit) 1 << ((mp_digit) DIGIT_BIT)) - x) & MP_MASK;
+  /* rho = -1/m mod b */
+  *rho = (((mp_digit) 1 << ((mp_digit) DIGIT_BIT)) - x) & MP_MASK;

  return MP_OKAY;
 }
--- a/bn_mp_mul.c
+++ b/bn_mp_mul.c
@ -20,19 +20,24 @@ mp_mul (mp_int * a, mp_int * b, mp_int * c)
 {
  int     res, neg;
  neg = (a->sign == b->sign) ? MP_ZPOS : MP_NEG;
-  if (MIN (a->used, b->used) > KARATSUBA_MUL_CUTOFF) {
+  
+  if (MIN (a->used, b->used) >= TOOM_MUL_CUTOFF) {
+    res = mp_toom_mul(a, b, c);
+  } else if (MIN (a->used, b->used) >= KARATSUBA_MUL_CUTOFF) {
    res = mp_karatsuba_mul (a, b, c);
  } else {

    /* can we use the fast multiplier?
     *
-     * The fast multiplier can be used if the output will have less than
-     * MP_WARRAY digits and the number of digits won't affect carry propagation
+     * The fast multiplier can be used if the output will 
+     * have less than MP_WARRAY digits and the number of 
+     * digits won't affect carry propagation
     */
    int     digs = a->used + b->used + 1;

-    if ((digs < MP_WARRAY)
-        && MIN(a->used, b->used) <= (1 << ((CHAR_BIT * sizeof (mp_word)) - (2 * DIGIT_BIT)))) {
+    if ((digs < MP_WARRAY) &&
+        MIN(a->used, b->used) <= 
+        (1 << ((CHAR_BIT * sizeof (mp_word)) - (2 * DIGIT_BIT)))) {
      res = fast_s_mp_mul_digs (a, b, c, digs);
    } else {
      res = s_mp_mul (a, b, c);
--- a/bn_mp_reduce.c
+++ b/bn_mp_reduce.c
@ -14,22 +14,8 @@
 */
 #include <tommath.h>

-/* pre-calculate the value required for Barrett reduction
- * For a given modulus "b" it calulates the value required in "a"
- */
-int
-mp_reduce_setup (mp_int * a, mp_int * b)
-{
-  int     res;
-  
-  if ((res = mp_2expt (a, b->used * 2 * DIGIT_BIT)) != MP_OKAY) {
-    return res;
-  }
-  res = mp_div (a, b, a, NULL);
-  return res;
-}
-
-/* reduces x mod m, assumes 0 < x < m^2, mu is precomputed via mp_reduce_setup
+/* reduces x mod m, assumes 0 < x < m**2, mu is 
+ * precomputed via mp_reduce_setup.
 * From HAC pp.604 Algorithm 14.42
 */
 int
@ -38,11 +24,12 @@ mp_reduce (mp_int * x, mp_int * m, mp_int * mu)
  mp_int  q;
  int     res, um = m->used;

+  /* q = x */
  if ((res = mp_init_copy (&q, x)) != MP_OKAY) {
    return res;
  }

-  /* q1 = x / b^(k-1)  */
+  /* q1 = x / b**(k-1)  */
  mp_rshd (&q, um - 1);         

  /* according to HAC this is optimization is ok */
@ -56,15 +43,15 @@ mp_reduce (mp_int * x, mp_int * m, mp_int * mu)
    }
  }

-  /* q3 = q2 / b^(k+1) */
+  /* q3 = q2 / b**(k+1) */
  mp_rshd (&q, um + 1);         

-  /* x = x mod b^(k+1), quick (no division) */
+  /* x = x mod b**(k+1), quick (no division) */
  if ((res = mp_mod_2d (x, DIGIT_BIT * (um + 1), x)) != MP_OKAY) {
    goto CLEANUP;
  }

-  /* q = q * m mod b^(k+1), quick (no division) */
+  /* q = q * m mod b**(k+1), quick (no division) */
  if ((res = s_mp_mul_digs (&q, m, &q, um + 1)) != MP_OKAY) {
    goto CLEANUP;
  }
@ -74,7 +61,7 @@ mp_reduce (mp_int * x, mp_int * m, mp_int * mu)
    goto CLEANUP;
  }

-  /* If x < 0, add b^(k+1) to it */
+  /* If x < 0, add b**(k+1) to it */
  if (mp_cmp_d (x, 0) == MP_LT) {
    mp_set (&q, 1);
    if ((res = mp_lshd (&q, um + 1)) != MP_OKAY)
@ -89,7 +76,7 @@ mp_reduce (mp_int * x, mp_int * m, mp_int * mu)
      break;
    }
  }
-
+  
 CLEANUP:
  mp_clear (&q);

--- a/bn_mp_reduce_2k.c
+++ b/bn_mp_reduce_2k.c
@ -0,0 +1,56 @@
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is library that provides for multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library is designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+#include <tommath.h>
+
+/* reduces a modulo n where n is of the form 2**p - k */
+int
+mp_reduce_2k(mp_int *a, mp_int *n, mp_digit k)
+{
+   mp_int q;
+   int    p, res;
+   
+   if ((res = mp_init(&q)) != MP_OKAY) {
+      return res;
+   }
+   
+   p = mp_count_bits(n);    
+top:
+   /* q = a/2**p, a = a mod 2**p */
+   if ((res = mp_div_2d(a, p, &q, a)) != MP_OKAY) {
+      goto ERR;
+   }
+   
+   if (k != 1) {
+      /* q = q * k */
+      if ((res = mp_mul_d(&q, k, &q)) != MP_OKAY) { 
+         goto ERR;
+      }
+   }
+   
+   /* a = a + q */
+   if ((res = s_mp_add(a, &q, a)) != MP_OKAY) {
+      goto ERR;
+   }
+   
+   if (mp_cmp_mag(a, n) != MP_LT) {
+      s_mp_sub(a, n, a);
+      goto top;
+   }
+   
+ERR:
+   mp_clear(&q);
+   return res;
+}
+
--- a/bn_mp_reduce_2k_setup.c
+++ b/bn_mp_reduce_2k_setup.c
@ -0,0 +1,42 @@
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is library that provides for multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library is designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+#include <tommath.h>
+
+/* determines the setup value */
+int 
+mp_reduce_2k_setup(mp_int *a, mp_digit *d)
+{
+   int res, p;
+   mp_int tmp;
+   
+   if ((res = mp_init(&tmp)) != MP_OKAY) {
+      return res;
+   }
+   
+   p = mp_count_bits(a);
+   if ((res = mp_2expt(&tmp, p)) != MP_OKAY) {
+      mp_clear(&tmp);
+      return res;
+   }
+   
+   if ((res = s_mp_sub(&tmp, a, &tmp)) != MP_OKAY) {
+      mp_clear(&tmp);
+      return res;
+   }
+   
+   *d = tmp.dp[0];
+   mp_clear(&tmp);
+   return MP_OKAY;
+}
--- a/bn_mp_reduce_is_2k.c
+++ b/bn_mp_reduce_is_2k.c
@ -0,0 +1,37 @@
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is library that provides for multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library is designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+#include <tommath.h>
+
+/* determines if mp_reduce_2k can be used */
+int 
+mp_reduce_is_2k(mp_int *a)
+{
+   int ix, iy;
+   
+   if (a->used == 0) {
+      return 0;
+   } else if (a->used == 1) {
+      return 1;
+   } else if (a->used > 1) {
+      iy = mp_count_bits(a);
+      for (ix = DIGIT_BIT; ix < iy; ix++) {
+          if ((a->dp[ix/DIGIT_BIT] & ((mp_digit)1 << (mp_digit)(ix % DIGIT_BIT))) == 0) {
+             return 0;
+          }
+      }
+   }
+   return 1;
+}
+
--- a/bn_mp_reduce_setup.c
+++ b/bn_mp_reduce_setup.c
@ -0,0 +1,29 @@
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is library that provides for multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library is designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+#include <tommath.h>
+
+/* pre-calculate the value required for Barrett reduction
+ * For a given modulus "b" it calulates the value required in "a"
+ */
+int
+mp_reduce_setup (mp_int * a, mp_int * b)
+{
+  int     res;
+  
+  if ((res = mp_2expt (a, b->used * 2 * DIGIT_BIT)) != MP_OKAY) {
+    return res;
+  }
+  return mp_div (a, b, a, NULL);
+}
--- a/bn_mp_rshd.c
+++ b/bn_mp_rshd.c
@ -32,15 +32,15 @@ mp_rshd (mp_int * a, int b)
  }

  {
-    register mp_digit *tmpa, *tmpaa;
+    register mp_digit *bottom, *top;

    /* shift the digits down */

-    /* base */
-    tmpa = a->dp;
+    /* bottom */
+    bottom = a->dp;

-    /* offset into digits */
-    tmpaa = a->dp + b;
+    /* top [offset into digits] */
+    top = a->dp + b;

    /* this is implemented as a sliding window where 
     * the window is b-digits long and digits from 
@ -53,13 +53,15 @@ mp_rshd (mp_int * a, int b)
                  \-------------------/      ---->
     */
    for (x = 0; x < (a->used - b); x++) {
-      *tmpa++ = *tmpaa++;
+      *bottom++ = *top++;
    }

    /* zero the top digits */
    for (; x < a->used; x++) {
-      *tmpa++ = 0;
+      *bottom++ = 0;
    }
  }
-  mp_clamp (a);
+  
+  /* remove excess digits */
+  a->used -= b;
 }
--- a/bn_mp_set_int.c
+++ b/bn_mp_set_int.c
@ -35,7 +35,7 @@ mp_set_int (mp_int * a, unsigned int b)
    b <<= 4;

    /* ensure that digits are not clamped off */
-    a->used += 32 / DIGIT_BIT + 2;
+    a->used += 1;
  }
  mp_clamp (a);
  return MP_OKAY;
--- a/bn_mp_sqr.c
+++ b/bn_mp_sqr.c
@ -19,12 +19,16 @@ int
 mp_sqr (mp_int * a, mp_int * b)
 {
  int     res;
-  if (a->used > KARATSUBA_SQR_CUTOFF) {
+  if (a->used >= TOOM_SQR_CUTOFF) {
+    res = mp_toom_sqr(a, b);
+  } else if (a->used >= KARATSUBA_SQR_CUTOFF) {
    res = mp_karatsuba_sqr (a, b);
  } else {

    /* can we use the fast multiplier? */
-    if ((a->used * 2 + 1) < 512 && a->used < (1 << (sizeof(mp_word) * CHAR_BIT - 2*DIGIT_BIT - 1))) {
+    if ((a->used * 2 + 1) < MP_WARRAY && 
+         a->used < 
+         (1 << (sizeof(mp_word) * CHAR_BIT - 2*DIGIT_BIT - 1))) {
      res = fast_s_mp_sqr (a, b);
    } else {
      res = s_mp_sqr (a, b);
--- a/bn_mp_toom_mul.c
+++ b/bn_mp_toom_mul.c
@ -0,0 +1,268 @@
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is library that provides for multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library is designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+#include <tommath.h>
+
+/* multiplication using Toom-Cook 3-way algorithm */
+int 
+mp_toom_mul(mp_int *a, mp_int *b, mp_int *c)
+{
+    mp_int w0, w1, w2, w3, w4, tmp1, tmp2, a0, a1, a2, b0, b1, b2;
+    int res, B;
+        
+    /* init temps */
+    if ((res = mp_init_multi(&w0, &w1, &w2, &w3, &w4, &a0, &a1, &a2, &b0, &b1, &b2, &tmp1, &tmp2, NULL)) != MP_OKAY) {
+       return res;
+    }
+    
+    /* B */
+    B = MIN(a->used, b->used) / 3;
+    
+    /* a = a2 * B^2 + a1 * B + a0 */
+    if ((res = mp_mod_2d(a, DIGIT_BIT * B, &a0)) != MP_OKAY) {
+       goto ERR;
+    }
+
+    if ((res = mp_copy(a, &a1)) != MP_OKAY) {
+       goto ERR;
+    }
+    mp_rshd(&a1, B);
+    mp_mod_2d(&a1, DIGIT_BIT * B, &a1);
+
+    if ((res = mp_copy(a, &a2)) != MP_OKAY) {
+       goto ERR;
+    }
+    mp_rshd(&a2, B*2);
+    
+    /* b = b2 * B^2 + b1 * B + b0 */
+    if ((res = mp_mod_2d(b, DIGIT_BIT * B, &b0)) != MP_OKAY) {
+       goto ERR;
+    }
+
+    if ((res = mp_copy(b, &b1)) != MP_OKAY) {
+       goto ERR;
+    }
+    mp_rshd(&b1, B);
+    mp_mod_2d(&b1, DIGIT_BIT * B, &b1);
+
+    if ((res = mp_copy(b, &b2)) != MP_OKAY) {
+       goto ERR;
+    }
+    mp_rshd(&b2, B*2);
+    
+    /* w0 = a0*b0 */
+    if ((res = mp_mul(&a0, &b0, &w0)) != MP_OKAY) {
+       goto ERR;
+    }
+    
+    /* w4 = a2 * b2 */
+    if ((res = mp_mul(&a2, &b2, &w4)) != MP_OKAY) {
+       goto ERR;
+    }
+    
+    /* w1 = (a2 + 2(a1 + 2a0))(b2 + 2(b1 + 2b0)) */
+    if ((res = mp_mul_2(&a0, &tmp1)) != MP_OKAY) {
+       goto ERR;
+    }
+    if ((res = mp_add(&tmp1, &a1, &tmp1)) != MP_OKAY) {
+       goto ERR;
+    }
+    if ((res = mp_mul_2(&tmp1, &tmp1)) != MP_OKAY) {
+       goto ERR;
+    }
+    if ((res = mp_add(&tmp1, &a2, &tmp1)) != MP_OKAY) {
+       goto ERR;
+    }
+    
+    if ((res = mp_mul_2(&b0, &tmp2)) != MP_OKAY) {
+       goto ERR;
+    }
+    if ((res = mp_add(&tmp2, &b1, &tmp2)) != MP_OKAY) {
+       goto ERR;
+    }
+    if ((res = mp_mul_2(&tmp2, &tmp2)) != MP_OKAY) {
+       goto ERR;
+    }
+    if ((res = mp_add(&tmp2, &b2, &tmp2)) != MP_OKAY) {
+       goto ERR;
+    }
+    
+    if ((res = mp_mul(&tmp1, &tmp2, &w1)) != MP_OKAY) {
+       goto ERR;
+    }
+    
+    /* w3 = (a0 + 2(a1 + 2a2))(b0 + 2(b1 + 2b2)) */
+    if ((res = mp_mul_2(&a2, &tmp1)) != MP_OKAY) {
+       goto ERR;
+    }
+    if ((res = mp_add(&tmp1, &a1, &tmp1)) != MP_OKAY) {
+       goto ERR;
+    }
+    if ((res = mp_mul_2(&tmp1, &tmp1)) != MP_OKAY) {
+       goto ERR;
+    }
+    if ((res = mp_add(&tmp1, &a0, &tmp1)) != MP_OKAY) {
+       goto ERR;
+    }
+    
+    if ((res = mp_mul_2(&b2, &tmp2)) != MP_OKAY) {
+       goto ERR;
+    }
+    if ((res = mp_add(&tmp2, &b1, &tmp2)) != MP_OKAY) {
+       goto ERR;
+    }
+    if ((res = mp_mul_2(&tmp2, &tmp2)) != MP_OKAY) {
+       goto ERR;
+    }
+    if ((res = mp_add(&tmp2, &b0, &tmp2)) != MP_OKAY) {
+       goto ERR;
+    }
+    
+    if ((res = mp_mul(&tmp1, &tmp2, &w3)) != MP_OKAY) {
+       goto ERR;
+    }
+    
+
+    /* w2 = (a2 + a1 + a0)(b2 + b1 + b0) */
+    if ((res = mp_add(&a2, &a1, &tmp1)) != MP_OKAY) {
+       goto ERR;
+    }
+    if ((res = mp_add(&tmp1, &a0, &tmp1)) != MP_OKAY) {
+       goto ERR;
+    }
+    if ((res = mp_add(&b2, &b1, &tmp2)) != MP_OKAY) {
+       goto ERR;
+    }
+    if ((res = mp_add(&tmp2, &b0, &tmp2)) != MP_OKAY) {
+       goto ERR;
+    }
+    if ((res = mp_mul(&tmp1, &tmp2, &w2)) != MP_OKAY) {
+       goto ERR;
+    }
+    
+    /* now solve the matrix 
+    
+       0  0  0  0  1
+       1  2  4  8  16
+       1  1  1  1  1
+       16 8  4  2  1
+       1  0  0  0  0
+       
+       using 12 subtractions, 4 shifts, 2 small divisions and 1 small multiplication 
+     */
+     
+     /* r1 - r4 */
+     if ((res = mp_sub(&w1, &w4, &w1)) != MP_OKAY) {
+        goto ERR;
+     }
+     /* r3 - r0 */
+     if ((res = mp_sub(&w3, &w0, &w3)) != MP_OKAY) {
+        goto ERR;
+     }
+     /* r1/2 */
+     if ((res = mp_div_2(&w1, &w1)) != MP_OKAY) {
+        goto ERR;
+     }
+     /* r3/2 */
+     if ((res = mp_div_2(&w3, &w3)) != MP_OKAY) {
+        goto ERR;
+     }
+     /* r2 - r0 - r4 */
+     if ((res = mp_sub(&w2, &w0, &w2)) != MP_OKAY) {
+        goto ERR;
+     }
+     if ((res = mp_sub(&w2, &w4, &w2)) != MP_OKAY) {
+        goto ERR;
+     }
+     /* r1 - r2 */
+     if ((res = mp_sub(&w1, &w2, &w1)) != MP_OKAY) {
+        goto ERR;
+     }
+     /* r3 - r2 */
+     if ((res = mp_sub(&w3, &w2, &w3)) != MP_OKAY) {
+        goto ERR;
+     }
+     /* r1 - 8r0 */
+     if ((res = mp_mul_2d(&w0, 3, &tmp1)) != MP_OKAY) {
+        goto ERR;
+     }
+     if ((res = mp_sub(&w1, &tmp1, &w1)) != MP_OKAY) {
+        goto ERR;
+     }
+     /* r3 - 8r4 */
+     if ((res = mp_mul_2d(&w4, 3, &tmp1)) != MP_OKAY) {
+        goto ERR;
+     }
+     if ((res = mp_sub(&w3, &tmp1, &w3)) != MP_OKAY) {
+        goto ERR;
+     }
+     /* 3r2 - r1 - r3 */
+     if ((res = mp_mul_d(&w2, 3, &w2)) != MP_OKAY) {
+        goto ERR;
+     }
+     if ((res = mp_sub(&w2, &w1, &w2)) != MP_OKAY) {
+        goto ERR;
+     }
+     if ((res = mp_sub(&w2, &w3, &w2)) != MP_OKAY) {
+        goto ERR;
+     }
+     /* r1 - r2 */
+     if ((res = mp_sub(&w1, &w2, &w1)) != MP_OKAY) {
+        goto ERR;
+     }
+     /* r3 - r2 */
+     if ((res = mp_sub(&w3, &w2, &w3)) != MP_OKAY) {
+        goto ERR;
+     }
+     /* r1/3 */
+     if ((res = mp_div_3(&w1, &w1, NULL)) != MP_OKAY) {
+        goto ERR;
+     }
+     /* r3/3 */
+     if ((res = mp_div_3(&w3, &w3, NULL)) != MP_OKAY) {
+        goto ERR;
+     }
+     
+     /* at this point shift W[n] by B*n */
+     if ((res = mp_lshd(&w1, 1*B)) != MP_OKAY) {
+        goto ERR;
+     }
+     if ((res = mp_lshd(&w2, 2*B)) != MP_OKAY) {
+        goto ERR;
+     }
+     if ((res = mp_lshd(&w3, 3*B)) != MP_OKAY) {
+        goto ERR;
+     }
+     if ((res = mp_lshd(&w4, 4*B)) != MP_OKAY) {
+        goto ERR;
+     }     
+     
+     if ((res = mp_add(&w0, &w1, c)) != MP_OKAY) {
+        goto ERR;
+     }
+     if ((res = mp_add(&w2, &w3, &tmp1)) != MP_OKAY) {
+        goto ERR;
+     }
+     if ((res = mp_add(&w4, &tmp1, &tmp1)) != MP_OKAY) {
+        goto ERR;
+     }
+     if ((res = mp_add(&tmp1, c, c)) != MP_OKAY) {
+        goto ERR;
+     }     
+     
+ERR:
+     mp_clear_multi(&w0, &w1, &w2, &w3, &w4, &a0, &a1, &a2, &b0, &b1, &b2, &tmp1, &tmp2, NULL);
+     return res;
+}     
+     
--- a/bn_mp_toom_sqr.c
+++ b/bn_mp_toom_sqr.c
@ -0,0 +1,220 @@
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is library that provides for multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library is designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+#include <tommath.h>
+
+/* squaring using Toom-Cook 3-way algorithm */
+int 
+mp_toom_sqr(mp_int *a, mp_int *b)
+{
+    mp_int w0, w1, w2, w3, w4, tmp1, a0, a1, a2;
+    int res, B;
+        
+    /* init temps */
+    if ((res = mp_init_multi(&w0, &w1, &w2, &w3, &w4, &a0, &a1, &a2, &tmp1, NULL)) != MP_OKAY) {
+       return res;
+    }
+
+    /* B */
+    B = a->used / 3;
+    
+    /* a = a2 * B^2 + a1 * B + a0 */
+    if ((res = mp_mod_2d(a, DIGIT_BIT * B, &a0)) != MP_OKAY) {
+       goto ERR;
+    }
+
+    if ((res = mp_copy(a, &a1)) != MP_OKAY) {
+       goto ERR;
+    }
+    mp_rshd(&a1, B);
+    mp_mod_2d(&a1, DIGIT_BIT * B, &a1);
+
+    if ((res = mp_copy(a, &a2)) != MP_OKAY) {
+       goto ERR;
+    }
+    mp_rshd(&a2, B*2);
+        
+    /* w0 = a0*a0 */
+    if ((res = mp_sqr(&a0, &w0)) != MP_OKAY) {
+       goto ERR;
+    }
+    
+    /* w4 = a2 * a2 */
+    if ((res = mp_sqr(&a2, &w4)) != MP_OKAY) {
+       goto ERR;
+    }
+    
+    /* w1 = (a2 + 2(a1 + 2a0))**2 */
+    if ((res = mp_mul_2(&a0, &tmp1)) != MP_OKAY) {
+       goto ERR;
+    }
+    if ((res = mp_add(&tmp1, &a1, &tmp1)) != MP_OKAY) {
+       goto ERR;
+    }
+    if ((res = mp_mul_2(&tmp1, &tmp1)) != MP_OKAY) {
+       goto ERR;
+    }
+    if ((res = mp_add(&tmp1, &a2, &tmp1)) != MP_OKAY) {
+       goto ERR;
+    }
+    
+    if ((res = mp_sqr(&tmp1, &w1)) != MP_OKAY) {
+       goto ERR;
+    }
+    
+    /* w3 = (a0 + 2(a1 + 2a2))**2 */
+    if ((res = mp_mul_2(&a2, &tmp1)) != MP_OKAY) {
+       goto ERR;
+    }
+    if ((res = mp_add(&tmp1, &a1, &tmp1)) != MP_OKAY) {
+       goto ERR;
+    }
+    if ((res = mp_mul_2(&tmp1, &tmp1)) != MP_OKAY) {
+       goto ERR;
+    }
+    if ((res = mp_add(&tmp1, &a0, &tmp1)) != MP_OKAY) {
+       goto ERR;
+    }
+    
+    if ((res = mp_sqr(&tmp1, &w3)) != MP_OKAY) {
+       goto ERR;
+    }
+    
+
+    /* w2 = (a2 + a1 + a0)**2 */
+    if ((res = mp_add(&a2, &a1, &tmp1)) != MP_OKAY) {
+       goto ERR;
+    }
+    if ((res = mp_add(&tmp1, &a0, &tmp1)) != MP_OKAY) {
+       goto ERR;
+    }
+    if ((res = mp_sqr(&tmp1, &w2)) != MP_OKAY) {
+       goto ERR;
+    }
+    
+    /* now solve the matrix 
+    
+       0  0  0  0  1
+       1  2  4  8  16
+       1  1  1  1  1
+       16 8  4  2  1
+       1  0  0  0  0
+       
+       using 12 subtractions, 4 shifts, 2 small divisions and 1 small multiplication.
+     */
+     
+     /* r1 - r4 */
+     if ((res = mp_sub(&w1, &w4, &w1)) != MP_OKAY) {
+        goto ERR;
+     }
+     /* r3 - r0 */
+     if ((res = mp_sub(&w3, &w0, &w3)) != MP_OKAY) {
+        goto ERR;
+     }
+     /* r1/2 */
+     if ((res = mp_div_2(&w1, &w1)) != MP_OKAY) {
+        goto ERR;
+     }
+     /* r3/2 */
+     if ((res = mp_div_2(&w3, &w3)) != MP_OKAY) {
+        goto ERR;
+     }
+     /* r2 - r0 - r4 */
+     if ((res = mp_sub(&w2, &w0, &w2)) != MP_OKAY) {
+        goto ERR;
+     }
+     if ((res = mp_sub(&w2, &w4, &w2)) != MP_OKAY) {
+        goto ERR;
+     }
+     /* r1 - r2 */
+     if ((res = mp_sub(&w1, &w2, &w1)) != MP_OKAY) {
+        goto ERR;
+     }
+     /* r3 - r2 */
+     if ((res = mp_sub(&w3, &w2, &w3)) != MP_OKAY) {
+        goto ERR;
+     }
+     /* r1 - 8r0 */
+     if ((res = mp_mul_2d(&w0, 3, &tmp1)) != MP_OKAY) {
+        goto ERR;
+     }
+     if ((res = mp_sub(&w1, &tmp1, &w1)) != MP_OKAY) {
+        goto ERR;
+     }
+     /* r3 - 8r4 */
+     if ((res = mp_mul_2d(&w4, 3, &tmp1)) != MP_OKAY) {
+        goto ERR;
+     }
+     if ((res = mp_sub(&w3, &tmp1, &w3)) != MP_OKAY) {
+        goto ERR;
+     }
+     /* 3r2 - r1 - r3 */
+     if ((res = mp_mul_d(&w2, 3, &w2)) != MP_OKAY) {
+        goto ERR;
+     }
+     if ((res = mp_sub(&w2, &w1, &w2)) != MP_OKAY) {
+        goto ERR;
+     }
+     if ((res = mp_sub(&w2, &w3, &w2)) != MP_OKAY) {
+        goto ERR;
+     }
+     /* r1 - r2 */
+     if ((res = mp_sub(&w1, &w2, &w1)) != MP_OKAY) {
+        goto ERR;
+     }
+     /* r3 - r2 */
+     if ((res = mp_sub(&w3, &w2, &w3)) != MP_OKAY) {
+        goto ERR;
+     }
+     /* r1/3 */
+     if ((res = mp_div_3(&w1, &w1, NULL)) != MP_OKAY) {
+        goto ERR;
+     }
+     /* r3/3 */
+     if ((res = mp_div_3(&w3, &w3, NULL)) != MP_OKAY) {
+        goto ERR;
+     }
+     
+     /* at this point shift W[n] by B*n */
+     if ((res = mp_lshd(&w1, 1*B)) != MP_OKAY) {
+        goto ERR;
+     }
+     if ((res = mp_lshd(&w2, 2*B)) != MP_OKAY) {
+        goto ERR;
+     }
+     if ((res = mp_lshd(&w3, 3*B)) != MP_OKAY) {
+        goto ERR;
+     }
+     if ((res = mp_lshd(&w4, 4*B)) != MP_OKAY) {
+        goto ERR;
+     }     
+     
+     if ((res = mp_add(&w0, &w1, b)) != MP_OKAY) {
+        goto ERR;
+     }
+     if ((res = mp_add(&w2, &w3, &tmp1)) != MP_OKAY) {
+        goto ERR;
+     }
+     if ((res = mp_add(&w4, &tmp1, &tmp1)) != MP_OKAY) {
+        goto ERR;
+     }
+     if ((res = mp_add(&tmp1, b, b)) != MP_OKAY) {
+        goto ERR;
+     }     
+     
+ERR:
+     mp_clear_multi(&w0, &w1, &w2, &w3, &w4, &a0, &a1, &a2, &tmp1, NULL);
+     return res;
+}     
+     
--- a/bn_s_mp_add.c
+++ b/bn_s_mp_add.c
@ -45,7 +45,6 @@ s_mp_add (mp_int * a, mp_int * b, mp_int * c)
  olduse = c->used;
  c->used = max + 1;

-  /* set the carry to zero */
  {
    register mp_digit u, *tmpa, *tmpb, *tmpc;
    register int i;
--- a/bn_s_mp_exptmod.c
+++ b/bn_s_mp_exptmod.c
@ -0,0 +1,211 @@
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is library that provides for multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library is designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+#include <tommath.h>
+
+int
+s_mp_exptmod (mp_int * G, mp_int * X, mp_int * P, mp_int * Y)
+{
+  mp_int  M[256], res, mu;
+  mp_digit buf;
+  int     err, bitbuf, bitcpy, bitcnt, mode, digidx, x, y, winsize;
+
+  /* find window size */
+  x = mp_count_bits (X);
+  if (x <= 7) {
+    winsize = 2;
+  } else if (x <= 36) {
+    winsize = 3;
+  } else if (x <= 140) {
+    winsize = 4;
+  } else if (x <= 450) {
+    winsize = 5;
+  } else if (x <= 1303) {
+    winsize = 6;
+  } else if (x <= 3529) {
+    winsize = 7;
+  } else {
+    winsize = 8;
+  }
+
+#ifdef MP_LOW_MEM
+    if (winsize > 5) {
+       winsize = 5;
+    }
+#endif
+
+  /* init M array */
+  for (x = 0; x < (1 << winsize); x++) {
+    if ((err = mp_init_size (&M[x], 1)) != MP_OKAY) {
+      for (y = 0; y < x; y++) {
+        mp_clear (&M[y]);
+      }
+      return err;
+    }
+  }
+
+  /* create mu, used for Barrett reduction */
+  if ((err = mp_init (&mu)) != MP_OKAY) {
+    goto __M;
+  }
+  if ((err = mp_reduce_setup (&mu, P)) != MP_OKAY) {
+    goto __MU;
+  }
+
+  /* create M table
+   *
+   * The M table contains powers of the input base, e.g. M[x] = G**x mod P
+   *
+   * The first half of the table is not computed though accept for M[0] and M[1]
+   */
+  if ((err = mp_mod (G, P, &M[1])) != MP_OKAY) {
+    goto __MU;
+  }
+
+  /* compute the value at M[1<<(winsize-1)] by squaring M[1] (winsize-1) times */
+  if ((err = mp_copy (&M[1], &M[1 << (winsize - 1)])) != MP_OKAY) {
+    goto __MU;
+  }
+
+  for (x = 0; x < (winsize - 1); x++) {
+    if ((err = mp_sqr (&M[1 << (winsize - 1)], &M[1 << (winsize - 1)])) != MP_OKAY) {
+      goto __MU;
+    }
+    if ((err = mp_reduce (&M[1 << (winsize - 1)], P, &mu)) != MP_OKAY) {
+      goto __MU;
+    }
+  }
+
+  /* create upper table */
+  for (x = (1 << (winsize - 1)) + 1; x < (1 << winsize); x++) {
+    if ((err = mp_mul (&M[x - 1], &M[1], &M[x])) != MP_OKAY) {
+      goto __MU;
+    }
+    if ((err = mp_reduce (&M[x], P, &mu)) != MP_OKAY) {
+      goto __MU;
+    }
+  }
+
+  /* setup result */
+  if ((err = mp_init (&res)) != MP_OKAY) {
+    goto __MU;
+  }
+  mp_set (&res, 1);
+
+  /* set initial mode and bit cnt */
+  mode   = 0;
+  bitcnt = 1;
+  buf    = 0;
+  digidx = X->used - 1;
+  bitcpy = 0;
+  bitbuf = 0;
+
+  for (;;) {
+    /* grab next digit as required */
+    if (--bitcnt == 0) {
+      if (digidx == -1) {
+        break;
+      }
+      buf = X->dp[digidx--];
+      bitcnt = (int) DIGIT_BIT;
+    }
+
+    /* grab the next msb from the exponent */
+    y = (buf >> (mp_digit)(DIGIT_BIT - 1)) & 1;
+    buf <<= (mp_digit)1;
+
+    /* if the bit is zero and mode == 0 then we ignore it
+     * These represent the leading zero bits before the first 1 bit
+     * in the exponent.  Technically this opt is not required but it
+     * does lower the # of trivial squaring/reductions used
+     */
+    if (mode == 0 && y == 0)
+      continue;
+
+    /* if the bit is zero and mode == 1 then we square */
+    if (mode == 1 && y == 0) {
+      if ((err = mp_sqr (&res, &res)) != MP_OKAY) {
+        goto __RES;
+      }
+      if ((err = mp_reduce (&res, P, &mu)) != MP_OKAY) {
+        goto __RES;
+      }
+      continue;
+    }
+
+    /* else we add it to the window */
+    bitbuf |= (y << (winsize - ++bitcpy));
+    mode = 2;
+
+    if (bitcpy == winsize) {
+      /* ok window is filled so square as required and multiply  */
+      /* square first */
+      for (x = 0; x < winsize; x++) {
+        if ((err = mp_sqr (&res, &res)) != MP_OKAY) {
+          goto __RES;
+        }
+        if ((err = mp_reduce (&res, P, &mu)) != MP_OKAY) {
+          goto __RES;
+        }
+      }
+
+      /* then multiply */
+      if ((err = mp_mul (&res, &M[bitbuf], &res)) != MP_OKAY) {
+        goto __MU;
+      }
+      if ((err = mp_reduce (&res, P, &mu)) != MP_OKAY) {
+        goto __MU;
+      }
+
+      /* empty window and reset */
+      bitcpy = 0;
+      bitbuf = 0;
+      mode = 1;
+    }
+  }
+
+  /* if bits remain then square/multiply */
+  if (mode == 2 && bitcpy > 0) {
+    /* square then multiply if the bit is set */
+    for (x = 0; x < bitcpy; x++) {
+      if ((err = mp_sqr (&res, &res)) != MP_OKAY) {
+        goto __RES;
+      }
+      if ((err = mp_reduce (&res, P, &mu)) != MP_OKAY) {
+        goto __RES;
+      }
+
+      bitbuf <<= 1;
+      if ((bitbuf & (1 << winsize)) != 0) {
+        /* then multiply */
+        if ((err = mp_mul (&res, &M[1], &res)) != MP_OKAY) {
+          goto __RES;
+        }
+        if ((err = mp_reduce (&res, P, &mu)) != MP_OKAY) {
+          goto __RES;
+        }
+      }
+    }
+  }
+
+  mp_exch (&res, Y);
+  err = MP_OKAY;
+__RES:mp_clear (&res);
+__MU:mp_clear (&mu);
+__M:
+  for (x = 0; x < (1 << winsize); x++) {
+    mp_clear (&M[x]);
+  }
+  return err;
+}
--- a/bn_s_mp_sqr.c
+++ b/bn_s_mp_sqr.c
@ -20,8 +20,8 @@ s_mp_sqr (mp_int * a, mp_int * b)
 {
  mp_int  t;
  int     res, ix, iy, pa;
-  mp_word r, u;
-  mp_digit tmpx, *tmpt;
+  mp_word r;
+  mp_digit u, tmpx, *tmpt;

  pa = a->used;
  if ((res = mp_init_size (&t, pa + pa + 1)) != MP_OKAY) {
@ -32,7 +32,8 @@ s_mp_sqr (mp_int * a, mp_int * b)
  for (ix = 0; ix < pa; ix++) {
    /* first calculate the digit at 2*ix */
    /* calculate double precision result */
-    r = ((mp_word) t.dp[ix + ix]) + ((mp_word) a->dp[ix]) * ((mp_word) a->dp[ix]);
+    r = ((mp_word) t.dp[ix + ix]) + 
+        ((mp_word) a->dp[ix]) * ((mp_word) a->dp[ix]);

    /* store lower part in result */
    t.dp[ix + ix] = (mp_digit) (r & ((mp_word) MP_MASK));
@ -44,7 +45,8 @@ s_mp_sqr (mp_int * a, mp_int * b)
    tmpx = a->dp[ix];

    /* alias for where to store the results */
-    tmpt = &(t.dp[ix + ix + 1]);
+    tmpt = t.dp + (ix + ix + 1);
+    
    for (iy = ix + 1; iy < pa; iy++) {
      /* first calculate the product */
      r = ((mp_word) tmpx) * ((mp_word) a->dp[iy]);
@ -60,13 +62,9 @@ s_mp_sqr (mp_int * a, mp_int * b)
      /* get carry */
      u = (r >> ((mp_word) DIGIT_BIT));
    }
-    r = ((mp_word) * tmpt) + u;
-    *tmpt = (mp_digit) (r & ((mp_word) MP_MASK));
-    u = (r >> ((mp_word) DIGIT_BIT));
    /* propagate upwards */
-    ++tmpt;
-    while (u != ((mp_word) 0)) {
-      r = ((mp_word) * tmpt) + ((mp_word) 1);
+    while (u != ((mp_digit) 0)) {
+      r = ((mp_word) * tmpt) + ((mp_word) u);
      *tmpt++ = (mp_digit) (r & ((mp_word) MP_MASK));
      u = (r >> ((mp_word) DIGIT_BIT));
    }
--- a/bn_s_mp_sub.c
+++ b/bn_s_mp_sub.c
@ -33,7 +33,6 @@ s_mp_sub (mp_int * a, mp_int * b, mp_int * c)
  olduse = c->used;
  c->used = max;

-  /* sub digits from lower part */
  {
    register mp_digit u, *tmpa, *tmpb, *tmpc;
    register int i;
@ -52,7 +51,7 @@ s_mp_sub (mp_int * a, mp_int * b, mp_int * c)
      /* U = carry bit of T[i]
       * Note this saves performing an AND operation since
       * if a carry does occur it will propagate all the way to the
-       * MSB.  As a result a single shift is required to get the carry
+       * MSB.  As a result a single shift is enough to get the carry
       */
      u = *tmpc >> ((mp_digit)(CHAR_BIT * sizeof (mp_digit) - 1));

@ -81,3 +80,4 @@ s_mp_sub (mp_int * a, mp_int * b, mp_int * c)
  mp_clamp (c);
  return MP_OKAY;
 }
+
--- a/bncore.c
+++ b/bncore.c
@ -18,11 +18,14 @@

 CPU                    /Compiler     /MUL CUTOFF/SQR CUTOFF
 -------------------------------------------------------------
- Intel P4               /GCC v3.2     /        81/       110
+ Intel P4               /GCC v3.2     /        70/       108
 AMD Athlon XP          /GCC v3.2     /       109/       127

 */

 /* configured for a AMD XP Thoroughbred core with etc/tune.c */
 int     KARATSUBA_MUL_CUTOFF = 109,      /* Min. number of digits before Karatsuba multiplication is used. */
-        KARATSUBA_SQR_CUTOFF = 127;      /* Min. number of digits before Karatsuba squaring is used. */
+        KARATSUBA_SQR_CUTOFF = 127,      /* Min. number of digits before Karatsuba squaring is used. */
+        
+        TOOM_MUL_CUTOFF      = 350,      /* no optimal values of these are known yet so set em high */
+        TOOM_SQR_CUTOFF      = 400; 
--- a/changes.txt
+++ b/changes.txt
@ -1,3 +1,15 @@
+May 29th, 2003
+v0.18  -- Fixed a bug in s_mp_sqr which would handle carries properly just not very elegantly.
+          (e.g. correct result, just bad looking code)
+       -- Fixed bug in mp_sqr which still had a 512 constant instead of MP_WARRAY
+       -- Added Toom-Cook multipliers [needs tuning!]
+       -- Added efficient divide by 3 algorithm mp_div_3
+       -- Re-wrote mp_div_d to be faster than calling mp_div 
+       -- Added in a donated BCC makefile and a single page LTM poster (ahalhabsi@sbcglobal.net)
+       -- Added mp_reduce_2k which reduces an input modulo n = 2**p - k for any single digit k
+       -- Made the exptmod system be aware of the 2k reduction algorithms.
+       -- Rewrote mp_dr_reduce to be smaller, simpler and easier to understand.
+
 May 17th, 2003
 v0.17  -- Benjamin Goldberg submitted optimized mp_add and mp_sub routines.  A new gen.pl as well
          as several smaller suggestions.  Thanks!
--- a/demo/demo.c
+++ b/demo/demo.c
@ -53,7 +53,7 @@ int main(void)
 #ifdef TIMER
   int n;
   ulong64 tt;
-   FILE *log, *logb;
+   FILE *log, *logb, *logc;
 #endif

   mp_init(&a);
@ -62,11 +62,54 @@ int main(void)
   mp_init(&d);
   mp_init(&e);
   mp_init(&f);   
+   
+   srand(time(NULL));
+/* test mp_reduce_2k */
+#if 0
+   for (cnt = 3; cnt <= 4096; ++cnt) {
+       mp_digit tmp;
+       mp_2expt(&a, cnt);
+       mp_sub_d(&a, 1, &a);  /* a = 2**cnt - 1 */
+       
+       
+       printf("\nTesting %4d bits", cnt);
+       printf("(%d)", mp_reduce_is_2k(&a));
+       mp_reduce_2k_setup(&a, &tmp);
+       printf("(%d)", tmp);
+       for (ix = 0; ix < 100000; ix++) {
+           if (!(ix & 1023)) {printf("."); fflush(stdout); }
+           mp_rand(&b, (cnt/DIGIT_BIT  + 1) * 2);
+           mp_copy(&c, &b);
+           mp_mod(&c, &a, &c);
+           mp_reduce_2k(&b, &a, 1);
+           if (mp_cmp(&c, &b)) {
+              printf("FAILED\n");
+              exit(0);
+           }
+        }
+    }
+#endif
+
+           
+/* test mp_div_3  */
+#if 0
+   for (cnt = 0; cnt < 1000000; ) {
+      mp_digit r1, r2;
+      
+      if (!(++cnt & 127)) printf("%9d\r", cnt);
+      mp_rand(&a, abs(rand()) % 32 + 1);
+      mp_div_d(&a, 3, &b, &r1);
+      mp_div_3(&a, &c, &r2);
+      
+      if (mp_cmp(&b, &c) || r1 != r2) {
+         printf("Failure\n");
+      }
+   }
+#endif     

 /* test the DR reduction */
 #if 0

-   srand(time(NULL));
   for (cnt = 2; cnt < 32; cnt++) {
       printf("%d digit modulus\n", cnt);
       mp_grow(&a, cnt);
@ -91,6 +134,7 @@ int main(void)

         if (mp_cmp(&b, &c) != MP_EQ) {
            printf("Failed on trial %lu\n", rr); exit(-1);
+                       
         }
      } while (++rr < 1000000);
      printf("Passed DR test for %d digits\n", cnt);
@ -98,6 +142,9 @@ int main(void)
 #endif

 #ifdef TIMER
+      /* temp. turn off TOOM */
+      TOOM_MUL_CUTOFF = TOOM_SQR_CUTOFF = 100000;
+          
      printf("CLOCKS_PER_SEC == %lu\n", CLOCKS_PER_SEC);

      log = fopen("logs/add.log", "w");
@ -172,9 +219,16 @@ int main(void)
      }
      fclose(log);
   }
-
-   {
+  {
      char *primes[] = {
+         /* 2K moduli mersenne primes */
+         "6864797660130609714981900799081393217269435300143305409394463459185543183397656052122559640661454554977296311391480858037121987999716643812574028291115057151",
+         "531137992816767098689588206552468627329593117727031923199444138200403559860852242739162502265229285668889329486246501015346579337652707239409519978766587351943831270835393219031728127",
+         "10407932194664399081925240327364085538615262247266704805319112350403608059673360298012239441732324184842421613954281007791383566248323464908139906605677320762924129509389220345773183349661583550472959420547689811211693677147548478866962501384438260291732348885311160828538416585028255604666224831890918801847068222203140521026698435488732958028878050869736186900714720710555703168729087",
+         "1475979915214180235084898622737381736312066145333169775147771216478570297878078949377407337049389289382748507531496480477281264838760259191814463365330269540496961201113430156902396093989090226259326935025281409614983499388222831448598601834318536230923772641390209490231836446899608210795482963763094236630945410832793769905399982457186322944729636418890623372171723742105636440368218459649632948538696905872650486914434637457507280441823676813517852099348660847172579408422316678097670224011990280170474894487426924742108823536808485072502240519452587542875349976558572670229633962575212637477897785501552646522609988869914013540483809865681250419497686697771007",
+         "259117086013202627776246767922441530941818887553125427303974923161874019266586362086201209516800483406550695241733194177441689509238807017410377709597512042313066624082916353517952311186154862265604547691127595848775610568757931191017711408826252153849035830401185072116424747461823031471398340229288074545677907941037288235820705892351068433882986888616658650280927692080339605869308790500409503709875902119018371991620994002568935113136548829739112656797303241986517250116412703509705427773477972349821676443446668383119322540099648994051790241624056519054483690809616061625743042361721863339415852426431208737266591962061753535748892894599629195183082621860853400937932839420261866586142503251450773096274235376822938649407127700846077124211823080804139298087057504713825264571448379371125032081826126566649084251699453951887789613650248405739378594599444335231188280123660406262468609212150349937584782292237144339628858485938215738821232393687046160677362909315071",
+         "190797007524439073807468042969529173669356994749940177394741882673528979787005053706368049835514900244303495954950709725762186311224148828811920216904542206960744666169364221195289538436845390250168663932838805192055137154390912666527533007309292687539092257043362517857366624699975402375462954490293259233303137330643531556539739921926201438606439020075174723029056838272505051571967594608350063404495977660656269020823960825567012344189908927956646011998057988548630107637380993519826582389781888135705408653045219655801758081251164080554609057468028203308718724654081055323215860189611391296030471108443146745671967766308925858547271507311563765171008318248647110097614890313562856541784154881743146033909602737947385055355960331855614540900081456378659068370317267696980001187750995491090350108417050917991562167972281070161305972518044872048331306383715094854938415738549894606070722584737978176686422134354526989443028353644037187375385397838259511833166416134323695660367676897722287918773420968982326089026150031515424165462111337527431154890666327374921446276833564519776797633875503548665093914556482031482248883127023777039667707976559857333357013727342079099064400455741830654320379350833236245819348824064783585692924881021978332974949906122664421376034687815350484991",
+         
         /* DR moduli */
         "14059105607947488696282932836518693308967803494693489478439861164411992439598399594747002144074658928593502845729752797260025831423419686528151609940203368612079",
         "101745825697019260773923519755878567461315282017759829107608914364075275235254395622580447400994175578963163918967182013639660669771108475957692810857098847138903161308502419410142185759152435680068435915159402496058513611411688900243039",
@ -196,6 +250,7 @@ int main(void)
      };
   log = fopen("logs/expt.log", "w");
   logb = fopen("logs/expt_dr.log", "w");
+   logc = fopen("logs/expt_2k.log", "w");
   for (n = 0; primes[n]; n++) {
      mp_read_radix(&a, primes[n], 10);
      mp_zero(&b);
@ -224,11 +279,12 @@ int main(void)
         exit(0);
      }
      printf("Exponentiating\t%4d-bit => %9llu/sec, %9llu ticks\n", mp_count_bits(&a), (((unsigned long long)rr)*CLOCKS_PER_SEC)/tt, tt);
-      fprintf((n < 7) ? logb : log, "%d %9llu\n", mp_count_bits(&a), (((unsigned long long)rr)*CLOCKS_PER_SEC)/tt);
+      fprintf((n < 6) ? logc : (n < 13) ? logb : log, "%d %9llu\n", mp_count_bits(&a), (((unsigned long long)rr)*CLOCKS_PER_SEC)/tt);
   }
   }
   fclose(log);
   fclose(logb);
+   fclose(logc);

   log = fopen("logs/invmod.log", "w");
   for (cnt = 4; cnt <= 128; cnt += 4) {
@ -263,6 +319,12 @@ int main(void)

   div2_n = mul2_n = inv_n = expt_n = lcm_n = gcd_n = add_n =
   sub_n = mul_n = div_n = sqr_n = mul2d_n = div2d_n = cnt = 0;
+   
+   /* force KARA and TOOM to enable despite cutoffs */
+   KARATSUBA_SQR_CUTOFF = KARATSUBA_MUL_CUTOFF = 110;
+   TOOM_SQR_CUTOFF      = TOOM_MUL_CUTOFF      = 150;
+   
+   

   for (;;) {
       /* randomly clear and re-init one variable, this has the affect of triming the alloc space */
--- a/etc/2kprime.1
+++ b/etc/2kprime.1
@ -0,0 +1,2 @@
+256-bits (k = 36113) = 115792089237316195423570985008687907853269984665640564039457584007913129603823
+512-bits (k = 38117) = 13407807929942597099574024998205846127479365820592393377723561443721764030073546976801874298166903427690031858186486050853753882811946569946433649006045979
--- a/etc/2kprime.c
+++ b/etc/2kprime.c
@ -0,0 +1,80 @@
+/* Makes safe primes of a 2k nature */
+#include <tommath.h>
+#include <time.h>
+
+int sizes[] = {256, 512, 768, 1024, 1536, 2048, 3072, 4096};
+
+int main(void)
+{
+   char buf[2000];
+   int x, y, t;
+   mp_int q, p;
+   FILE *out;
+   clock_t t1;
+   mp_digit z;
+   
+   mp_init_multi(&q, &p, NULL);
+   
+   out = fopen("2kprime.1", "w");
+   for (x = 0; x < (int)(sizeof(sizes) / sizeof(sizes[0])); x++) {
+   top:
+       mp_2expt(&q, sizes[x]);
+       mp_add_d(&q, 3, &q);
+       z = -3;
+       
+       t1 = clock();
+       for(;;) {
+         mp_sub_d(&q, 4, &q);
+         z += 4;
+
+         if (z > MP_MASK) {
+            printf("No primes of size %d found\n", sizes[x]);
+            break;
+         }
+         
+         if (clock() - t1 > CLOCKS_PER_SEC) { 
+            printf("."); fflush(stdout);
+//            sleep((clock() - t1 + CLOCKS_PER_SEC/2)/CLOCKS_PER_SEC);
+            t1 = clock();
+         }
+         
+         /* quick test on q */
+         mp_prime_is_prime(&q, 1, &y);
+         if (y == 0) {
+            continue;
+         }
+
+         /* find (q-1)/2 */
+         mp_sub_d(&q, 1, &p);
+         mp_div_2(&p, &p);
+         mp_prime_is_prime(&p, 3, &y);
+         if (y == 0) {
+            continue;
+         }
+
+         /* test on q */
+         mp_prime_is_prime(&q, 3, &y);
+         if (y == 0) {
+            continue;
+         }
+
+         break;
+       }
+       
+       if (y == 0) {
+          ++sizes[x];
+          goto top;
+       }
+       
+       mp_toradix(&q, buf, 10);
+       printf("\n\n%d-bits (k = %lu) = %s\n", sizes[x], z, buf);
+       fprintf(out, "%d-bits (k = %lu) = %s\n", sizes[x], z, buf); fflush(out);
+   }
+   
+   return 0;
+}   
+       
+         
+            
+            
+          
--- a/etc/makefile
+++ b/etc/makefile
@ -32,9 +32,13 @@ mersenne: mersenne.o
 drprime: drprime.o
 	$(CC) drprime.o $(LIBNAME) -o drprime
 	
+# fines 2k safe primes for the given config
+2kprime: 2kprime.o
+	$(CC) 2kprime.o $(LIBNAME) -o 2kprime
+
 mont: mont.o
 	$(CC) mont.o $(LIBNAME) -o mont

        
 clean:
-	rm -f *.log *.o *.obj *.exe pprime tune mersenne drprime tune86 tune86l mont
+	rm -f *.log *.o *.obj *.exe pprime tune mersenne drprime tune86 tune86l mont 2kprime
--- a/etc/makefile.msvc
+++ b/etc/makefile.msvc
@ -14,4 +14,7 @@ tune: tune.obj
 	cl tune.obj ../tommath.lib
 	
 drprime: drprime.obj
-	cl drprime.obj ../tommath.lib
+	cl drprime.obj ../tommath.lib
+
+2kprime: 2kprime.obj
+	cl 2kprime.obj ../tommath.lib
--- a/etc/mersenne.c
+++ b/etc/mersenne.c
@ -8,10 +8,9 @@
 int
 is_mersenne (long s, int *pp)
 {
-  mp_int  n, u, mu;
+  mp_int  n, u;
  int     res, k;
-  long    ss;
-
+  
  *pp = 0;

  if ((res = mp_init (&n)) != MP_OKAY) {
@ -22,27 +21,14 @@ is_mersenne (long s, int *pp)
    goto __N;
  }

-  if ((res = mp_init (&mu)) != MP_OKAY) {
-    goto __U;
-  }
-
  /* n = 2^s - 1 */
-  mp_set (&n, 1);
-  ss = s;
-  while (ss--) {
-    if ((res = mp_mul_2 (&n, &n)) != MP_OKAY) {
-      goto __MU;
-    }
+  if ((res = mp_2expt(&n, s)) != MP_OKAY) {
+     goto __MU;
  }
  if ((res = mp_sub_d (&n, 1, &n)) != MP_OKAY) {
    goto __MU;
  }

-  /* setup mu */
-  if ((res = mp_reduce_setup (&mu, &n)) != MP_OKAY) {
-    goto __MU;
-  }
-
  /* set u=4 */
  mp_set (&u, 4);

@ -57,26 +43,26 @@ is_mersenne (long s, int *pp)
    }

    /* make sure u is positive */
-    if (u.sign == MP_NEG) {
+    while (u.sign == MP_NEG) {
      if ((res = mp_add (&u, &n, &u)) != MP_OKAY) {
-	goto __MU;
+         goto __MU;
      }
    }

    /* reduce */
-    if ((res = mp_reduce (&u, &n, &mu)) != MP_OKAY) {
+    if ((res = mp_reduce_2k (&u, &n, 1)) != MP_OKAY) {
      goto __MU;
    }
  }

  /* if u == 0 then its prime */
  if (mp_iszero (&u) == 1) {
-    *pp = 1;
+    mp_prime_is_prime(&n, 3, pp);
+  if (*pp != 1) printf("FAILURE\n");
  }

  res = MP_OKAY;
-__MU:mp_clear (&mu);
-__U:mp_clear (&u);
+__MU:mp_clear (&u);
 __N:mp_clear (&n);
  return res;
 }
--- a/etc/mont.c
+++ b/etc/mont.c
@ -7,10 +7,11 @@ int main(void)
   mp_digit mp;
   long x, y;

+   srand(time(NULL));
   mp_init_multi(&modulus, &R, &p, &pp, NULL);

   /* loop through various sizes */
-   for (x = 4; x < 128; x++) {
+   for (x = 4; x < 256; x++) {
       printf("DIGITS == %3ld...", x); fflush(stdout);
       
       /* make up the odd modulus */
@ -22,7 +23,7 @@ int main(void)
       mp_montgomery_setup(&modulus, &mp);
       
       /* now run through a bunch tests */
-       for (y = 0; y < 100000; y++) {
+       for (y = 0; y < 1000; y++) {
           mp_rand(&p, x/2);        /* p = random */
           mp_mul(&p, &R, &pp);     /* pp = R * p */
           mp_montgomery_reduce(&pp, &modulus, mp);
--- a/etc/tune.c
+++ b/etc/tune.c
@ -8,17 +8,17 @@
 #ifndef X86_TIMER

 /* generic ISO C timer */
-unsigned long long __T;
+ulong64 __T;
 void t_start(void) { __T = clock(); }
-unsigned long long t_read(void) { return clock() - __T; }
+ulong64 t_read(void) { return clock() - __T; }

 #else
 extern void t_start(void);
-extern unsigned long long t_read(void);
+extern ulong64 t_read(void);
 #endif

-unsigned long long
-time_mult (void)
+ulong64
+time_mult (int max)
 {
  int     x, y;
  mp_int  a, b, c;
@ -28,7 +28,7 @@ time_mult (void)
  mp_init (&c);

  t_start();
-  for (x = 32; x <= 288; x += 4) {
+  for (x = 32; x <= max; x += 4) {
    mp_rand (&a, x);
    mp_rand (&b, x);
    for (y = 0; y < 100; y++) {
@ -41,8 +41,8 @@ time_mult (void)
  return t_read();
 }

-unsigned long long
-time_sqr (void)
+ulong64
+time_sqr (int max)
 {
  int     x, y;
  mp_int  a, b;
@ -51,7 +51,7 @@ time_sqr (void)
  mp_init (&b);

  t_start();
-  for (x = 32; x <= 288; x += 4) {
+  for (x = 32; x <= max; x += 4) {
    mp_rand (&a, x);
    for (y = 0; y < 100; y++) {
      mp_sqr (&a, &b);
@ -65,45 +65,85 @@ time_sqr (void)
 int
 main (void)
 {
-  int     best_mult, best_square;
-  unsigned long long best, ti;
+  int     best_kmult, best_tmult, best_ksquare, best_tsquare;
+  ulong64 best, ti;
  FILE   *log;

-  best_mult = best_square = 0;
+  best_kmult = best_ksquare = best_tmult = best_tsquare = 0;
  /* tune multiplication first */
+  
+  /* effectively turn TOOM off */
+  TOOM_SQR_CUTOFF = TOOM_MUL_CUTOFF = 100000;
+    
  log = fopen ("mult.log", "w");
  best = -1;
  for (KARATSUBA_MUL_CUTOFF = 8; KARATSUBA_MUL_CUTOFF <= 200; KARATSUBA_MUL_CUTOFF++) {
-    ti = time_mult ();
+    ti = time_mult (300);
    printf ("%4d : %9llu\r", KARATSUBA_MUL_CUTOFF, ti);
    fprintf (log, "%d, %llu\n", KARATSUBA_MUL_CUTOFF, ti);
    fflush (stdout);
    if (ti < best) {
      printf ("New best: %llu, %d         \n", ti, KARATSUBA_MUL_CUTOFF);
      best = ti;
-      best_mult = KARATSUBA_MUL_CUTOFF;
+      best_kmult = KARATSUBA_MUL_CUTOFF;
    }
  }
  fclose (log);
+  
  /* tune squaring */
  log = fopen ("sqr.log", "w");
  best = -1;
  for (KARATSUBA_SQR_CUTOFF = 8; KARATSUBA_SQR_CUTOFF <= 200; KARATSUBA_SQR_CUTOFF++) {
-    ti = time_sqr ();
+    ti = time_sqr (300);
    printf ("%4d : %9llu\r", KARATSUBA_SQR_CUTOFF, ti);
    fprintf (log, "%d, %llu\n", KARATSUBA_SQR_CUTOFF, ti);
    fflush (stdout);
    if (ti < best) {
      printf ("New best: %llu, %d         \n", ti, KARATSUBA_SQR_CUTOFF);
      best = ti;
-      best_square = KARATSUBA_SQR_CUTOFF;
+      best_ksquare = KARATSUBA_SQR_CUTOFF;
    }
  }
  fclose (log);
+  
+  KARATSUBA_MUL_CUTOFF = best_kmult;
+  KARATSUBA_SQR_CUTOFF = best_ksquare;
+    
+  /* tune TOOM mult */
+  log = fopen ("tmult.log", "w");
+  best = -1;
+  for (TOOM_MUL_CUTOFF = best_kmult*5; TOOM_MUL_CUTOFF <= 800; TOOM_MUL_CUTOFF++) {
+    ti = time_mult (1200);
+    printf ("%4d : %9llu\r", TOOM_MUL_CUTOFF, ti);
+    fprintf (log, "%d, %llu\n", TOOM_MUL_CUTOFF, ti);
+    fflush (stdout);
+    if (ti < best) {
+      printf ("New best: %llu, %d         \n", ti, TOOM_MUL_CUTOFF);
+      best = ti;
+      best_tmult = TOOM_MUL_CUTOFF;
+    }
+  }
+  fclose (log);   
+  
+  /* tune TOOM sqr */
+  log = fopen ("tsqr.log", "w");
+  best = -1;
+  for (TOOM_SQR_CUTOFF = best_ksquare*3; TOOM_SQR_CUTOFF <= 800; TOOM_SQR_CUTOFF++) {
+    ti = time_sqr (1200);
+    printf ("%4d : %9llu\r", TOOM_SQR_CUTOFF, ti);
+    fprintf (log, "%d, %llu\n", TOOM_SQR_CUTOFF, ti);
+    fflush (stdout);
+    if (ti < best) {
+      printf ("New best: %llu, %d         \n", ti, TOOM_SQR_CUTOFF);
+      best = ti;
+      best_tsquare = TOOM_SQR_CUTOFF;
+    }
+  }
+  fclose (log);   

  printf
-    ("\n\n\nKaratsuba Multiplier Cutoff: %d\nKaratsuba Squaring Cutoff: %d\n",
-     best_mult, best_square);
+    ("\n\n\nKaratsuba Multiplier Cutoff: %d\nKaratsuba Squaring Cutoff: %d\nToom Multiplier Cutoff: %d\nToom Squaring Cutoff: %d\n",
+     best_kmult, best_ksquare, best_tmult, best_tsquare);

  return 0;
 }
--- a/gen.pl
+++ b/gen.pl
@ -6,7 +6,7 @@
 use strict;

 open( OUT, ">mpi.c" ) or die "Couldn't open mpi.c for writing: $!";
-foreach my $filename (glob "bn_*.c") {
+foreach my $filename (glob "bn*.c") {
   open( SRC, "<$filename" ) or die "Couldn't open $filename for reading: $!";
   print OUT "/* Start: $filename */\n";
   print OUT qq[#line 0 "$filename"\n];
@ -14,5 +14,5 @@ foreach my $filename (glob "bn_*.c") {
   print OUT "\n/* End: $filename */\n\n";
   close SRC or die "Error closing $filename after reading: $!";
 }
-print OUT "\b/* EOF */\n";
+print OUT "\n/* EOF */\n";
 close OUT or die "Error closing mpi.c after writing: $!";
--- a/logs/add.log
+++ b/logs/add.log
@ -1,16 +1,16 @@
-224  11039864
-448   9206336
-672   8178200
-896   7432176
-1120   6433264
-1344   5847056
-1568   5270184
-1792   4943416
-2016   4520016
-2240   4256168
-2464   3999224
-2688   3714896
-2912   3572720
-3136   3340176
-3360   3222584
-3584   3036336
+224  11069160
+448   9156136
+672   8089755
+896   7399424
+1120   6389352
+1344   5818648
+1568   5257112
+1792   4982160
+2016   4527856
+2240   4325312
+2464   4051760
+2688   3767640
+2912   3612520
+3136   3415208
+3360   3258656
+3584   3113360
--- a/logs/addsub.png
+++ b/logs/addsub.png
--- a/logs/expt.log
+++ b/logs/expt.log
@ -1,7 +1,7 @@
-14364       666
-21532       253
-28700       117
-57372        17
-71708         9
-86044         5
-114716         2
+513       680
+769       257
+1025       117
+2049        17
+2561         9
+3073         5
+4097         2
--- a/logs/expt.png
+++ b/logs/expt.png
--- a/logs/expt_2k.log
+++ b/logs/expt_2k.log
@ -0,0 +1,6 @@
+521       736
+607       552
+1279       112
+2203        33
+3217        13
+4253         6
--- a/logs/expt_dr.log
+++ b/logs/expt_dr.log
@ -1,7 +1,7 @@
-14896      1088
-21952       468
-29008       244
-43120        91
-58016        43
-86240        15
-115248         6
+532      1064
+784       460
+1036       240
+1540        91
+2072        43
+3080        15
+4116         6
--- a/logs/graphs.dem
+++ b/logs/graphs.dem
@ -1,5 +1,5 @@
 set terminal png color
-set size 1.5
+set size 1.75
 set ylabel "Operations per Second"
 set xlabel "Operand size (bits)"

@ -10,7 +10,7 @@ set output "mult.png"
 plot 'sqr.log' smooth bezier title "Squaring (without Karatsuba)", 'sqr_kara.log' smooth bezier title "Squaring (Karatsuba)", 'mult.log' smooth bezier title "Multiplication (without Karatsuba)", 'mult_kara.log' smooth bezier title "Multiplication (Karatsuba)"

 set output "expt.png"
-plot 'expt.log' smooth bezier title "Exptmod (Montgomery)", 'expt_dr.log' smooth bezier title "Exptmod (Dimminished Radix)"
+plot 'expt.log' smooth bezier title "Exptmod (Montgomery)", 'expt_dr.log' smooth bezier title "Exptmod (Dimminished Radix)", 'expt_2k.log' smooth bezier title "Exptmod (2k Reduction)"

 set output "invmod.png"
 plot 'invmod.log' smooth bezier title "Modular Inverse"
--- a/logs/invmod.log
+++ b/logs/invmod.log
@ -1,32 +1,32 @@
-112     15608
-224      7840
-336      5104
-448      3376
-560      2616
-672      1984
-784      1640
-896      2056
-1008      1136
-1120       936
-1232      1240
-1344      1112
-1456       608
-1568       873
-1680       492
-1792       444
-1904       640
-2016       584
-2128       328
-2240       307
-2352       283
-2464       256
-2576       393
-2688       365
-2800       344
-2912       196
-3024       301
-3136       170
-3248       160
-3360       250
-3472       144
-3584       224
+112     16248
+224      8192
+336      5320
+448      3560
+560      2728
+672      2064
+784      1704
+896      2176
+1008      1184
+1120       976
+1232      1280
+1344      1176
+1456       624
+1568       912
+1680       504
+1792       452
+1904       658
+2016       608
+2128       336
+2240       312
+2352       288
+2464       264
+2576       408
+2688       376
+2800       354
+2912       198
+3024       307
+3136       173
+3248       162
+3360       256
+3472       145
+3584       226
--- a/logs/invmod.png
+++ b/logs/invmod.png
--- a/logs/k7/README
+++ b/logs/k7/README
@ -0,0 +1,13 @@
+To use the pretty graphs you have to first build/run the ltmtest from the root directory of the package.  
+Todo this type 
+
+make timing ; ltmtest
+
+in the root.  It will run for a while [about ten minutes on most PCs] and produce a series of .log files in logs/.
+
+After doing that run "gnuplot graphs.dem" to make the PNGs.  If you managed todo that all so far just open index.html to view
+them all :-)
+
+Have fun
+
+Tom
--- a/logs/k7/add.log
+++ b/logs/k7/add.log
@ -0,0 +1,16 @@
+224  11069160
+448   9156136
+672   8089755
+896   7399424
+1120   6389352
+1344   5818648
+1568   5257112
+1792   4982160
+2016   4527856
+2240   4325312
+2464   4051760
+2688   3767640
+2912   3612520
+3136   3415208
+3360   3258656
+3584   3113360
--- a/logs/k7/addsub.png
+++ b/logs/k7/addsub.png
--- a/logs/k7/expt.log
+++ b/logs/k7/expt.log
@ -0,0 +1,7 @@
+513       664
+769       256
+1025       117
+2049        17
+2561         9
+3073         5
+4097         2
--- a/logs/k7/expt.png
+++ b/logs/k7/expt.png
--- a/logs/k7/expt_dr.log
+++ b/logs/k7/expt_dr.log
@ -0,0 +1,7 @@
+532      1088
+784       460
+1036       240
+1540        92
+2072        43
+3080        15
+4116         6
--- a/logs/k7/graphs.dem
+++ b/logs/k7/graphs.dem
@ -0,0 +1,17 @@
+set terminal png color
+set size 1.75
+set ylabel "Operations per Second"
+set xlabel "Operand size (bits)"
+
+set output "addsub.png"
+plot 'add.log' smooth bezier title "Addition", 'sub.log' smooth bezier title "Subtraction"
+
+set output "mult.png"
+plot 'sqr.log' smooth bezier title "Squaring (without Karatsuba)", 'sqr_kara.log' smooth bezier title "Squaring (Karatsuba)", 'mult.log' smooth bezier title "Multiplication (without Karatsuba)", 'mult_kara.log' smooth bezier title "Multiplication (Karatsuba)"
+
+set output "expt.png"
+plot 'expt.log' smooth bezier title "Exptmod (Montgomery)", 'expt_dr.log' smooth bezier title "Exptmod (Dimminished Radix)"
+
+set output "invmod.png"
+plot 'invmod.log' smooth bezier title "Modular Inverse"
+
--- a/logs/k7/index.html
+++ b/logs/k7/index.html
@ -0,0 +1,24 @@
+<html>
+<head>
+<title>LibTomMath Log Plots</title>
+</head>
+<body>
+
+<h1>Addition and Subtraction</h1>
+<center><img src=addsub.png></center>
+<hr>
+
+<h1>Multipliers</h1>
+<center><img src=mult.png></center>
+<hr>
+
+<h1>Exptmod</h1>
+<center><img src=expt.png></center>
+<hr>
+
+<h1>Modular Inverse</h1>
+<center><img src=invmod.png></center>
+<hr>
+
+</body>
+</html>
--- a/logs/k7/invmod.log
+++ b/logs/k7/invmod.log
@ -0,0 +1,32 @@
+112     16248
+224      8192
+336      5320
+448      3560
+560      2728
+672      2064
+784      1704
+896      2176
+1008      1184
+1120       976
+1232      1280
+1344      1176
+1456       624
+1568       912
+1680       504
+1792       452
+1904       658
+2016       608
+2128       336
+2240       312
+2352       288
+2464       264
+2576       408
+2688       376
+2800       354
+2912       198
+3024       307
+3136       173
+3248       162
+3360       256
+3472       145
+3584       226
--- a/logs/k7/invmod.png
+++ b/logs/k7/invmod.png
--- a/logs/k7/mult.log
+++ b/logs/k7/mult.log
@ -0,0 +1,17 @@
+896    322904
+1344    151592
+1792     90472
+2240     59984
+2688     42624
+3136     31872
+3584     24704
+4032     19704
+4480     16096
+4928     13376
+5376     11272
+5824      9616
+6272      8360
+6720      7304
+7168      1664
+7616      1472
+8064      1328
--- a/logs/k7/mult.png
+++ b/logs/k7/mult.png
--- a/logs/k7/mult_kara.log
+++ b/logs/k7/mult_kara.log
@ -0,0 +1,17 @@
+896    322872
+1344    151688
+1792     90480
+2240     59984
+2688     42656
+3136     32144
+3584     25840
+4032     21328
+4480     17856
+4928     14928
+5376     12856
+5824     11256
+6272      9880
+6720      8984
+7168      7928
+7616      7200
+8064      6576
--- a/logs/k7/sqr.log
+++ b/logs/k7/sqr.log
@ -0,0 +1,17 @@
+896    415472
+1344    223736
+1792    141232
+2240     97624
+2688     71400
+3136     54800
+3584     16904
+4032     13528
+4480     10968
+4928      9128
+5376      7784
+5824      6672
+6272      5760
+6720      5056
+7168      4440
+7616      3952
+8064      3512
--- a/logs/k7/sqr_kara.log
+++ b/logs/k7/sqr_kara.log
@ -0,0 +1,17 @@
+896    420464
+1344    224800
+1792    142808
+2240     97704
+2688     71416
+3136     54504
+3584     38320
+4032     32360
+4480     27576
+4928     23840
+5376     20688
+5824     18264
+6272     16176
+6720     14440
+7168     11688
+7616     10752
+8064      9936
--- a/logs/k7/sub.log
+++ b/logs/k7/sub.log
@ -0,0 +1,16 @@
+224   9728504
+448   8573648
+672   7488096
+896   6714064
+1120   5950472
+1344   5457400
+1568   5038896
+1792   4683632
+2016   4384656
+2240   4105976
+2464   3871608
+2688   3650680
+2912   3463552
+3136   3290016
+3360   3135272
+3584   2993848
--- a/logs/mult.log
+++ b/logs/mult.log
@ -1,17 +1,17 @@
-896    321504
-1344    150784
-1792     90288
-2240     59760
-2688     42480
-3136     32056
-3584     24600
-4032     19656
-4480     16024
-4928     13328
-5376     11280
-5824      9624
-6272      8336
-6720      7280
-7168      1648
-7616      1464
-8064      1296
+896    322904
+1344    151592
+1792     90472
+2240     59984
+2688     42624
+3136     31872
+3584     24704
+4032     19704
+4480     16096
+4928     13376
+5376     11272
+5824      9616
+6272      8360
+6720      7304
+7168      1664
+7616      1472
+8064      1328
--- a/logs/mult.png
+++ b/logs/mult.png
--- a/logs/mult_kara.log
+++ b/logs/mult_kara.log
@ -1,17 +1,17 @@
-896    321928
-1344    150752
-1792     90136
-2240     59888
-2688     42480
-3136     32080
-3584     25744
-4032     21216
-4480     17912
-4928     14896
-5376     12936
-5824     11216
-6272      9848
-6720      8896
-7168      7968
-7616      7248
-8064      6600
+896    322872
+1344    151688
+1792     90480
+2240     59984
+2688     42656
+3136     32144
+3584     25840
+4032     21328
+4480     17856
+4928     14928
+5376     12856
+5824     11256
+6272      9880
+6720      8984
+7168      7928
+7616      7200
+8064      6576
--- a/logs/p4/README
+++ b/logs/p4/README
@ -0,0 +1,13 @@
+To use the pretty graphs you have to first build/run the ltmtest from the root directory of the package.  
+Todo this type 
+
+make timing ; ltmtest
+
+in the root.  It will run for a while [about ten minutes on most PCs] and produce a series of .log files in logs/.
+
+After doing that run "gnuplot graphs.dem" to make the PNGs.  If you managed todo that all so far just open index.html to view
+them all :-)
+
+Have fun
+
+Tom
--- a/logs/p4/add.log
+++ b/logs/p4/add.log
@ -0,0 +1,16 @@
+224   8113248
+448   6585584
+672   5687678
+896   4761144
+1120   4111592
+1344   3995154
+1568   3532387
+1792   3225400
+2016   2963960
+2240   2720112
+2464   2533952
+2688   2307168
+2912   2287064
+3136   2150160
+3360   2035992
+3584   1936304
--- a/logs/p4/addsub.png
+++ b/logs/p4/addsub.png
--- a/logs/p4/expt.log
+++ b/logs/p4/expt.log
@ -0,0 +1,7 @@
+513       195
+769        68
+1025        31
+2049         4
+2561         2
+3073         1
+4097         0
--- a/logs/p4/expt.png
+++ b/logs/p4/expt.png
--- a/logs/p4/expt_dr.log
+++ b/logs/p4/expt_dr.log
@ -0,0 +1,7 @@
+532       393
+784       158
+1036        79
+1540        27
+2072        12
+3080         4
+4116         1
--- a/logs/p4/graphs.dem
+++ b/logs/p4/graphs.dem
@ -0,0 +1,17 @@
+set terminal png color
+set size 1.75
+set ylabel "Operations per Second"
+set xlabel "Operand size (bits)"
+
+set output "addsub.png"
+plot 'add.log' smooth bezier title "Addition", 'sub.log' smooth bezier title "Subtraction"
+
+set output "mult.png"
+plot 'sqr.log' smooth bezier title "Squaring (without Karatsuba)", 'sqr_kara.log' smooth bezier title "Squaring (Karatsuba)", 'mult.log' smooth bezier title "Multiplication (without Karatsuba)", 'mult_kara.log' smooth bezier title "Multiplication (Karatsuba)"
+
+set output "expt.png"
+plot 'expt.log' smooth bezier title "Exptmod (Montgomery)", 'expt_dr.log' smooth bezier title "Exptmod (Dimminished Radix)"
+
+set output "invmod.png"
+plot 'invmod.log' smooth bezier title "Modular Inverse"
+
--- a/logs/p4/index.html
+++ b/logs/p4/index.html
@ -0,0 +1,24 @@
+<html>
+<head>
+<title>LibTomMath Log Plots</title>
+</head>
+<body>
+
+<h1>Addition and Subtraction</h1>
+<center><img src=addsub.png></center>
+<hr>
+
+<h1>Multipliers</h1>
+<center><img src=mult.png></center>
+<hr>
+
+<h1>Exptmod</h1>
+<center><img src=expt.png></center>
+<hr>
+
+<h1>Modular Inverse</h1>
+<center><img src=invmod.png></center>
+<hr>
+
+</body>
+</html>
--- a/logs/p4/invmod.log
+++ b/logs/p4/invmod.log
@ -0,0 +1,32 @@
+112     13608
+224      6872
+336      4264
+448      2792
+560      2144
+672      1560
+784      1296
+896      1672
+1008       896
+1120       736
+1232      1024
+1344       888
+1456       472
+1568       680
+1680       373
+1792       328
+1904       484
+2016       436
+2128       232
+2240       211
+2352       200
+2464       177
+2576       293
+2688       262
+2800       251
+2912       137
+3024       216
+3136       117
+3248       113
+3360       181
+3472        98
+3584       158
--- a/logs/p4/invmod.png
+++ b/logs/p4/invmod.png
--- a/logs/p4/mult.log
+++ b/logs/p4/mult.log
@ -0,0 +1,17 @@
+896     77600
+1344     35776
+1792     19688
+2240     13248
+2688      9424
+3136      7056
+3584      5464
+4032      4368
+4480      3568
+4928      2976
+5376      2520
+5824      2152
+6272      1872
+6720      1632
+7168       650
+7616       576
+8064       515
--- a/logs/p4/mult.png
+++ b/logs/p4/mult.png
--- a/logs/p4/mult_kara.log
+++ b/logs/p4/mult_kara.log
@ -0,0 +1,17 @@
+896     77752
+1344     35832
+1792     19688
+2240     14704
+2688     10832
+3136      8336
+3584      6600
+4032      5424
+4480      4648
+4928      3976
+5376      3448
+5824      3016
+6272      2664
+6720      2384
+7168      2120
+7616      1912
+8064      1752
--- a/logs/p4/sqr.log
+++ b/logs/p4/sqr.log
@ -0,0 +1,17 @@
+896    128088
+1344     63640
+1792     37968
+2240     25488
+2688     18176
+3136     13672
+3584      4920
+4032      3912
+4480      3160
+4928      2616
+5376      2216
+5824      1896
+6272      1624
+6720      1408
+7168      1240
+7616      1096
+8064       984
--- a/logs/p4/sqr_kara.log
+++ b/logs/p4/sqr_kara.log
@ -0,0 +1,17 @@
+896    127456
+1344     63752
+1792     37920
+2240     25440
+2688     18200
+3136     13728
+3584     10968
+4032      9072
+4480      7608
+4928      6440
+5376      5528
+5824      4768
+6272      4328
+6720      3888
+7168      3504
+7616      3176
+8064      2896
--- a/logs/p4/sub.log
+++ b/logs/p4/sub.log
@ -0,0 +1,16 @@
+224   7355896
+448   6162880
+672   5218984
+896   4622776
+1120   3999320
+1344   3629480
+1568   3290384
+1792   2954752
+2016   2737056
+2240   2563320
+2464   2451928
+2688   2310920
+2912   2139048
+3136   2034080
+3360   1890800
+3584   1808624
--- a/logs/sqr.log
+++ b/logs/sqr.log
@ -1,17 +1,17 @@
-896    416968
-1344    223672
-1792    141552
-2240     97280
-2688     71304
-3136     54648
-3584     16264
-4032     13000
-4480     10528
-4928      8776
-5376      7464
-5824      6440
-6272      5520
-6720      4808
-7168      4264
-7616      3784
-8064      3368
+896    415472
+1344    223736
+1792    141232
+2240     97624
+2688     71400
+3136     54800
+3584     16904
+4032     13528
+4480     10968
+4928      9128
+5376      7784
+5824      6672
+6272      5760
+6720      5056
+7168      4440
+7616      3952
+8064      3512
--- a/logs/sqr_kara.log
+++ b/logs/sqr_kara.log
@ -1,17 +1,17 @@
-896    416656
-1344    223728
-1792    141288
-2240     97456
-2688     71152
-3136     54392
-3584     38552
-4032     32216
-4480     27384
-4928     23792
-5376     20728
-5824     18232
-6272     16160
-6720     14408
-7168     11696
-7616     10768
-8064      9920
+896    420464
+1344    224800
+1792    142808
+2240     97704
+2688     71416
+3136     54504
+3584     38320
+4032     32360
+4480     27576
+4928     23840
+5376     20688
+5824     18264
+6272     16176
+6720     14440
+7168     11688
+7616     10752
+8064      9936
--- a/logs/sub.log
+++ b/logs/sub.log
@ -1,16 +1,16 @@
-224   9862520
-448   8562344
-672   7661400
-896   6838128
-1120   5911144
-1344   5394040
-1568   4993760
-1792   4624240
-2016   4332024
-2240   4029312
-2464   3790784
-2688   3587216
-2912   3397952
-3136   3239736
-3360   3080616
-3584   2933104
+224   9728504
+448   8573648
+672   7488096
+896   6714064
+1120   5950472
+1344   5457400
+1568   5038896
+1792   4683632
+2016   4384656
+2240   4105976
+2464   3871608
+2688   3650680
+2912   3463552
+3136   3290016
+3360   3135272
+3584   2993848
--- a/16
+++ b/16
@ -1,6 +1,6 @@
 CFLAGS  +=  -I./ -Wall -W -Wshadow -O3 -fomit-frame-pointer -funroll-loops

-VERSION=0.17
+VERSION=0.18

 default: libtommath.a

@ -33,7 +33,9 @@ bn_mp_to_signed_bin.o bn_mp_unsigned_bin_size.o bn_mp_signed_bin_size.o bn_radix
 bn_mp_xor.o bn_mp_and.o bn_mp_or.o bn_mp_rand.o bn_mp_montgomery_calc_normalization.o \
 bn_mp_prime_is_divisible.o bn_prime_tab.o bn_mp_prime_fermat.o bn_mp_prime_miller_rabin.o \
 bn_mp_prime_is_prime.o bn_mp_prime_next_prime.o bn_mp_dr_reduce.o bn_mp_multi.o \
-bn_mp_dr_is_modulus.o bn_mp_dr_setup.o
+bn_mp_dr_is_modulus.o bn_mp_dr_setup.o bn_mp_reduce_setup.o \
+bn_mp_toom_mul.o bn_mp_toom_sqr.o bn_mp_div_3.o bn_s_mp_exptmod.o \
+bn_mp_reduce_2k.o bn_mp_reduce_is_2k.o bn_mp_reduce_2k_setup.o

 libtommath.a:  $(OBJECTS)
 	$(AR) $(ARFLAGS) libtommath.a $(OBJECTS)
@ -63,6 +65,11 @@ docdvi: tommath.src
 	makeindex tommath
 	latex tommath > /dev/null
 		
+# poster, makes the single page PDF poster
+poster: poster.tex
+	pdflatex poster
+	rm -f poster.aux poster.log 
+	
 # makes the LTM book PS/PDF file, requires tetex, cleans up the LaTeX temp files
 docs:	
 	cd pics ; make pdfes
@ -88,11 +95,12 @@ manual:
 	
 clean:
 	rm -f *.pdf *.o *.a *.obj *.lib *.exe etclib/*.o demo/demo.o test ltmtest mpitest mtest/mtest mtest/mtest.exe \
-        tommath.idx tommath.toc tommath.log tommath.aux tommath.dvi tommath.lof tommath.ind tommath.ilg *.ps *.pdf *.log *.s mpi.c 
+        tommath.idx tommath.toc tommath.log tommath.aux tommath.dvi tommath.lof tommath.ind tommath.ilg *.ps *.pdf *.log *.s mpi.c \
+        poster.aux poster.dvi poster.log
 	cd etc ; make clean
 	cd pics ; make clean

-zipup: clean manual
+zipup: clean manual poster
 	perl gen.pl ; mv mpi.c pre_gen/ ; \
 	cd .. ; rm -rf ltm* libtommath-$(VERSION) ; mkdir libtommath-$(VERSION) ; \
 	cp -R ./libtommath/* ./libtommath-$(VERSION)/ ; tar -c libtommath-$(VERSION)/* > ltm-$(VERSION).tar ; \
--- a/makefile.bcc
+++ b/makefile.bcc
@ -0,0 +1,37 @@
+#
+# Borland C++Builder Makefile (makefile.bcc)
+#
+
+
+LIB = tlib
+CC = bcc32
+CFLAGS = -c -O2 -I.
+
+OBJECTS=bncore.obj bn_mp_init.obj bn_mp_clear.obj bn_mp_exch.obj bn_mp_grow.obj bn_mp_shrink.obj \
+bn_mp_clamp.obj bn_mp_zero.obj  bn_mp_set.obj bn_mp_set_int.obj bn_mp_init_size.obj bn_mp_copy.obj \
+bn_mp_init_copy.obj bn_mp_abs.obj bn_mp_neg.obj bn_mp_cmp_mag.obj bn_mp_cmp.obj bn_mp_cmp_d.obj \
+bn_mp_rshd.obj bn_mp_lshd.obj bn_mp_mod_2d.obj bn_mp_div_2d.obj bn_mp_mul_2d.obj bn_mp_div_2.obj \
+bn_mp_mul_2.obj bn_s_mp_add.obj bn_s_mp_sub.obj bn_fast_s_mp_mul_digs.obj bn_s_mp_mul_digs.obj \
+bn_fast_s_mp_mul_high_digs.obj bn_s_mp_mul_high_digs.obj bn_fast_s_mp_sqr.obj bn_s_mp_sqr.obj \
+bn_mp_add.obj bn_mp_sub.obj bn_mp_karatsuba_mul.obj bn_mp_mul.obj bn_mp_karatsuba_sqr.obj \
+bn_mp_sqr.obj bn_mp_div.obj bn_mp_mod.obj bn_mp_add_d.obj bn_mp_sub_d.obj bn_mp_mul_d.obj \
+bn_mp_div_d.obj bn_mp_mod_d.obj bn_mp_expt_d.obj bn_mp_addmod.obj bn_mp_submod.obj \
+bn_mp_mulmod.obj bn_mp_sqrmod.obj bn_mp_gcd.obj bn_mp_lcm.obj bn_fast_mp_invmod.obj bn_mp_invmod.obj \
+bn_mp_reduce.obj bn_mp_montgomery_setup.obj bn_fast_mp_montgomery_reduce.obj bn_mp_montgomery_reduce.obj \
+bn_mp_exptmod_fast.obj bn_mp_exptmod.obj bn_mp_2expt.obj bn_mp_n_root.obj bn_mp_jacobi.obj bn_reverse.obj \
+bn_mp_count_bits.obj bn_mp_read_unsigned_bin.obj bn_mp_read_signed_bin.obj bn_mp_to_unsigned_bin.obj \
+bn_mp_to_signed_bin.obj bn_mp_unsigned_bin_size.obj bn_mp_signed_bin_size.obj bn_radix.obj \
+bn_mp_xor.obj bn_mp_and.obj bn_mp_or.obj bn_mp_rand.obj bn_mp_montgomery_calc_normalization.obj \
+bn_mp_prime_is_divisible.obj bn_prime_tab.obj bn_mp_prime_fermat.obj bn_mp_prime_miller_rabin.obj \
+bn_mp_prime_is_prime.obj bn_mp_prime_next_prime.obj bn_mp_dr_reduce.obj bn_mp_multi.obj \
+bn_mp_dr_is_modulus.obj bn_mp_dr_setup.obj bn_mp_reduce_setup.obj \
+bn_mp_toom_mul.obj bn_mp_toom_sqr.obj bn_mp_div_3.obj bn_s_mp_exptmod.obj \
+bn_mp_reduce_2k.obj bn_mp_reduce_is_2k.obj bn_mp_reduce_2k_setup.obj
+
+TARGET = libtommath.lib
+
+$(TARGET): $(OBJECTS)
+
+.c.objbj:
+	$(CC) $(CFLAGS) $<
+	$(LIB) $(TARGET) -+$@
--- a/makefile.msvc
+++ b/makefile.msvc
@ -23,7 +23,10 @@ bn_mp_to_signed_bin.obj bn_mp_unsigned_bin_size.obj bn_mp_signed_bin_size.obj bn
 bn_mp_xor.obj bn_mp_and.obj bn_mp_or.obj bn_mp_rand.obj bn_mp_montgomery_calc_normalization.obj \
 bn_mp_prime_is_divisible.obj bn_prime_tab.obj bn_mp_prime_fermat.obj bn_mp_prime_miller_rabin.obj \
 bn_mp_prime_is_prime.obj bn_mp_prime_next_prime.obj bn_mp_dr_reduce.obj bn_mp_multi.obj \
-bn_mp_dr_is_modulus.obj bn_mp_dr_setup.obj
+bn_mp_dr_is_modulus.obj bn_mp_dr_setup.obj bn_mp_reduce_setup.obj \
+bn_mp_toom_mul.obj bn_mp_toom_sqr.obj bn_mp_div_3.obj bn_s_mp_exptmod.obj \
+bn_mp_reduce_2k.obj bn_mp_reduce_is_2k.obj bn_mp_reduce_2k_setup.obj
+


 library: $(OBJECTS)
--- a/mtest/mtest.c
+++ b/mtest/mtest.c
@ -40,14 +40,10 @@ void rand_num(mp_int *a)
   int n, size;
   unsigned char buf[2048];

-top:
-   size = 1 + ((fgetc(rng)*fgetc(rng)) % 1024);
+   size = 1 + ((fgetc(rng)<<8) + fgetc(rng)) % 1031;
   buf[0] = (fgetc(rng)&1)?1:0;
   fread(buf+1, 1, size, rng);
-   for (n = 0; n < size; n++) {
-       if (buf[n+1]) break;
-   }
-   if (n == size) goto top;
+   while (buf[1] == 0) buf[1] = fgetc(rng);
   mp_read_raw(a, buf, 1+size);
 }

@ -56,14 +52,10 @@ void rand_num2(mp_int *a)
   int n, size;
   unsigned char buf[2048];

-top:
-   size = 1 + ((fgetc(rng)*fgetc(rng)) % 96);
+   size = 1 + ((fgetc(rng)<<8) + fgetc(rng)) % 97;
   buf[0] = (fgetc(rng)&1)?1:0;
   fread(buf+1, 1, size, rng);
-   for (n = 0; n < size; n++) {
-       if (buf[n+1]) break;
-   }
-   if (n == size) goto top;
+   while (buf[1] == 0) buf[1] = fgetc(rng);
   mp_read_raw(a, buf, 1+size);
 }

@ -73,6 +65,7 @@ int main(void)
 {
   int n;
   mp_int a, b, c, d, e;
+   clock_t t1;
   char buf[4096];

   mp_init(&a);
@ -108,8 +101,14 @@ int main(void)
      }
   }

+   t1 = clock();
   for (;;) {
-       n =  fgetc(rng) % 13;
+      if (clock() - t1 > CLOCKS_PER_SEC) {
+         sleep(1);
+         t1 = clock();
+      }
+   
+       n = fgetc(rng) % 13;

   if (n == 0) {
       /* add tests */
@ -227,6 +226,7 @@ int main(void)
      rand_num2(&a);
      rand_num2(&b);
      rand_num2(&c);
+//      if (c.dp[0]&1) mp_add_d(&c, 1, &c);
      a.sign = b.sign = c.sign = 0;
      mp_exptmod(&a, &b, &c, &d);
      printf("expt\n");
--- a/pics/expt_state.sxd
+++ b/pics/expt_state.sxd
--- a/Show More
+++ b/Show More