diff --git a/bn.pdf b/bn.pdf
index d047a83..b81b577 100644
Binary files a/bn.pdf and b/bn.pdf differ
diff --git a/bn.tex b/bn.tex
index 980d6b9..8ba2964 100644
--- a/bn.tex
+++ b/bn.tex
@@ -1,7 +1,7 @@
-\documentclass[]{report}
+\documentclass[]{article}
 \begin{document}
 
-\title{LibTomMath v0.16 \\ A Free Multiple Precision Integer Library \\ http://math.libtomcrypt.org }
+\title{LibTomMath v0.17 \\ A Free Multiple Precision Integer Library \\ http://math.libtomcrypt.org }
 \author{Tom St Denis \\ tomstdenis@iahu.ca}
 \maketitle
 \newpage
diff --git a/bn_fast_mp_invmod.c b/bn_fast_mp_invmod.c
index 68cdf1c..eb71601 100644
--- a/bn_fast_mp_invmod.c
+++ b/bn_fast_mp_invmod.c
@@ -27,41 +27,18 @@ fast_mp_invmod (mp_int * a, mp_int * b, mp_int * c)
   int     res, neg;
 
   /* init all our temps */
-  if ((res = mp_init (&x)) != MP_OKAY) {
-    goto __ERR;
-  }
-
-  if ((res = mp_init (&y)) != MP_OKAY) {
-    goto __X;
-  }
-
-  if ((res = mp_init (&u)) != MP_OKAY) {
-    goto __Y;
-  }
-
-  if ((res = mp_init (&v)) != MP_OKAY) {
-    goto __U;
-  }
-
-  if ((res = mp_init (&B)) != MP_OKAY) {
-    goto __V;
-  }
-
-  if ((res = mp_init (&D)) != MP_OKAY) {
-    goto __B;
+  if ((res = mp_init_multi(&x, &y, &u, &v, &B, &D, NULL)) != MP_OKAY) {
+     return res;
   }
 
   /* x == modulus, y == value to invert */
   if ((res = mp_copy (b, &x)) != MP_OKAY) {
-    goto __D;
-  }
-  if ((res = mp_copy (a, &y)) != MP_OKAY) {
-    goto __D;
+    goto __ERR;
   }
 
-  /* we need |y| */
-  if ((res = mp_abs (&y, &y)) != MP_OKAY) {
-    goto __D;
+  /* we need y = |a| */
+  if ((res = mp_abs (a, &y)) != MP_OKAY) {
+    goto __ERR;
   }
 
   /* 2. [modified] if x,y are both even then return an error! 
@@ -70,15 +47,15 @@ fast_mp_invmod (mp_int * a, mp_int * b, mp_int * c)
    */
   if (mp_iseven (&x) == 1 && mp_iseven (&y) == 1) {
     res = MP_VAL;
-    goto __D;
+    goto __ERR;
   }
 
   /* 3. u=x, v=y, A=1, B=0, C=0,D=1 */
   if ((res = mp_copy (&x, &u)) != MP_OKAY) {
-    goto __D;
+    goto __ERR;
   }
   if ((res = mp_copy (&y, &v)) != MP_OKAY) {
-    goto __D;
+    goto __ERR;
   }
   mp_set (&D, 1);
 
@@ -87,17 +64,17 @@ top:
   while (mp_iseven (&u) == 1) {
     /* 4.1 u = u/2 */
     if ((res = mp_div_2 (&u, &u)) != MP_OKAY) {
-      goto __D;
+      goto __ERR;
     }
     /* 4.2 if A or B is odd then */
     if (mp_iseven (&B) == 0) {
       if ((res = mp_sub (&B, &x, &B)) != MP_OKAY) {
-	goto __D;
+        goto __ERR;
       }
     }
     /* B = B/2 */
     if ((res = mp_div_2 (&B, &B)) != MP_OKAY) {
-      goto __D;
+      goto __ERR;
     }
   }
 
@@ -105,18 +82,18 @@ top:
   while (mp_iseven (&v) == 1) {
     /* 5.1 v = v/2 */
     if ((res = mp_div_2 (&v, &v)) != MP_OKAY) {
-      goto __D;
+      goto __ERR;
     }
     /* 5.2 if C,D are even then */
     if (mp_iseven (&D) == 0) {
       /* D = (D-x)/2 */
       if ((res = mp_sub (&D, &x, &D)) != MP_OKAY) {
-	goto __D;
+        goto __ERR;
       }
     }
     /* D = D/2 */
     if ((res = mp_div_2 (&D, &D)) != MP_OKAY) {
-      goto __D;
+      goto __ERR;
     }
   }
 
@@ -124,20 +101,20 @@ top:
   if (mp_cmp (&u, &v) != MP_LT) {
     /* u = u - v, B = B - D */
     if ((res = mp_sub (&u, &v, &u)) != MP_OKAY) {
-      goto __D;
+      goto __ERR;
     }
 
     if ((res = mp_sub (&B, &D, &B)) != MP_OKAY) {
-      goto __D;
+      goto __ERR;
     }
   } else {
     /* v - v - u, D = D - B */
     if ((res = mp_sub (&v, &u, &v)) != MP_OKAY) {
-      goto __D;
+      goto __ERR;
     }
 
     if ((res = mp_sub (&D, &B, &D)) != MP_OKAY) {
-      goto __D;
+      goto __ERR;
     }
   }
 
@@ -151,26 +128,20 @@ top:
   /* if v != 1 then there is no inverse */
   if (mp_cmp_d (&v, 1) != MP_EQ) {
     res = MP_VAL;
-    goto __D;
+    goto __ERR;
   }
 
   /* b is now the inverse */
   neg = a->sign;
   while (D.sign == MP_NEG) {
     if ((res = mp_add (&D, b, &D)) != MP_OKAY) {
-      goto __D;
+      goto __ERR;
     }
   }
   mp_exch (&D, c);
   c->sign = neg;
   res = MP_OKAY;
 
-__D:mp_clear (&D);
-__B:mp_clear (&B);
-__V:mp_clear (&v);
-__U:mp_clear (&u);
-__Y:mp_clear (&y);
-__X:mp_clear (&x);
-__ERR:
+__ERR:mp_clear_multi (&x, &y, &u, &v, &B, &D, NULL);
   return res;
 }
diff --git a/bn_fast_mp_montgomery_reduce.c b/bn_fast_mp_montgomery_reduce.c
index 031b410..7591902 100644
--- a/bn_fast_mp_montgomery_reduce.c
+++ b/bn_fast_mp_montgomery_reduce.c
@@ -26,7 +26,7 @@ int
 fast_mp_montgomery_reduce (mp_int * a, mp_int * m, mp_digit mp)
 {
   int     ix, res, olduse;
-  mp_word W[512];
+  mp_word W[MP_WARRAY];
 
   /* get old used count */
   olduse = a->used;
@@ -92,7 +92,7 @@ fast_mp_montgomery_reduce (mp_int * a, mp_int * m, mp_digit mp)
 
       /* inner loop */
       for (iy = 0; iy < m->used; iy++) {
-	*_W++ += ((mp_word) ui) * ((mp_word) * tmpx++);
+    *_W++ += ((mp_word) ui) * ((mp_word) * tmpx++);
       }
     }
 
diff --git a/bn_fast_s_mp_mul_digs.c b/bn_fast_s_mp_mul_digs.c
index 3cba3e1..d09489d 100644
--- a/bn_fast_s_mp_mul_digs.c
+++ b/bn_fast_s_mp_mul_digs.c
@@ -16,14 +16,16 @@
 
 /* Fast (comba) multiplier
  *
- * This is the fast column-array [comba] multiplier.  It is designed to compute
- * the columns of the product first then handle the carries afterwards.  This
- * has the effect of making the nested loops that compute the columns very
+ * This is the fast column-array [comba] multiplier.  It is 
+ * designed to compute the columns of the product first 
+ * then handle the carries afterwards.  This has the effect 
+ * of making the nested loops that compute the columns very
  * simple and schedulable on super-scalar processors.
  *
- * This has been modified to produce a variable number of digits of output so
- * if say only a half-product is required you don't have to compute the upper half
- * (a feature required for fast Barrett reduction).
+ * This has been modified to produce a variable number of 
+ * digits of output so if say only a half-product is required 
+ * you don't have to compute the upper half (a feature 
+ * required for fast Barrett reduction).
  *
  * Based on Algorithm 14.12 on pp.595 of HAC.
  *
@@ -32,7 +34,7 @@ int
 fast_s_mp_mul_digs (mp_int * a, mp_int * b, mp_int * c, int digs)
 {
   int     olduse, res, pa, ix;
-  mp_word W[512];
+  mp_word W[MP_WARRAY];
 
   /* grow the destination as required */
   if (c->alloc < digs) {
@@ -47,10 +49,9 @@ fast_s_mp_mul_digs (mp_int * a, mp_int * b, mp_int * c, int digs)
   /* calculate the columns */
   pa = a->used;
   for (ix = 0; ix < pa; ix++) {
-
-    /* this multiplier has been modified to allow you to control how many digits 
-     * of output are produced.  So at most we want to make upto "digs" digits
-     * of output.
+    /* this multiplier has been modified to allow you to 
+     * control how many digits of output are produced.  
+     * So at most we want to make upto "digs" digits of output.
      *
      * this adds products to distinct columns (at ix+iy) of W
      * note that each step through the loop is not dependent on
@@ -73,14 +74,14 @@ fast_s_mp_mul_digs (mp_int * a, mp_int * b, mp_int * c, int digs)
        */
       _W = W + ix;
 
-      /* the number of digits is limited by their placement.  E.g. 
+      /* the number of digits is limited by their placement.  E.g.
          we avoid multiplying digits that will end up above the # of
          digits of precision requested
        */
       pb = MIN (b->used, digs - ix);
 
       for (iy = 0; iy < pb; iy++) {
-	*_W++ += ((mp_word) tmpx) * ((mp_word) * tmpy++);
+        *_W++ += ((mp_word) tmpx) * ((mp_word) * tmpy++);
       }
     }
 
@@ -97,11 +98,12 @@ fast_s_mp_mul_digs (mp_int * a, mp_int * b, mp_int * c, int digs)
      * correct result we must take the extra bits from each column and
      * carry them down
      *
-     * Note that while this adds extra code to the multiplier it saves time
-     * since the carry propagation is removed from the above nested loop.
-     * This has the effect of reducing the work from N*(N+N*c)==N^2 + c*N^2 to
-     * N^2 + N*c where c is the cost of the shifting.  On very small numbers
-     * this is slower but on most cryptographic size numbers it is faster.
+     * Note that while this adds extra code to the multiplier it 
+     * saves time since the carry propagation is removed from the 
+     * above nested loop.  This has the effect of reducing the work
+     * from N*(N+N*c)==N**2 + c*N**2 to N**2 + N*c where c is the 
+     * cost of the shifting.  On very small numbers this is slower 
+     * but on most cryptographic size numbers it is faster.
      */
     tmpc = c->dp;
     for (ix = 1; ix < digs; ix++) {
diff --git a/bn_fast_s_mp_mul_high_digs.c b/bn_fast_s_mp_mul_high_digs.c
index 4a21441..1cc1639 100644
--- a/bn_fast_s_mp_mul_high_digs.c
+++ b/bn_fast_s_mp_mul_high_digs.c
@@ -27,7 +27,7 @@ int
 fast_s_mp_mul_high_digs (mp_int * a, mp_int * b, mp_int * c, int digs)
 {
   int     oldused, newused, res, pa, pb, ix;
-  mp_word W[512];
+  mp_word W[MP_WARRAY];
 
   /* calculate size of product and allocate more space if required */
   newused = a->used + b->used + 1;
@@ -55,15 +55,23 @@ fast_s_mp_mul_high_digs (mp_int * a, mp_int * b, mp_int * c, int digs)
 
       /* alias for right side */
       tmpy = b->dp + iy;
-
+     
       /* alias for the columns of output.  Offset to be equal to or above the 
        * smallest digit place requested 
        */
-      _W = &(W[digs]);
+      _W = W + digs;     
+      
+      /* skip cases below zero where ix > digs */
+      if (iy < 0) {
+         iy    = abs(iy);
+         tmpy += iy;
+         _W   += iy;
+         iy    = 0;
+      }
 
       /* compute column products for digits above the minimum */
       for (; iy < pb; iy++) {
-	*_W++ += ((mp_word) tmpx) * ((mp_word) * tmpy++);
+    *_W++ += ((mp_word) tmpx) * ((mp_word) * tmpy++);
       }
     }
   }
diff --git a/bn_fast_s_mp_sqr.c b/bn_fast_s_mp_sqr.c
index 093bc89..7ce3839 100644
--- a/bn_fast_s_mp_sqr.c
+++ b/bn_fast_s_mp_sqr.c
@@ -20,7 +20,7 @@
  * then the carries are computed.  This has the effect of making a very simple
  * inner loop that is executed the most
  *
- * W2 represents the outer products and W the inner.  
+ * W2 represents the outer products and W the inner.
  *
  * A further optimizations is made because the inner products are of the form
  * "A * B * 2".  The *2 part does not need to be computed until the end which is
@@ -33,7 +33,7 @@ int
 fast_s_mp_sqr (mp_int * a, mp_int * b)
 {
   int     olduse, newused, res, ix, pa;
-  mp_word W2[512], W[512];
+  mp_word W2[MP_WARRAY], W[MP_WARRAY];
 
   /* calculate size of product and allocate as required */
   pa = a->used;
@@ -44,9 +44,9 @@ fast_s_mp_sqr (mp_int * a, mp_int * b)
     }
   }
 
-  /* zero temp buffer (columns) 
+  /* zero temp buffer (columns)
    * Note that there are two buffers.  Since squaring requires
-   * a outter and inner product and the inner product requires 
+   * an outer and inner product and the inner product requires
    * computing a product and doubling it (a relatively expensive
    * op to perform n^2 times if you don't have to) the inner and
    * outer products are computed in different buffers.  This way
@@ -60,7 +60,7 @@ fast_s_mp_sqr (mp_int * a, mp_int * b)
  * values in W2 are only written in even locations which means
  * we can collapse the array to 256 words [and fixup the memset above]
  * provided we also fix up the summations below.  Ideally
- * the fixup loop should be unrolled twice to handle the even/odd 
+ * the fixup loop should be unrolled twice to handle the even/odd
  * cases, and then a final step to handle odd cases [e.g. newused == odd]
  *
  * This will not only save ~8*256 = 2KB of stack but lower the number of
@@ -71,10 +71,10 @@ fast_s_mp_sqr (mp_int * a, mp_int * b)
    * the multiplication by two is done afterwards in the N loop.
    */
   for (ix = 0; ix < pa; ix++) {
-    /* compute the outer product 
+    /* compute the outer product
      *
-     * Note that every outer product is computed 
-     * for a particular column only once which means that 
+     * Note that every outer product is computed
+     * for a particular column only once which means that
      * there is no need todo a double precision addition
      */
     W2[ix + ix] = ((mp_word) a->dp[ix]) * ((mp_word) a->dp[ix]);
@@ -95,7 +95,7 @@ fast_s_mp_sqr (mp_int * a, mp_int * b)
 
       /* inner products */
       for (iy = ix + 1; iy < pa; iy++) {
-	*_W++ += ((mp_word) tmpx) * ((mp_word) * tmpy++);
+          *_W++ += ((mp_word) tmpx) * ((mp_word) * tmpy++);
       }
     }
   }
diff --git a/bn_mp_add.c b/bn_mp_add.c
index 02f130a..43a08ab 100644
--- a/bn_mp_add.c
+++ b/bn_mp_add.c
@@ -24,33 +24,25 @@ mp_add (mp_int * a, mp_int * b, mp_int * c)
   sa = a->sign;
   sb = b->sign;
 
-  /* handle four cases */
-  if (sa == MP_ZPOS && sb == MP_ZPOS) {
-    /* both positive */
+  /* handle two cases, not four */
+  if (sa == sb) {
+    /* both positive or both negative */
+    /* add their magnitudes, copy the sign */
+    c->sign = sa;
     res = s_mp_add (a, b, c);
-    c->sign = MP_ZPOS;
-  } else if (sa == MP_ZPOS && sb == MP_NEG) {
-    /* a + -b == a - b, but if b>a then we do it as -(b-a) */
-    if (mp_cmp_mag (a, b) == MP_LT) {
-      res = s_mp_sub (b, a, c);
-      c->sign = MP_NEG;
-    } else {
-      res = s_mp_sub (a, b, c);
-      c->sign = MP_ZPOS;
-    }
-  } else if (sa == MP_NEG && sb == MP_ZPOS) {
-    /* -a + b == b - a, but if a>b then we do it as -(a-b) */
-    if (mp_cmp_mag (a, b) == MP_GT) {
-      res = s_mp_sub (a, b, c);
-      c->sign = MP_NEG;
-    } else {
-      res = s_mp_sub (b, a, c);
-      c->sign = MP_ZPOS;
-    }
   } else {
-    /* -a + -b == -(a + b) */
-    res = s_mp_add (a, b, c);
-    c->sign = MP_NEG;
+    /* one positive, the other negative */
+    /* subtract the one with the lesser magnitude from */
+    /* the one of the greater magnitude.  The result gets */
+    /* the sign of the one with the greater magnitude. */
+    if (mp_cmp_mag (a, b) == MP_LT) {
+      c->sign = sb;
+      res = s_mp_sub (b, a, c);
+    } else {
+      c->sign = sa;
+      res = s_mp_sub (a, b, c);
+    }
   }
   return res;
 }
+
diff --git a/bn_mp_cmp.c b/bn_mp_cmp.c
index 391eca3..4bf8082 100644
--- a/bn_mp_cmp.c
+++ b/bn_mp_cmp.c
@@ -21,8 +21,17 @@ mp_cmp (mp_int * a, mp_int * b)
   /* compare based on sign */
   if (a->sign == MP_NEG && b->sign == MP_ZPOS) {
     return MP_LT;
-  } else if (a->sign == MP_ZPOS && b->sign == MP_NEG) {
+  } 
+  
+  if (a->sign == MP_ZPOS && b->sign == MP_NEG) {
     return MP_GT;
   }
-  return mp_cmp_mag (a, b);
+  
+  /* compare digits */
+  if (a->sign == MP_NEG) {
+     /* if negative compare opposite direction */
+     return mp_cmp_mag(b, a);
+  } else {
+     return mp_cmp_mag(a, b);
+  }
 }
diff --git a/bn_mp_cmp_mag.c b/bn_mp_cmp_mag.c
index a40b518..87b56d6 100644
--- a/bn_mp_cmp_mag.c
+++ b/bn_mp_cmp_mag.c
@@ -23,7 +23,9 @@ mp_cmp_mag (mp_int * a, mp_int * b)
   /* compare based on # of non-zero digits */
   if (a->used > b->used) {
     return MP_GT;
-  } else if (a->used < b->used) {
+  } 
+  
+  if (a->used < b->used) {
     return MP_LT;
   }
 
@@ -31,7 +33,9 @@ mp_cmp_mag (mp_int * a, mp_int * b)
   for (n = a->used - 1; n >= 0; n--) {
     if (a->dp[n] > b->dp[n]) {
       return MP_GT;
-    } else if (a->dp[n] < b->dp[n]) {
+    } 
+    
+    if (a->dp[n] < b->dp[n]) {
       return MP_LT;
     }
   }
diff --git a/bn_mp_copy.c b/bn_mp_copy.c
index 1bf5f12..ebdca5a 100644
--- a/bn_mp_copy.c
+++ b/bn_mp_copy.c
@@ -31,13 +31,10 @@ mp_copy (mp_int * a, mp_int * b)
   }
 
   /* zero b and copy the parameters over */
-  b->used = a->used;
-  b->sign = a->sign;
-
   {
     register mp_digit *tmpa, *tmpb;
 
-    /* point aliases */
+    /* pointer aliases */
     tmpa = a->dp;
     tmpb = b->dp;
 
@@ -47,9 +44,11 @@ mp_copy (mp_int * a, mp_int * b)
     }
 
     /* clear high digits */
-    for (; n < b->alloc; n++) {
+    for (; n < b->used; n++) {
       *tmpb++ = 0;
     }
   }
+  b->used = a->used;
+  b->sign = a->sign;
   return MP_OKAY;
 }
diff --git a/bn_mp_div.c b/bn_mp_div.c
index 3888a4b..3ba609d 100644
--- a/bn_mp_div.c
+++ b/bn_mp_div.c
@@ -75,7 +75,7 @@ mp_div (mp_int * a, mp_int * b, mp_int * c, mp_int * d)
 
   /* normalize both x and y, ensure that y >= b/2, [b == 2^DIGIT_BIT] */
   norm = mp_count_bits(&y) % DIGIT_BIT;
-  if (norm < (DIGIT_BIT-1)) {
+  if (norm < (int)(DIGIT_BIT-1)) {
      norm = (DIGIT_BIT-1) - norm;
      if ((res = mp_mul_2d (&x, norm, &x)) != MP_OKAY) {
        goto __Y;
@@ -86,13 +86,13 @@ mp_div (mp_int * a, mp_int * b, mp_int * c, mp_int * d)
   } else {
      norm = 0;
   }
-     
+
   /* note hac does 0 based, so if used==5 then its 0,1,2,3,4, e.g. use 4 */
   n = x.used - 1;
   t = y.used - 1;
 
   /* step 2. while (x >= y*b^n-t) do { q[n-t] += 1; x -= y*b^{n-t} } */
-  if ((res = mp_lshd (&y, n - t)) != MP_OKAY) {	/* y = y*b^{n-t} */
+  if ((res = mp_lshd (&y, n - t)) != MP_OKAY) { /* y = y*b^{n-t} */
     goto __Y;
   }
 
@@ -113,14 +113,14 @@ mp_div (mp_int * a, mp_int * b, mp_int * c, mp_int * d)
 
     /* step 3.1 if xi == yt then set q{i-t-1} to b-1, otherwise set q{i-t-1} to (xi*b + x{i-1})/yt */
     if (x.dp[i] == y.dp[t]) {
-      q.dp[i - t - 1] = ((1UL << DIGIT_BIT) - 1UL);
+      q.dp[i - t - 1] = ((((mp_digit)1) << DIGIT_BIT) - 1);
     } else {
       mp_word tmp;
       tmp = ((mp_word) x.dp[i]) << ((mp_word) DIGIT_BIT);
       tmp |= ((mp_word) x.dp[i - 1]);
       tmp /= ((mp_word) y.dp[t]);
       if (tmp > (mp_word) MP_MASK)
-	tmp = MP_MASK;
+        tmp = MP_MASK;
       q.dp[i - t - 1] = (mp_digit) (tmp & (mp_word) (MP_MASK));
     }
 
@@ -135,7 +135,7 @@ mp_div (mp_int * a, mp_int * b, mp_int * c, mp_int * d)
       t1.dp[1] = y.dp[t];
       t1.used = 2;
       if ((res = mp_mul_d (&t1, q.dp[i - t - 1], &t1)) != MP_OKAY) {
-	goto __Y;
+        goto __Y;
       }
 
       /* find right hand */
@@ -143,7 +143,7 @@ mp_div (mp_int * a, mp_int * b, mp_int * c, mp_int * d)
       t2.dp[1] = (i - 1 < 0) ? 0 : x.dp[i - 1];
       t2.dp[2] = x.dp[i];
       t2.used = 3;
-    } while (mp_cmp (&t1, &t2) == MP_GT);
+    } while (mp_cmp_mag(&t1, &t2) == MP_GT);
 
     /* step 3.3 x = x - q{i-t-1} * y * b^{i-t-1} */
     if ((res = mp_mul_d (&y, q.dp[i - t - 1], &t1)) != MP_OKAY) {
@@ -161,19 +161,19 @@ mp_div (mp_int * a, mp_int * b, mp_int * c, mp_int * d)
     /* step 3.4 if x < 0 then { x = x + y*b^{i-t-1}; q{i-t-1} -= 1; } */
     if (x.sign == MP_NEG) {
       if ((res = mp_copy (&y, &t1)) != MP_OKAY) {
-	goto __Y;
+        goto __Y;
       }
       if ((res = mp_lshd (&t1, i - t - 1)) != MP_OKAY) {
-	goto __Y;
+        goto __Y;
       }
       if ((res = mp_add (&x, &t1, &x)) != MP_OKAY) {
-	goto __Y;
+        goto __Y;
       }
 
       q.dp[i - t - 1] = (q.dp[i - t - 1] - 1UL) & MP_MASK;
     }
   }
-  
+
   /* now q is the quotient and x is the remainder [which we have to normalize] */
   /* get sign before writing to c */
   x.sign = a->sign;
diff --git a/bn_mp_div_2.c b/bn_mp_div_2.c
index 858e8a4..1ade93c 100644
--- a/bn_mp_div_2.c
+++ b/bn_mp_div_2.c
@@ -34,19 +34,19 @@ mp_div_2 (mp_int * a, mp_int * b)
 
     /* source alias */
     tmpa = a->dp + b->used - 1;
-    
+
     /* dest alias */
     tmpb = b->dp + b->used - 1;
-    
+
     /* carry */
     r = 0;
     for (x = b->used - 1; x >= 0; x--) {
       /* get the carry for the next iteration */
       rr = *tmpa & 1;
-      
+
       /* shift the current digit, add in carry and store */
       *tmpb-- = (*tmpa-- >> 1) | (r << (DIGIT_BIT - 1));
-      
+
       /* forward carry to next iteration */
       r = rr;
     }
diff --git a/bn_mp_div_2d.c b/bn_mp_div_2d.c
index 75501a4..f050c29 100644
--- a/bn_mp_div_2d.c
+++ b/bn_mp_div_2d.c
@@ -51,7 +51,7 @@ mp_div_2d (mp_int * a, int b, mp_int * c, mp_int * d)
   }
 
   /* shift by as many digits in the bit count */
-  if (b >= DIGIT_BIT) {
+  if (b >= (int)DIGIT_BIT) {
     mp_rshd (c, b / DIGIT_BIT);
   }
 
@@ -59,13 +59,13 @@ mp_div_2d (mp_int * a, int b, mp_int * c, mp_int * d)
   D = (mp_digit) (b % DIGIT_BIT);
   if (D != 0) {
     register mp_digit *tmpc, mask;
-    
+
     /* mask */
-    mask = (1U << D) - 1U;
-    
+    mask = (((mp_digit)1) << D) - 1;
+
     /* alias */
     tmpc = c->dp + (c->used - 1);
-    
+
     /* carry */
     r = 0;
     for (x = c->used - 1; x >= 0; x--) {
diff --git a/bn_mp_dr_is_modulus.c b/bn_mp_dr_is_modulus.c
new file mode 100644
index 0000000..381af17
--- /dev/null
+++ b/bn_mp_dr_is_modulus.c
@@ -0,0 +1,34 @@
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is library that provides for multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library is designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+#include <tommath.h>
+
+/* determines if a number is a valid DR modulus: returns 1 when a->used >= 2 and every digit above dp[0] equals MP_MASK, otherwise 0 */
+int mp_dr_is_modulus(mp_int *a)
+{
+   int ix;
+
+   /* must be at least two digits; anything shorter is rejected outright */
+   if (a->used < 2) {
+      return 0;
+   }
+   /* dp[0] is not inspected; the scan starts at digit 1 and stops at the first digit that is not MP_MASK */
+   for (ix = 1; ix < a->used; ix++) {
+       if (a->dp[ix] != MP_MASK) {
+          return 0;
+       }
+   }
+   return 1;
+}
+
diff --git a/bn_mp_dr_reduce.c b/bn_mp_dr_reduce.c
index 75fb7ba..c8488e0 100644
--- a/bn_mp_dr_reduce.c
+++ b/bn_mp_dr_reduce.c
@@ -16,7 +16,7 @@
 
 /* reduce "a" in place modulo "b" using the Diminished Radix algorithm.
  *
- * Based on algorithm from the paper 
+ * Based on algorithm from the paper
  *
  * "Generating Efficient Primes for Discrete Log Cryptosystems"
  *                 Chae Hoon Lim, Pil Loong Lee,
@@ -40,15 +40,15 @@ mp_dr_reduce (mp_int * a, mp_int * b, mp_digit mp)
       return err;
     }
   }
- 
+
   /* alias for a->dp[i] */
   tmpi = a->dp + k + k - 1;
 
-  /* for (i = 2k - 1; i >= k; i = i - 1) 
+  /* for (i = 2k - 1; i >= k; i = i - 1)
    *
    * This is the main loop of the reduction.  Note that at the end
    * the words above position k are not zeroed as expected.  The end
-   * result is that the digits from 0 to k-1 are the residue.  So 
+   * result is that the digits from 0 to k-1 are the residue.  So
    * we have to clear those afterwards.
    */
   for (i = k + k - 1; i >= k; i = i - 1) {
@@ -57,10 +57,10 @@ mp_dr_reduce (mp_int * a, mp_int * b, mp_digit mp)
     /* x[i] * mp */
     r = ((mp_word) *tmpi--) * ((mp_word) mp);
 
-    /* now add r to x[i-1:i-k] 
+    /* now add r to x[i-1:i-k]
      *
      * First add it to the first digit x[i-k] then form the carry
-     * then enter the main loop 
+     * then enter the main loop
      */
     j = i - k;
 
@@ -74,14 +74,14 @@ mp_dr_reduce (mp_int * a, mp_int * b, mp_digit mp)
     mu = (r >> ((mp_word) DIGIT_BIT)) + (*tmpj >> DIGIT_BIT);
 
     /* clear carry from a->dp[j]  */
-    *tmpj++ &= MP_MASK; 
+    *tmpj++ &= MP_MASK;
 
-    /* now add rest of the digits 
-     * 
+    /* now add rest of the digits
+     *
      * Note this is basically a simple single digit addition to
      * a larger multiple digit number.  This is optimized somewhat
      * because the propagation of carries is not likely to move
-     * more than a few digits. 
+     * more than a few digits.
      *
      */
     for (++j; mu != 0 && j <= (i - 1); ++j) {
@@ -99,16 +99,16 @@ mp_dr_reduce (mp_int * a, mp_int * b, mp_digit mp)
       *tmpj += mp;
       mu = *tmpj >> DIGIT_BIT;
       *tmpj++ &= MP_MASK;
-      
+
       /* now handle carries */
       for (++j; mu != 0 && j <= (i - 1); j++) {
-	*tmpj   += mu;
-	mu       = *tmpj >> DIGIT_BIT;
-	*tmpj++ &= MP_MASK;
+          *tmpj   += mu;
+          mu       = *tmpj >> DIGIT_BIT;
+          *tmpj++ &= MP_MASK;
       }
     }
   }
-  
+
   /* zero words above k */
   tmpi = a->dp + k;
   for (i = k; i < a->used; i++) {
@@ -117,34 +117,13 @@ mp_dr_reduce (mp_int * a, mp_int * b, mp_digit mp)
 
   /* clamp, sub and return */
   mp_clamp (a);
-  
+
+  /* if a >= b [b == modulus] then subtract the modulus to fix up */
   if (mp_cmp_mag (a, b) != MP_LT) {
     return s_mp_sub (a, b, a);
   }
   return MP_OKAY;
 }
 
-/* determines if a number is a valid DR modulus */
-int mp_dr_is_modulus(mp_int *a)
-{
-   int ix;
-   
-   /* must be at least two digits */
-   if (a->used < 2) {
-      return 0;
-   }      
-   
-   for (ix = 1; ix < a->used; ix++) {
-       if (a->dp[ix] != MP_MASK) {
-          return 0;
-       }
-   }
-   return 1;
-}
 
-/* determines the setup value */
-void mp_dr_setup(mp_int *a, mp_digit *d)
-{
-   *d = (1 << DIGIT_BIT) - a->dp[0];
-}
 
diff --git a/bn_mp_dr_setup.c b/bn_mp_dr_setup.c
new file mode 100644
index 0000000..62dba02
--- /dev/null
+++ b/bn_mp_dr_setup.c
@@ -0,0 +1,25 @@
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is library that provides for multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library is designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+#include <tommath.h>
+
+/* determines the setup value: stores 2^DIGIT_BIT - a->dp[0] into *d (only the lowest digit of "a" is read) */
+void mp_dr_setup(mp_int *a, mp_digit *d)
+{
+   /* the subtraction is carried out in mp_word; the casts are required if DIGIT_BIT
+    * is one less than the number of bits in a mp_digit [e.g. DIGIT_BIT==31]
+    */
+   *d = (mp_digit)((((mp_word)1) << ((mp_word)DIGIT_BIT)) - ((mp_word)a->dp[0]));
+}
+
diff --git a/bn_mp_expt_d.c b/bn_mp_expt_d.c
index 144ae07..1f76830 100644
--- a/bn_mp_expt_d.c
+++ b/bn_mp_expt_d.c
@@ -35,11 +35,11 @@ mp_expt_d (mp_int * a, mp_digit b, mp_int * c)
       return res;
     }
 
-    /* if the bit is set multiply */    
-    if ((b & (mp_digit) (1 << (DIGIT_BIT - 1))) != 0) {
+    /* if the bit is set multiply */
+    if ((b & (mp_digit) (((mp_digit)1) << (DIGIT_BIT - 1))) != 0) {
       if ((res = mp_mul (c, &g, c)) != MP_OKAY) {
-	mp_clear (&g);
-	return res;
+         mp_clear (&g);
+         return res;
       }
     }
 
diff --git a/bn_mp_exptmod.c b/bn_mp_exptmod.c
index b6635f5..573f760 100644
--- a/bn_mp_exptmod.c
+++ b/bn_mp_exptmod.c
@@ -17,7 +17,7 @@
 static int f_mp_exptmod (mp_int * G, mp_int * X, mp_int * P, mp_int * Y);
 
 /* this is a shell function that calls either the normal or Montgomery
- * exptmod functions.  Originally the call to the montgomery code was 
+ * exptmod functions.  Originally the call to the montgomery code was
  * embedded in the normal function but that wasted alot of stack space
  * for nothing (since 99% of the time the Montgomery code would be called)
  */
@@ -25,10 +25,46 @@ int
 mp_exptmod (mp_int * G, mp_int * X, mp_int * P, mp_int * Y)
 {
   int dr;
-  
+
+  /* modulus P must be positive */
+  if (P->sign == MP_NEG) {
+     return MP_VAL;
+  }
+
+  /* if exponent X is negative we have to recurse */
+  if (X->sign == MP_NEG) {
+     mp_int tmpG, tmpX;
+     int err;
+
+     /* first compute 1/G mod P */
+     if ((err = mp_init(&tmpG)) != MP_OKAY) {
+        return err;
+     }
+     if ((err = mp_invmod(G, P, &tmpG)) != MP_OKAY) {
+        mp_clear(&tmpG);
+        return err;
+     }
+
+     /* now get |X| */
+     if ((err = mp_init(&tmpX)) != MP_OKAY) {
+        mp_clear(&tmpG);
+        return err;
+     }
+     if ((err = mp_abs(X, &tmpX)) != MP_OKAY) {
+        mp_clear_multi(&tmpG, &tmpX, NULL);
+        return err;
+     }
+
+     /* and now compute (1/G)^|X| instead of G^X [X < 0] */
+     err = mp_exptmod(&tmpG, &tmpX, P, Y);
+     mp_clear_multi(&tmpG, &tmpX, NULL);
+     return err;
+  }
+
+
   dr = mp_dr_is_modulus(P);
   /* if the modulus is odd use the fast method */
-  if (((mp_isodd (P) == 1 && P->used < MONTGOMERY_EXPT_CUTOFF) || dr == 1) && P->used > 4) {
+  if ((mp_isodd (P) == 1 || dr == 1) && P->used > 4) {
     return mp_exptmod_fast (G, X, P, Y, dr);
   } else {
     return f_mp_exptmod (G, X, P, Y);
@@ -60,11 +96,17 @@ f_mp_exptmod (mp_int * G, mp_int * X, mp_int * P, mp_int * Y)
     winsize = 8;
   }
 
+#ifdef MP_LOW_MEM
+    if (winsize > 5) {
+       winsize = 5;
+    }
+#endif
+
   /* init G array */
   for (x = 0; x < (1 << winsize); x++) {
     if ((err = mp_init_size (&M[x], 1)) != MP_OKAY) {
       for (y = 0; y < x; y++) {
-	mp_clear (&M[y]);
+        mp_clear (&M[y]);
       }
       return err;
     }
@@ -78,7 +120,7 @@ f_mp_exptmod (mp_int * G, mp_int * X, mp_int * P, mp_int * Y)
     goto __MU;
   }
 
-  /* create M table 
+  /* create M table
    *
    * The M table contains powers of the input base, e.g. M[x] = G^x mod P
    *
@@ -119,30 +161,29 @@ f_mp_exptmod (mp_int * G, mp_int * X, mp_int * P, mp_int * Y)
   mp_set (&res, 1);
 
   /* set initial mode and bit cnt */
-  mode = 0;
-  bitcnt = 0;
-  buf = 0;
+  mode   = 0;
+  bitcnt = 1;
+  buf    = 0;
   digidx = X->used - 1;
   bitcpy = bitbuf = 0;
 
-  bitcnt = 1;
   for (;;) {
     /* grab next digit as required */
     if (--bitcnt == 0) {
       if (digidx == -1) {
-	break;
+        break;
       }
       buf = X->dp[digidx--];
       bitcnt = (int) DIGIT_BIT;
     }
 
     /* grab the next msb from the exponent */
-    y = (buf >> (DIGIT_BIT - 1)) & 1;
-    buf <<= 1;
+    y = (buf >> (mp_digit)(DIGIT_BIT - 1)) & 1;
+    buf <<= (mp_digit)1;
 
-    /* if the bit is zero and mode == 0 then we ignore it 
+    /* if the bit is zero and mode == 0 then we ignore it
      * These represent the leading zero bits before the first 1 bit
-     * in the exponent.  Technically this opt is not required but it 
+     * in the exponent.  Technically this opt is not required but it
      * does lower the # of trivial squaring/reductions used
      */
     if (mode == 0 && y == 0)
@@ -151,10 +192,10 @@ f_mp_exptmod (mp_int * G, mp_int * X, mp_int * P, mp_int * Y)
     /* if the bit is zero and mode == 1 then we square */
     if (mode == 1 && y == 0) {
       if ((err = mp_sqr (&res, &res)) != MP_OKAY) {
-	goto __RES;
+        goto __RES;
       }
       if ((err = mp_reduce (&res, P, &mu)) != MP_OKAY) {
-	goto __RES;
+        goto __RES;
       }
       continue;
     }
@@ -167,20 +208,20 @@ f_mp_exptmod (mp_int * G, mp_int * X, mp_int * P, mp_int * Y)
       /* ok window is filled so square as required and multiply  */
       /* square first */
       for (x = 0; x < winsize; x++) {
-	if ((err = mp_sqr (&res, &res)) != MP_OKAY) {
-	  goto __RES;
-	}
-	if ((err = mp_reduce (&res, P, &mu)) != MP_OKAY) {
-	  goto __RES;
-	}
+        if ((err = mp_sqr (&res, &res)) != MP_OKAY) {
+          goto __RES;
+        }
+        if ((err = mp_reduce (&res, P, &mu)) != MP_OKAY) {
+          goto __RES;
+        }
       }
 
       /* then multiply */
       if ((err = mp_mul (&res, &M[bitbuf], &res)) != MP_OKAY) {
-	goto __MU;
+        goto __MU;
       }
       if ((err = mp_reduce (&res, P, &mu)) != MP_OKAY) {
-	goto __MU;
+        goto __MU;
       }
 
       /* empty window and reset */
@@ -194,21 +235,21 @@ f_mp_exptmod (mp_int * G, mp_int * X, mp_int * P, mp_int * Y)
     /* square then multiply if the bit is set */
     for (x = 0; x < bitcpy; x++) {
       if ((err = mp_sqr (&res, &res)) != MP_OKAY) {
-	goto __RES;
+        goto __RES;
       }
       if ((err = mp_reduce (&res, P, &mu)) != MP_OKAY) {
-	goto __RES;
+        goto __RES;
       }
 
       bitbuf <<= 1;
       if ((bitbuf & (1 << winsize)) != 0) {
-	/* then multiply */
-	if ((err = mp_mul (&res, &M[1], &res)) != MP_OKAY) {
-	  goto __RES;
-	}
-	if ((err = mp_reduce (&res, P, &mu)) != MP_OKAY) {
-	  goto __RES;
-	}
+        /* then multiply */
+        if ((err = mp_mul (&res, &M[1], &res)) != MP_OKAY) {
+          goto __RES;
+        }
+        if ((err = mp_reduce (&res, P, &mu)) != MP_OKAY) {
+          goto __RES;
+        }
       }
     }
   }
diff --git a/bn_mp_exptmod_fast.c b/bn_mp_exptmod_fast.c
index 0906f27..7edf736 100644
--- a/bn_mp_exptmod_fast.c
+++ b/bn_mp_exptmod_fast.c
@@ -19,7 +19,7 @@
  * Uses a left-to-right k-ary sliding window to compute the modular exponentiation.
  * The value of k changes based on the size of the exponent.
  *
- * Uses Montgomery or Diminished Radix reduction [whichever appropriate] 
+ * Uses Montgomery or Diminished Radix reduction [whichever appropriate]
  */
 int
 mp_exptmod_fast (mp_int * G, mp_int * X, mp_int * P, mp_int * Y, int redmode)
@@ -28,7 +28,7 @@ mp_exptmod_fast (mp_int * G, mp_int * X, mp_int * P, mp_int * Y, int redmode)
   mp_digit buf, mp;
   int     err, bitbuf, bitcpy, bitcnt, mode, digidx, x, y, winsize;
   int     (*redux)(mp_int*,mp_int*,mp_digit);
-  
+
   /* find window size */
   x = mp_count_bits (X);
   if (x <= 7) {
@@ -47,22 +47,37 @@ mp_exptmod_fast (mp_int * G, mp_int * X, mp_int * P, mp_int * Y, int redmode)
     winsize = 8;
   }
 
+#ifdef MP_LOW_MEM
+  if (winsize > 5) {
+     winsize = 5;
+  }
+#endif
+
+
   /* init G array */
   for (x = 0; x < (1 << winsize); x++) {
     if ((err = mp_init (&M[x])) != MP_OKAY) {
       for (y = 0; y < x; y++) {
-	mp_clear (&M[y]);
+        mp_clear (&M[y]);
       }
       return err;
     }
   }
-  
+
   if (redmode == 0) {
      /* now setup montgomery  */
      if ((err = mp_montgomery_setup (P, &mp)) != MP_OKAY) {
         goto __M;
      }
-     redux = mp_montgomery_reduce;
+     
+     /* automatically pick the comba one if available (saves quite a few calls/ifs) */
+     if ( ((P->used * 2 + 1) < MP_WARRAY) &&
+          P->used < (1 << ((CHAR_BIT * sizeof (mp_word)) - (2 * DIGIT_BIT)))) {
+        redux = fast_mp_montgomery_reduce;
+     } else {
+        /* use slower baseline method */
+        redux = mp_montgomery_reduce;
+     }
   } else {
      /* setup DR reduction */
      mp_dr_setup(P, &mp);
@@ -97,7 +112,7 @@ mp_exptmod_fast (mp_int * G, mp_int * X, mp_int * P, mp_int * Y, int redmode)
         goto __RES;
      }
   }
-  
+
   /* compute the value at M[1<<(winsize-1)] by squaring M[1] (winsize-1) times */
   if ((err = mp_copy (&M[1], &M[1 << (winsize - 1)])) != MP_OKAY) {
     goto __RES;
@@ -123,42 +138,42 @@ mp_exptmod_fast (mp_int * G, mp_int * X, mp_int * P, mp_int * Y, int redmode)
   }
 
   /* set initial mode and bit cnt */
-  mode = 0;
-  bitcnt = 0;
-  buf = 0;
+  mode   = 0;
+  bitcnt = 1;
+  buf    = 0;
   digidx = X->used - 1;
   bitcpy = bitbuf = 0;
 
-  bitcnt = 1;
   for (;;) {
     /* grab next digit as required */
     if (--bitcnt == 0) {
       if (digidx == -1) {
-	break;
+        break;
       }
       buf = X->dp[digidx--];
       bitcnt = (int) DIGIT_BIT;
     }
 
     /* grab the next msb from the exponent */
-    y = (buf >> (DIGIT_BIT - 1)) & 1;
-    buf <<= 1;
+    y = (mp_digit)(buf >> (DIGIT_BIT - 1)) & 1;
+    buf <<= (mp_digit)1;
 
     /* if the bit is zero and mode == 0 then we ignore it
      * These represent the leading zero bits before the first 1 bit
      * in the exponent.  Technically this opt is not required but it
      * does lower the # of trivial squaring/reductions used
      */
-    if (mode == 0 && y == 0)
+    if (mode == 0 && y == 0) {
       continue;
+    }
 
     /* if the bit is zero and mode == 1 then we square */
     if (mode == 1 && y == 0) {
       if ((err = mp_sqr (&res, &res)) != MP_OKAY) {
-	goto __RES;
+        goto __RES;
       }
       if ((err = redux (&res, P, mp)) != MP_OKAY) {
-	goto __RES;
+        goto __RES;
       }
       continue;
     }
@@ -171,20 +186,20 @@ mp_exptmod_fast (mp_int * G, mp_int * X, mp_int * P, mp_int * Y, int redmode)
       /* ok window is filled so square as required and multiply  */
       /* square first */
       for (x = 0; x < winsize; x++) {
-	if ((err = mp_sqr (&res, &res)) != MP_OKAY) {
-	  goto __RES;
-	}
-	if ((err = redux (&res, P, mp)) != MP_OKAY) {
-	  goto __RES;
-	}
+        if ((err = mp_sqr (&res, &res)) != MP_OKAY) {
+          goto __RES;
+        }
+        if ((err = redux (&res, P, mp)) != MP_OKAY) {
+          goto __RES;
+        }
       }
 
       /* then multiply */
       if ((err = mp_mul (&res, &M[bitbuf], &res)) != MP_OKAY) {
-	goto __RES;
+        goto __RES;
       }
       if ((err = redux (&res, P, mp)) != MP_OKAY) {
-	goto __RES;
+        goto __RES;
       }
 
       /* empty window and reset */
@@ -198,21 +213,21 @@ mp_exptmod_fast (mp_int * G, mp_int * X, mp_int * P, mp_int * Y, int redmode)
     /* square then multiply if the bit is set */
     for (x = 0; x < bitcpy; x++) {
       if ((err = mp_sqr (&res, &res)) != MP_OKAY) {
-	goto __RES;
+        goto __RES;
       }
       if ((err = redux (&res, P, mp)) != MP_OKAY) {
-	goto __RES;
+        goto __RES;
       }
 
       bitbuf <<= 1;
       if ((bitbuf & (1 << winsize)) != 0) {
-	/* then multiply */
-	if ((err = mp_mul (&res, &M[1], &res)) != MP_OKAY) {
-	  goto __RES;
-	}
-	if ((err = redux (&res, P, mp)) != MP_OKAY) {
-	  goto __RES;
-	}
+        /* then multiply */
+        if ((err = mp_mul (&res, &M[1], &res)) != MP_OKAY) {
+          goto __RES;
+        }
+        if ((err = redux (&res, P, mp)) != MP_OKAY) {
+          goto __RES;
+        }
       }
     }
   }
@@ -222,7 +237,7 @@ mp_exptmod_fast (mp_int * G, mp_int * X, mp_int * P, mp_int * Y, int redmode)
      if ((err = mp_montgomery_reduce (&res, P, mp)) != MP_OKAY) {
        goto __RES;
      }
-  }     
+  }
 
   mp_exch (&res, Y);
   err = MP_OKAY;
diff --git a/bn_mp_gcd.c b/bn_mp_gcd.c
index d7cc1d4..1c930c7 100644
--- a/bn_mp_gcd.c
+++ b/bn_mp_gcd.c
@@ -82,18 +82,18 @@ mp_gcd (mp_int * a, mp_int * b, mp_int * c)
     /* B3 (and B4).  Halve t, if even */
     while (t.used != 0 && mp_iseven(&t) == 1) {
       if ((res = mp_div_2 (&t, &t)) != MP_OKAY) {
-	goto __T;
+        goto __T;
       }
     }
 
     /* B5.  if t>0 then u=t otherwise v=-t */
     if (t.used != 0 && t.sign != MP_NEG) {
       if ((res = mp_copy (&t, &u)) != MP_OKAY) {
-	goto __T;
+        goto __T;
       }
     } else {
       if ((res = mp_copy (&t, &v)) != MP_OKAY) {
-	goto __T;
+        goto __T;
       }
       v.sign = (v.sign == MP_ZPOS) ? MP_NEG : MP_ZPOS;
     }
@@ -102,9 +102,9 @@ mp_gcd (mp_int * a, mp_int * b, mp_int * c)
     if ((res = mp_sub (&u, &v, &t)) != MP_OKAY) {
       goto __T;
     }
-  }
-  while (t.used != 0);
+  } while (mp_iszero(&t) == 0);
 
+  /* multiply by 2^k which we divided out at the beginning */ 
   if ((res = mp_mul_2d (&u, k, &u)) != MP_OKAY) {
     goto __T;
   }
diff --git a/bn_mp_grow.c b/bn_mp_grow.c
index 9bd5118..2a8249c 100644
--- a/bn_mp_grow.c
+++ b/bn_mp_grow.c
@@ -18,12 +18,12 @@
 int
 mp_grow (mp_int * a, int size)
 {
-  int     i, n;
+  int     i;
 
   /* if the alloc size is smaller alloc more ram */
   if (a->alloc < size) {
     /* ensure there are always at least MP_PREC digits extra on top */
-    size += (MP_PREC * 2) - (size & (MP_PREC - 1));	
+    size += (MP_PREC * 2) - (size & (MP_PREC - 1));     
 
     a->dp = OPT_CAST realloc (a->dp, sizeof (mp_digit) * size);
     if (a->dp == NULL) {
@@ -31,9 +31,9 @@ mp_grow (mp_int * a, int size)
     }
 
     /* zero excess digits */
-    n = a->alloc;
+    i        = a->alloc;
     a->alloc = size;
-    for (i = n; i < a->alloc; i++) {
+    for (; i < a->alloc; i++) {
       a->dp[i] = 0;
     }
   }
diff --git a/bn_mp_init.c b/bn_mp_init.c
index b96e6d9..3af7499 100644
--- a/bn_mp_init.c
+++ b/bn_mp_init.c
@@ -18,7 +18,6 @@
 int
 mp_init (mp_int * a)
 {
-
   /* allocate ram required and clear it */
   a->dp = OPT_CAST calloc (sizeof (mp_digit), MP_PREC);
   if (a->dp == NULL) {
diff --git a/bn_mp_invmod.c b/bn_mp_invmod.c
index 4e2c1f7..36ce092 100644
--- a/bn_mp_invmod.c
+++ b/bn_mp_invmod.c
@@ -29,63 +29,36 @@ mp_invmod (mp_int * a, mp_int * b, mp_int * c)
   if (mp_iseven (b) == 0) {
     return fast_mp_invmod (a, b, c);
   }
-
-  if ((res = mp_init (&x)) != MP_OKAY) {
-    goto __ERR;
-  }
-
-  if ((res = mp_init (&y)) != MP_OKAY) {
-    goto __X;
-  }
-
-  if ((res = mp_init (&u)) != MP_OKAY) {
-    goto __Y;
-  }
-
-  if ((res = mp_init (&v)) != MP_OKAY) {
-    goto __U;
-  }
-
-  if ((res = mp_init (&A)) != MP_OKAY) {
-    goto __V;
-  }
-
-  if ((res = mp_init (&B)) != MP_OKAY) {
-    goto __A;
-  }
-
-  if ((res = mp_init (&C)) != MP_OKAY) {
-    goto __B;
-  }
-
-  if ((res = mp_init (&D)) != MP_OKAY) {
-    goto __C;
+  
+  /* init temps */
+  if ((res = mp_init_multi(&x, &y, &u, &v, &A, &B, &C, &D, NULL)) != MP_OKAY) {
+     return res;
   }
 
   /* x = a, y = b */
   if ((res = mp_copy (a, &x)) != MP_OKAY) {
-    goto __D;
+    goto __ERR;
   }
   if ((res = mp_copy (b, &y)) != MP_OKAY) {
-    goto __D;
+    goto __ERR;
   }
 
   if ((res = mp_abs (&x, &x)) != MP_OKAY) {
-    goto __D;
+    goto __ERR;
   }
 
   /* 2. [modified] if x,y are both even then return an error! */
   if (mp_iseven (&x) == 1 && mp_iseven (&y) == 1) {
     res = MP_VAL;
-    goto __D;
+    goto __ERR;
   }
 
   /* 3. u=x, v=y, A=1, B=0, C=0,D=1 */
   if ((res = mp_copy (&x, &u)) != MP_OKAY) {
-    goto __D;
+    goto __ERR;
   }
   if ((res = mp_copy (&y, &v)) != MP_OKAY) {
-    goto __D;
+    goto __ERR;
   }
   mp_set (&A, 1);
   mp_set (&D, 1);
@@ -96,24 +69,24 @@ top:
   while (mp_iseven (&u) == 1) {
     /* 4.1 u = u/2 */
     if ((res = mp_div_2 (&u, &u)) != MP_OKAY) {
-      goto __D;
+      goto __ERR;
     }
     /* 4.2 if A or B is odd then */
     if (mp_iseven (&A) == 0 || mp_iseven (&B) == 0) {
       /* A = (A+y)/2, B = (B-x)/2 */
       if ((res = mp_add (&A, &y, &A)) != MP_OKAY) {
-	goto __D;
+	goto __ERR;
       }
       if ((res = mp_sub (&B, &x, &B)) != MP_OKAY) {
-	goto __D;
+	goto __ERR;
       }
     }
     /* A = A/2, B = B/2 */
     if ((res = mp_div_2 (&A, &A)) != MP_OKAY) {
-      goto __D;
+      goto __ERR;
     }
     if ((res = mp_div_2 (&B, &B)) != MP_OKAY) {
-      goto __D;
+      goto __ERR;
     }
   }
 
@@ -122,24 +95,24 @@ top:
   while (mp_iseven (&v) == 1) {
     /* 5.1 v = v/2 */
     if ((res = mp_div_2 (&v, &v)) != MP_OKAY) {
-      goto __D;
+      goto __ERR;
     }
     /* 5.2 if C,D are even then */
     if (mp_iseven (&C) == 0 || mp_iseven (&D) == 0) {
       /* C = (C+y)/2, D = (D-x)/2 */
       if ((res = mp_add (&C, &y, &C)) != MP_OKAY) {
-	goto __D;
+	goto __ERR;
       }
       if ((res = mp_sub (&D, &x, &D)) != MP_OKAY) {
-	goto __D;
+	goto __ERR;
       }
     }
     /* C = C/2, D = D/2 */
     if ((res = mp_div_2 (&C, &C)) != MP_OKAY) {
-      goto __D;
+      goto __ERR;
     }
     if ((res = mp_div_2 (&D, &D)) != MP_OKAY) {
-      goto __D;
+      goto __ERR;
     }
   }
 
@@ -147,28 +120,28 @@ top:
   if (mp_cmp (&u, &v) != MP_LT) {
     /* u = u - v, A = A - C, B = B - D */
     if ((res = mp_sub (&u, &v, &u)) != MP_OKAY) {
-      goto __D;
+      goto __ERR;
     }
 
     if ((res = mp_sub (&A, &C, &A)) != MP_OKAY) {
-      goto __D;
+      goto __ERR;
     }
 
     if ((res = mp_sub (&B, &D, &B)) != MP_OKAY) {
-      goto __D;
+      goto __ERR;
     }
   } else {
     /* v - v - u, C = C - A, D = D - B */
     if ((res = mp_sub (&v, &u, &v)) != MP_OKAY) {
-      goto __D;
+      goto __ERR;
     }
 
     if ((res = mp_sub (&C, &A, &C)) != MP_OKAY) {
-      goto __D;
+      goto __ERR;
     }
 
     if ((res = mp_sub (&D, &B, &D)) != MP_OKAY) {
-      goto __D;
+      goto __ERR;
     }
   }
 
@@ -181,21 +154,13 @@ top:
   /* if v != 1 then there is no inverse */
   if (mp_cmp_d (&v, 1) != MP_EQ) {
     res = MP_VAL;
-    goto __D;
+    goto __ERR;
   }
 
   /* a is now the inverse */
   mp_exch (&C, c);
   res = MP_OKAY;
 
-__D:mp_clear (&D);
-__C:mp_clear (&C);
-__B:mp_clear (&B);
-__A:mp_clear (&A);
-__V:mp_clear (&v);
-__U:mp_clear (&u);
-__Y:mp_clear (&y);
-__X:mp_clear (&x);
-__ERR:
+__ERR:mp_clear_multi (&x, &y, &u, &v, &A, &B, &C, &D, NULL);
   return res;
 }
diff --git a/bn_mp_jacobi.c b/bn_mp_jacobi.c
index bfe7bfc..1a7573d 100644
--- a/bn_mp_jacobi.c
+++ b/bn_mp_jacobi.c
@@ -14,7 +14,7 @@
  */
 #include <tommath.h>
 
-/* computes the jacobi c = (a | n) (or Legendre if b is prime)
+/* computes the jacobi c = (a | n) (or Legendre if n is prime)
  * HAC pp. 73 Algorithm 2.149
  */
 int
diff --git a/bn_mp_karatsuba_mul.c b/bn_mp_karatsuba_mul.c
index 79358fb..f720a11 100644
--- a/bn_mp_karatsuba_mul.c
+++ b/bn_mp_karatsuba_mul.c
@@ -36,7 +36,7 @@
 int
 mp_karatsuba_mul (mp_int * a, mp_int * b, mp_int * c)
 {
-  mp_int  x0, x1, y0, y1, t1, t2, x0y0, x1y1;
+  mp_int  x0, x1, y0, y1, t1, x0y0, x1y1;
   int     B, err;
 
   err = MP_MEM;
@@ -60,10 +60,8 @@ mp_karatsuba_mul (mp_int * a, mp_int * b, mp_int * c)
   /* init temps */
   if (mp_init_size (&t1, B * 2) != MP_OKAY)
     goto Y1;
-  if (mp_init_size (&t2, B * 2) != MP_OKAY)
-    goto T1;
   if (mp_init_size (&x0y0, B * 2) != MP_OKAY)
-    goto T2;
+    goto T1;
   if (mp_init_size (&x1y1, B * 2) != MP_OKAY)
     goto X0Y0;
 
@@ -110,41 +108,40 @@ mp_karatsuba_mul (mp_int * a, mp_int * b, mp_int * c)
   mp_clamp (&y0);
 
   /* now calc the products x0y0 and x1y1 */
-  if (mp_mul (&x0, &y0, &x0y0) != MP_OKAY)
-    goto X1Y1;			/* x0y0 = x0*y0 */
+  if (mp_mul (&x0, &y0, &x0y0) != MP_OKAY)  /* after this x0 is no longer required, free temp [x0==t2]! */
+    goto X1Y1;          /* x0y0 = x0*y0 */
   if (mp_mul (&x1, &y1, &x1y1) != MP_OKAY)
-    goto X1Y1;			/* x1y1 = x1*y1 */
+    goto X1Y1;          /* x1y1 = x1*y1 */
 
   /* now calc x1-x0 and y1-y0 */
   if (mp_sub (&x1, &x0, &t1) != MP_OKAY)
-    goto X1Y1;			/* t1 = x1 - x0 */
-  if (mp_sub (&y1, &y0, &t2) != MP_OKAY)
-    goto X1Y1;			/* t2 = y1 - y0 */
-  if (mp_mul (&t1, &t2, &t1) != MP_OKAY)
-    goto X1Y1;			/* t1 = (x1 - x0) * (y1 - y0) */
+    goto X1Y1;          /* t1 = x1 - x0 */
+  if (mp_sub (&y1, &y0, &x0) != MP_OKAY)
+    goto X1Y1;          /* t2 = y1 - y0 */
+  if (mp_mul (&t1, &x0, &t1) != MP_OKAY)
+    goto X1Y1;          /* t1 = (x1 - x0) * (y1 - y0) */
 
   /* add x0y0 */
-  if (mp_add (&x0y0, &x1y1, &t2) != MP_OKAY)
-    goto X1Y1;			/* t2 = x0y0 + x1y1 */
-  if (mp_sub (&t2, &t1, &t1) != MP_OKAY)
-    goto X1Y1;			/* t1 = x0y0 + x1y1 - (x1-x0)*(y1-y0) */
+  if (mp_add (&x0y0, &x1y1, &x0) != MP_OKAY)
+    goto X1Y1;          /* t2 = x0y0 + x1y1 */
+  if (mp_sub (&x0, &t1, &t1) != MP_OKAY)
+    goto X1Y1;          /* t1 = x0y0 + x1y1 - (x1-x0)*(y1-y0) */
 
   /* shift by B */
   if (mp_lshd (&t1, B) != MP_OKAY)
-    goto X1Y1;			/* t1 = (x0y0 + x1y1 - (x1-x0)*(y1-y0))<<B */
+    goto X1Y1;          /* t1 = (x0y0 + x1y1 - (x1-x0)*(y1-y0))<<B */
   if (mp_lshd (&x1y1, B * 2) != MP_OKAY)
-    goto X1Y1;			/* x1y1 = x1y1 << 2*B */
+    goto X1Y1;          /* x1y1 = x1y1 << 2*B */
 
   if (mp_add (&x0y0, &t1, &t1) != MP_OKAY)
-    goto X1Y1;			/* t1 = x0y0 + t1 */
+    goto X1Y1;          /* t1 = x0y0 + t1 */
   if (mp_add (&t1, &x1y1, c) != MP_OKAY)
-    goto X1Y1;			/* t1 = x0y0 + t1 + x1y1 */
+    goto X1Y1;          /* t1 = x0y0 + t1 + x1y1 */
 
   err = MP_OKAY;
 
 X1Y1:mp_clear (&x1y1);
 X0Y0:mp_clear (&x0y0);
-T2:mp_clear (&t2);
 T1:mp_clear (&t1);
 Y1:mp_clear (&y1);
 Y0:mp_clear (&y0);
diff --git a/bn_mp_karatsuba_sqr.c b/bn_mp_karatsuba_sqr.c
index 241b392..c3da38a 100644
--- a/bn_mp_karatsuba_sqr.c
+++ b/bn_mp_karatsuba_sqr.c
@@ -74,32 +74,32 @@ mp_karatsuba_sqr (mp_int * a, mp_int * b)
 
   /* now calc the products x0*x0 and x1*x1 */
   if (mp_sqr (&x0, &x0x0) != MP_OKAY)
-    goto X1X1;			/* x0x0 = x0*x0 */
+    goto X1X1;                  /* x0x0 = x0*x0 */
   if (mp_sqr (&x1, &x1x1) != MP_OKAY)
-    goto X1X1;			/* x1x1 = x1*x1 */
+    goto X1X1;                  /* x1x1 = x1*x1 */
 
-  /* now calc x1-x0 and y1-y0 */
+  /* now calc (x1-x0)^2 */
   if (mp_sub (&x1, &x0, &t1) != MP_OKAY)
-    goto X1X1;			/* t1 = x1 - x0 */
+    goto X1X1;                  /* t1 = x1 - x0 */
   if (mp_sqr (&t1, &t1) != MP_OKAY)
-    goto X1X1;			/* t1 = (x1 - x0) * (y1 - y0) */
+    goto X1X1;                  /* t1 = (x1 - x0) * (x1 - x0) */
 
   /* add x0y0 */
   if (s_mp_add (&x0x0, &x1x1, &t2) != MP_OKAY)
-    goto X1X1;			/* t2 = x0y0 + x1y1 */
+    goto X1X1;                  /* t2 = x0y0 + x1y1 */
   if (mp_sub (&t2, &t1, &t1) != MP_OKAY)
-    goto X1X1;			/* t1 = x0y0 + x1y1 - (x1-x0)*(y1-y0) */
+    goto X1X1;                  /* t1 = x0y0 + x1y1 - (x1-x0)*(y1-y0) */
 
   /* shift by B */
   if (mp_lshd (&t1, B) != MP_OKAY)
-    goto X1X1;			/* t1 = (x0y0 + x1y1 - (x1-x0)*(y1-y0))<<B */
+    goto X1X1;                  /* t1 = (x0y0 + x1y1 - (x1-x0)*(y1-y0))<<B */
   if (mp_lshd (&x1x1, B * 2) != MP_OKAY)
-    goto X1X1;			/* x1y1 = x1y1 << 2*B */
+    goto X1X1;                  /* x1y1 = x1y1 << 2*B */
 
   if (mp_add (&x0x0, &t1, &t1) != MP_OKAY)
-    goto X1X1;			/* t1 = x0y0 + t1 */
+    goto X1X1;                  /* t1 = x0y0 + t1 */
   if (mp_add (&t1, &x1x1, b) != MP_OKAY)
-    goto X1X1;			/* t1 = x0y0 + t1 + x1y1 */
+    goto X1X1;                  /* t1 = x0y0 + t1 + x1y1 */
 
   err = MP_OKAY;
 
diff --git a/bn_mp_lshd.c b/bn_mp_lshd.c
index 600afda..87a376b 100644
--- a/bn_mp_lshd.c
+++ b/bn_mp_lshd.c
@@ -20,15 +20,16 @@ mp_lshd (mp_int * a, int b)
 {
   int     x, res;
 
-
   /* if its less than zero return */
   if (b <= 0) {
     return MP_OKAY;
   }
 
   /* grow to fit the new digits */
-  if ((res = mp_grow (a, a->used + b)) != MP_OKAY) {
-    return res;
+  if (a->alloc < a->used + b) {
+     if ((res = mp_grow (a, a->used + b)) != MP_OKAY) {
+       return res;
+     }
   }
 
   {
diff --git a/bn_mp_montgomery_calc_normalization.c b/bn_mp_montgomery_calc_normalization.c
index b942eba..a1ff2cd 100644
--- a/bn_mp_montgomery_calc_normalization.c
+++ b/bn_mp_montgomery_calc_normalization.c
@@ -15,10 +15,10 @@
 #include <tommath.h>
 
 /* calculates a = B^n mod b for Montgomery reduction
- * Where B is the base [e.g. 2^DIGIT_BIT].  
+ * Where B is the base [e.g. 2^DIGIT_BIT].
  * B^n mod b is computed by first computing
  * A = B^(n-1) which doesn't require a reduction but a simple OR.
- * then C = A * B = B^n is computed by performing upto DIGIT_BIT 
+ * then C = A * B = B^n is computed by performing upto DIGIT_BIT
  * shifts with subtractions when the result is greater than b.
  *
  * The method is slightly modified to shift B unconditionally upto just under
@@ -38,13 +38,13 @@ mp_montgomery_calc_normalization (mp_int * a, mp_int * b)
   }
 
   /* now compute C = A * B mod b */
-  for (x = bits - 1; x < DIGIT_BIT; x++) {
+  for (x = bits - 1; x < (int)DIGIT_BIT; x++) {
     if ((res = mp_mul_2 (a, a)) != MP_OKAY) {
       return res;
     }
     if (mp_cmp_mag (a, b) != MP_LT) {
       if ((res = s_mp_sub (a, b, a)) != MP_OKAY) {
-	return res;
+        return res;
       }
     }
   }
diff --git a/bn_mp_montgomery_reduce.c b/bn_mp_montgomery_reduce.c
index e64435c..69a5364 100644
--- a/bn_mp_montgomery_reduce.c
+++ b/bn_mp_montgomery_reduce.c
@@ -21,12 +21,19 @@ mp_montgomery_reduce (mp_int * a, mp_int * m, mp_digit mp)
   int     ix, res, digs;
   mp_digit ui;
 
+  /* can the fast reduction [comba] method be used?
+   *
+   * Note that unlike in mp_mul you're safely allowed *less*
+   * than the available columns [255 per default] since carries
+   * are fixed up in the inner loop.
+   */
   digs = m->used * 2 + 1;
-  if ((digs < 512)
-      && digs < (1 << ((CHAR_BIT * sizeof (mp_word)) - (2 * DIGIT_BIT)))) {
+  if ((digs < MP_WARRAY)
+      && m->used < (1 << ((CHAR_BIT * sizeof (mp_word)) - (2 * DIGIT_BIT)))) {
     return fast_mp_montgomery_reduce (a, m, mp);
   }
 
+  /* grow the input as required */
   if (a->alloc < m->used * 2 + 1) {
     if ((res = mp_grow (a, m->used * 2 + 1)) != MP_OKAY) {
       return res;
@@ -50,15 +57,15 @@ mp_montgomery_reduce (mp_int * a, mp_int * m, mp_digit mp)
 
       mu = 0;
       for (iy = 0; iy < m->used; iy++) {
-	r = ((mp_word) ui) * ((mp_word) * tmpx++) + ((mp_word) mu) + ((mp_word) * tmpy);
-	mu = (r >> ((mp_word) DIGIT_BIT));
-	*tmpy++ = (r & ((mp_word) MP_MASK));
+        r = ((mp_word) ui) * ((mp_word) * tmpx++) + ((mp_word) mu) + ((mp_word) * tmpy);
+        mu = (r >> ((mp_word) DIGIT_BIT));
+        *tmpy++ = (r & ((mp_word) MP_MASK));
       }
       /* propagate carries */
       while (mu) {
-	*tmpy += mu;
-	mu = (*tmpy >> DIGIT_BIT) & 1;
-	*tmpy++ &= MP_MASK;
+        *tmpy += mu;
+        mu = (*tmpy >> DIGIT_BIT) & 1;
+        *tmpy++ &= MP_MASK;
       }
     }
   }
diff --git a/bn_mp_montgomery_setup.c b/bn_mp_montgomery_setup.c
index dfdc51a..e59fab6 100644
--- a/bn_mp_montgomery_setup.c
+++ b/bn_mp_montgomery_setup.c
@@ -18,11 +18,11 @@
 int
 mp_montgomery_setup (mp_int * a, mp_digit * mp)
 {
-  unsigned long x, b;
+  mp_digit x, b;
 
-/* fast inversion mod 2^32 
+/* fast inversion mod 2^k
  *
- * Based on the fact that 
+ * Based on the fact that
  *
  * XA = 1 (mod 2^n)  =>  (X(2-XA)) A = 1 (mod 2^2n)
  *                   =>  2*X*A - X*X*A*A = 1
@@ -34,13 +34,20 @@ mp_montgomery_setup (mp_int * a, mp_digit * mp)
     return MP_VAL;
   }
 
-  x = (((b + 2) & 4) << 1) + b;	/* here x*a==1 mod 2^4 */
-  x *= 2 - b * x;		/* here x*a==1 mod 2^8 */
-  x *= 2 - b * x;		/* here x*a==1 mod 2^16; each step doubles the nb of bits */
-  x *= 2 - b * x;		/* here x*a==1 mod 2^32 */
+  x = (((b + 2) & 4) << 1) + b; /* here x*a==1 mod 2^4 */
+  x *= 2 - b * x;               /* here x*a==1 mod 2^8 */
+#if !defined(MP_8BIT)
+  x *= 2 - b * x;               /* here x*a==1 mod 2^16; each step doubles the nb of bits */
+#endif
+#if defined(MP_64BIT) || !(defined(MP_8BIT) || defined(MP_16BIT))
+  x *= 2 - b * x;               /* here x*a==1 mod 2^32 */
+#endif
+#ifdef MP_64BIT
+  x *= 2 - b * x;               /* here x*a==1 mod 2^64 */
+#endif
 
   /* t = -1/m mod b */
-  *mp = ((mp_digit) 1 << ((mp_digit) DIGIT_BIT)) - (x & MP_MASK);
+  *mp = (((mp_digit) 1 << ((mp_digit) DIGIT_BIT)) - x) & MP_MASK;
 
   return MP_OKAY;
 }
diff --git a/bn_mp_mul.c b/bn_mp_mul.c
index 5ccd6e4..258cb84 100644
--- a/bn_mp_mul.c
+++ b/bn_mp_mul.c
@@ -24,15 +24,15 @@ mp_mul (mp_int * a, mp_int * b, mp_int * c)
     res = mp_karatsuba_mul (a, b, c);
   } else {
 
-    /* can we use the fast multiplier? 
+    /* can we use the fast multiplier?
      *
-     * The fast multiplier can be used if the output will have less than 
-     * 512 digits and the number of digits won't affect carry propagation
+     * The fast multiplier can be used if the output will have less than
+     * MP_WARRAY digits and the number of digits won't affect carry propagation
      */
     int     digs = a->used + b->used + 1;
 
-    if ((digs < 512)
-	&& digs < (1 << ((CHAR_BIT * sizeof (mp_word)) - (2 * DIGIT_BIT)))) {
+    if ((digs < MP_WARRAY)
+        && MIN(a->used, b->used) <= (1 << ((CHAR_BIT * sizeof (mp_word)) - (2 * DIGIT_BIT)))) {
       res = fast_s_mp_mul_digs (a, b, c, digs);
     } else {
       res = s_mp_mul (a, b, c);
diff --git a/bn_mp_mul_2.c b/bn_mp_mul_2.c
index fd8db1f..2bfc939 100644
--- a/bn_mp_mul_2.c
+++ b/bn_mp_mul_2.c
@@ -20,10 +20,9 @@ mp_mul_2 (mp_int * a, mp_int * b)
 {
   int     x, res, oldused;
 
-  /* Optimization: should copy and shift at the same time */
-
-  if (b->alloc < a->used) {
-    if ((res = mp_grow (b, a->used)) != MP_OKAY) {
+  /* grow to accommodate result */
+  if (b->alloc < a->used + 1) {
+    if ((res = mp_grow (b, a->used + 1)) != MP_OKAY) {
       return res;
     }
   }
@@ -31,7 +30,6 @@ mp_mul_2 (mp_int * a, mp_int * b)
   oldused = b->used;
   b->used = a->used;
 
-  /* shift any bit count < DIGIT_BIT */
   {
     register mp_digit r, rr, *tmpa, *tmpb;
 
@@ -43,37 +41,32 @@ mp_mul_2 (mp_int * a, mp_int * b)
 
     /* carry */
     r = 0;
-    for (x = 0; x < b->used; x++) {
+    for (x = 0; x < a->used; x++) {
     
-      /* get what will be the *next* carry bit from the MSB of the current digit */
-      rr = *tmpa >> (DIGIT_BIT - 1);
+      /* get what will be the *next* carry bit from the 
+       * MSB of the current digit 
+       */
+      rr = *tmpa >> ((mp_digit)(DIGIT_BIT - 1));
       
       /* now shift up this digit, add in the carry [from the previous] */
-      *tmpb++ = ((*tmpa++ << 1) | r) & MP_MASK;
+      *tmpb++ = ((*tmpa++ << ((mp_digit)1)) | r) & MP_MASK;
       
-      /* copy the carry that would be from the source digit into the next iteration */
+      /* copy the carry that would be from the source 
+       * digit into the next iteration 
+       */
       r = rr;
     }
 
     /* new leading digit? */
     if (r != 0) {
-      /* do we have to grow to accomodate the new digit? */
-      if (b->alloc == b->used) {
-	if ((res = mp_grow (b, b->used + 1)) != MP_OKAY) {
-	  return res;
-	}
-
-	/* after the grow *tmpb is no longer valid so we have to reset it! 
-	 * (this bug took me about 17 minutes to find...!)
-	 */
-	tmpb = b->dp + b->used;
-      }
       /* add a MSB which is always 1 at this point */
       *tmpb = 1;
       ++b->used;
     }
 
-    /* now zero any excess digits on the destination that we didn't write to */
+    /* now zero any excess digits on the destination 
+     * that we didn't write to 
+     */
     tmpb = b->dp + b->used;
     for (x = b->used; x < oldused; x++) {
       *tmpb++ = 0;
diff --git a/bn_mp_mul_2d.c b/bn_mp_mul_2d.c
index 137df30..ded3a3c 100644
--- a/bn_mp_mul_2d.c
+++ b/bn_mp_mul_2d.c
@@ -14,24 +14,34 @@
  */
 #include <tommath.h>
 
+/* NOTE:  This routine requires updating.  For instance the c->used = c->alloc bit
+   is wrong.  We should just shift c->used digits then set the carry as c->dp[c->used] = carry
+ 
+   To be fixed for LTM 0.18
+ */
+
 /* shift left by a certain bit count */
 int
 mp_mul_2d (mp_int * a, int b, mp_int * c)
 {
-  mp_digit d, r, rr;
-  int     x, res;
+  mp_digit d;
+  int      res;
 
   /* copy */
-  if ((res = mp_copy (a, c)) != MP_OKAY) {
-    return res;
+  if (a != c) {
+     if ((res = mp_copy (a, c)) != MP_OKAY) {
+       return res;
+     }
   }
 
-  if ((res = mp_grow (c, c->used + b / DIGIT_BIT + 1)) != MP_OKAY) {
-    return res;
+  if (c->alloc < (int)(c->used + b/DIGIT_BIT + 2)) {
+     if ((res = mp_grow (c, c->used + b / DIGIT_BIT + 2)) != MP_OKAY) {
+       return res;
+     }
   }
 
   /* shift by as many digits in the bit count */
-  if (b >= DIGIT_BIT) {
+  if (b >= (int)DIGIT_BIT) {
     if ((res = mp_lshd (c, b / DIGIT_BIT)) != MP_OKAY) {
       return res;
     }
@@ -41,14 +51,15 @@ mp_mul_2d (mp_int * a, int b, mp_int * c)
   /* shift any bit count < DIGIT_BIT */
   d = (mp_digit) (b % DIGIT_BIT);
   if (d != 0) {
-    register mp_digit *tmpc, mask;
-    
+    register mp_digit *tmpc, mask, r, rr;
+    register int x;
+
     /* bitmask for carries */
-    mask = (1U << d) - 1U;
-    
+    mask = (((mp_digit)1) << d) - 1;
+
     /* alias */
     tmpc = c->dp;
-    
+
     /* carry */
     r    = 0;
     for (x = 0; x < c->used; x++) {
diff --git a/bn_mp_mul_d.c b/bn_mp_mul_d.c
index f4458bb..f17a9fb 100644
--- a/bn_mp_mul_d.c
+++ b/bn_mp_mul_d.c
@@ -20,6 +20,7 @@ mp_mul_d (mp_int * a, mp_digit b, mp_int * c)
 {
   int     res, pa, olduse;
 
+  /* make sure c is big enough to hold a*b */
   pa = a->used;
   if (c->alloc < pa + 1) {
     if ((res = mp_grow (c, pa + 1)) != MP_OKAY) {
@@ -27,7 +28,10 @@ mp_mul_d (mp_int * a, mp_digit b, mp_int * c)
     }
   }
 
+  /* get the original destination's used count */
   olduse = c->used;
+
+  /* set the new temporary used count */
   c->used = pa + 1;
 
   {
@@ -35,21 +39,31 @@ mp_mul_d (mp_int * a, mp_digit b, mp_int * c)
     register mp_word r;
     register int ix;
 
-    tmpc = c->dp + c->used;
-    for (ix = c->used; ix < olduse; ix++) {
-      *tmpc++ = 0;
-    }
-
+    /* alias for a->dp [source] */
     tmpa = a->dp;
+
+    /* alias for c->dp [dest] */
     tmpc = c->dp;
 
+    /* zero carry */
     u = 0;
     for (ix = 0; ix < pa; ix++) {
+      /* compute product and carry sum for this term */
       r = ((mp_word) u) + ((mp_word) * tmpa++) * ((mp_word) b);
+
+      /* mask off higher bits to get a single digit */
       *tmpc++ = (mp_digit) (r & ((mp_word) MP_MASK));
+
+      /* send carry into next iteration */
       u = (mp_digit) (r >> ((mp_word) DIGIT_BIT));
     }
-    *tmpc = u;
+    /* store final carry [if any] */
+    *tmpc++ = u;
+
+    /* now zero digits above the top */
+    for (; pa < olduse; pa++) {
+       *tmpc++ = 0;
+    }
   }
 
   mp_clamp (c);
diff --git a/bn_mp_multi.c b/bn_mp_multi.c
new file mode 100644
index 0000000..ef96dc6
--- /dev/null
+++ b/bn_mp_multi.c
@@ -0,0 +1,64 @@
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is library that provides for multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library is designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+#include <tommath.h>
+#include <stdarg.h>
+
+/* mp_init every mp_int in a NULL terminated argument list.  If any mp_init
+ * fails the ones already initialised are cleared and MP_MEM is returned.
+ */
+int mp_init_multi(mp_int *mp, ...) 
+{
+    mp_err res = MP_OKAY;      /* Assume ok until proven otherwise */
+    int n = 0;                 /* Number of ok inits */
+    mp_int* cur_arg = mp;
+    va_list args;
+
+    va_start(args, mp);        /* init args to next argument from caller */
+    while (cur_arg != NULL) {
+        if (mp_init(cur_arg) != MP_OKAY) {
+            /* Back-track: walk the list again from the start and mp_clear
+             * the n entries that were initialised.  "args" stays open here so
+             * the single va_end after the loop ends it exactly once.
+             */
+            va_list clean_args;
+
+            cur_arg = mp;
+            va_start(clean_args, mp);
+            while (n--) {
+                mp_clear(cur_arg);
+                cur_arg = va_arg(clean_args, mp_int*);
+            }
+            va_end(clean_args);
+            res = MP_MEM;
+            break;
+        }
+        n++;
+        cur_arg = va_arg(args, mp_int*);
+    }
+    va_end(args);
+    return res;                /* MP_OKAY unless an error was flagged above */
+}
+
+void mp_clear_multi(mp_int *mp, ...) 
+{
+    mp_int *cur;
+    va_list ap;
+    /* mp_clear each argument in turn, stopping at the NULL terminator */
+    va_start(ap, mp);
+    for (cur = mp; cur != NULL; cur = va_arg(ap, mp_int*)) {
+        mp_clear(cur);
+    }
+    va_end(ap);
+}
diff --git a/bn_mp_prime_is_divisible.c b/bn_mp_prime_is_divisible.c
index dac2d0e..5b81104 100644
--- a/bn_mp_prime_is_divisible.c
+++ b/bn_mp_prime_is_divisible.c
@@ -14,7 +14,7 @@
  */
 #include <tommath.h>
 
-/* determines if an integers is divisible by one of the first 256 primes or not 
+/* determines if an integers is divisible by one of the first 256 primes or not
  *
  * sets result to 0 if not, 1 if yes
  */
@@ -27,7 +27,7 @@ mp_prime_is_divisible (mp_int * a, int *result)
   /* default to not */
   *result = 0;
 
-  for (ix = 0; ix < 256; ix++) {
+  for (ix = 0; ix < PRIME_SIZE; ix++) {
     /* is it equal to the prime? */
     if (mp_cmp_d (a, __prime_tab[ix]) == MP_EQ) {
       *result = 1;
diff --git a/bn_mp_prime_is_prime.c b/bn_mp_prime_is_prime.c
index 8910c87..1a782b3 100644
--- a/bn_mp_prime_is_prime.c
+++ b/bn_mp_prime_is_prime.c
@@ -31,10 +31,18 @@ mp_prime_is_prime (mp_int * a, int t, int *result)
   *result = 0;
 
   /* valid value of t? */
-  if (t < 1 || t > 256) {
+  if (t < 1 || t > PRIME_SIZE) {
     return MP_VAL;
   }
 
+  /* is the input equal to one of the primes in the table? */
+  for (ix = 0; ix < PRIME_SIZE; ix++) {
+      if (mp_cmp_d(a, __prime_tab[ix]) == MP_EQ) {
+         *result = 1;
+         return MP_OKAY;
+      }
+  }
+
   /* first perform trial division */
   if ((err = mp_prime_is_divisible (a, &res)) != MP_OKAY) {
     return err;
diff --git a/bn_mp_prime_next_prime.c b/bn_mp_prime_next_prime.c
index 932d914..cfebbe5 100644
--- a/bn_mp_prime_next_prime.c
+++ b/bn_mp_prime_next_prime.c
@@ -20,35 +20,35 @@
 int mp_prime_next_prime(mp_int *a, int t)
 {
    int err, res;
-   
+
    if (mp_iseven(a) == 1) {
       /* force odd */
       if ((err = mp_add_d(a, 1, a)) != MP_OKAY) {
          return err;
       }
    } else {
-      /* force to next number */
+      /* force to next odd number */
       if ((err = mp_add_d(a, 2, a)) != MP_OKAY) {
          return err;
       }
-   }     
-   
+   }
+
    for (;;) {
       /* is this prime? */
       if ((err = mp_prime_is_prime(a, t, &res)) != MP_OKAY) {
          return err;
       }
-      
+
       if (res == 1) {
          break;
       }
-      
+
       /* add two, next candidate */
       if ((err = mp_add_d(a, 2, a)) != MP_OKAY) {
          return err;
       }
    }
-   
+
    return MP_OKAY;
 }
 
diff --git a/bn_mp_reduce.c b/bn_mp_reduce.c
index 5d85f42..d98dc08 100644
--- a/bn_mp_reduce.c
+++ b/bn_mp_reduce.c
@@ -21,8 +21,7 @@ int
 mp_reduce_setup (mp_int * a, mp_int * b)
 {
   int     res;
-
-
+  
   if ((res = mp_2expt (a, b->used * 2 * DIGIT_BIT)) != MP_OKAY) {
     return res;
   }
@@ -30,8 +29,8 @@ mp_reduce_setup (mp_int * a, mp_int * b)
   return res;
 }
 
-/* reduces x mod m, assumes 0 < x < m^2, mu is precomputed via mp_reduce_setup 
- * From HAC pp.604 Algorithm 14.42 
+/* reduces x mod m, assumes 0 < x < m^2, mu is precomputed via mp_reduce_setup
+ * From HAC pp.604 Algorithm 14.42
  */
 int
 mp_reduce (mp_int * x, mp_int * m, mp_int * mu)
@@ -39,15 +38,15 @@ mp_reduce (mp_int * x, mp_int * m, mp_int * mu)
   mp_int  q;
   int     res, um = m->used;
 
-
   if ((res = mp_init_copy (&q, x)) != MP_OKAY) {
     return res;
   }
 
-  mp_rshd (&q, um - 1);		/* q1 = x / b^(k-1)  */
+  /* q1 = x / b^(k-1)  */
+  mp_rshd (&q, um - 1);         
 
   /* according to HAC this is optimization is ok */
-  if (((unsigned long) m->used) > (1UL << (unsigned long) (DIGIT_BIT - 1UL))) {
+  if (((unsigned long) m->used) > (((mp_digit)1) << (DIGIT_BIT - 1))) {
     if ((res = mp_mul (&q, mu, &q)) != MP_OKAY) {
       goto CLEANUP;
     }
@@ -57,7 +56,8 @@ mp_reduce (mp_int * x, mp_int * m, mp_int * mu)
     }
   }
 
-  mp_rshd (&q, um + 1);		/* q3 = q2 / b^(k+1) */
+  /* q3 = q2 / b^(k+1) */
+  mp_rshd (&q, um + 1);         
 
   /* x = x mod b^(k+1), quick (no division) */
   if ((res = mp_mod_2d (x, DIGIT_BIT * (um + 1), x)) != MP_OKAY) {
@@ -70,8 +70,9 @@ mp_reduce (mp_int * x, mp_int * m, mp_int * mu)
   }
 
   /* x = x - q */
-  if ((res = mp_sub (x, &q, x)) != MP_OKAY)
+  if ((res = mp_sub (x, &q, x)) != MP_OKAY) {
     goto CLEANUP;
+  }
 
   /* If x < 0, add b^(k+1) to it */
   if (mp_cmp_d (x, 0) == MP_LT) {
@@ -84,8 +85,9 @@ mp_reduce (mp_int * x, mp_int * m, mp_int * mu)
 
   /* Back off if it's too big */
   while (mp_cmp (x, m) != MP_LT) {
-    if ((res = s_mp_sub (x, m, x)) != MP_OKAY)
+    if ((res = s_mp_sub (x, m, x)) != MP_OKAY) {
       break;
+    }
   }
 
 CLEANUP:
diff --git a/bn_mp_rshd.c b/bn_mp_rshd.c
index 582c8c5..a703dda 100644
--- a/bn_mp_rshd.c
+++ b/bn_mp_rshd.c
@@ -26,7 +26,7 @@ mp_rshd (mp_int * a, int b)
   }
 
   /* if b > used then simply zero it and return */
-  if (a->used < b) {
+  if (a->used <= b) {
     mp_zero (a);
     return;
   }
@@ -42,8 +42,9 @@ mp_rshd (mp_int * a, int b)
     /* offset into digits */
     tmpaa = a->dp + b;
 
-    /* this is implemented as a sliding window where the window is b-digits long
-     * and digits from the top of the window are copied to the bottom
+    /* this is implemented as a sliding window where 
+     * the window is b-digits long and digits from 
+     * the top of the window are copied to the bottom
      *
      * e.g.
 
diff --git a/bn_mp_set_int.c b/bn_mp_set_int.c
index 1d6bce7..69a55a8 100644
--- a/bn_mp_set_int.c
+++ b/bn_mp_set_int.c
@@ -16,15 +16,13 @@
 
 /* set a 32-bit const */
 int
-mp_set_int (mp_int * a, unsigned long b)
+mp_set_int (mp_int * a, unsigned int b)
 {
   int     x, res;
 
   mp_zero (a);
-
-  /* set four bits at a time, simplest solution to the what if DIGIT_BIT==7 case */
+  /* set four bits at a time */
   for (x = 0; x < 8; x++) {
-
     /* shift the number up four bits */
     if ((res = mp_mul_2d (a, 4, a)) != MP_OKAY) {
       return res;
@@ -37,9 +35,8 @@ mp_set_int (mp_int * a, unsigned long b)
     b <<= 4;
 
     /* ensure that digits are not clamped off */
-    a->used += 32 / DIGIT_BIT + 1;
+    a->used += 32 / DIGIT_BIT + 2;
   }
-
   mp_clamp (a);
   return MP_OKAY;
 }
diff --git a/bn_mp_sqr.c b/bn_mp_sqr.c
index 99ebdf0..c530c9a 100644
--- a/bn_mp_sqr.c
+++ b/bn_mp_sqr.c
@@ -24,8 +24,7 @@ mp_sqr (mp_int * a, mp_int * b)
   } else {
 
     /* can we use the fast multiplier? */
-    if (((a->used * 2 + 1) < 512)
-	&& a->used < (1 << ((CHAR_BIT * sizeof (mp_word)) - (2 * DIGIT_BIT) - 1))) {
+    if ((a->used * 2 + 1) < 512 && a->used < (1 << (sizeof(mp_word) * CHAR_BIT - 2*DIGIT_BIT - 1))) {
       res = fast_s_mp_sqr (a, b);
     } else {
       res = s_mp_sqr (a, b);
diff --git a/bn_mp_sub.c b/bn_mp_sub.c
index 6558e5d..2bc4123 100644
--- a/bn_mp_sub.c
+++ b/bn_mp_sub.c
@@ -20,39 +20,34 @@ mp_sub (mp_int * a, mp_int * b, mp_int * c)
 {
   int     sa, sb, res;
 
-
   sa = a->sign;
   sb = b->sign;
 
-  /* handle four cases */
-  if (sa == MP_ZPOS && sb == MP_ZPOS) {
-    /* both positive, a - b, but if b>a then we do -(b - a) */
-    if (mp_cmp_mag (a, b) == MP_LT) {
-      /* b>a */
-      res = s_mp_sub (b, a, c);
-      c->sign = MP_NEG;
-    } else {
-      res = s_mp_sub (a, b, c);
-      c->sign = MP_ZPOS;
-    }
-  } else if (sa == MP_ZPOS && sb == MP_NEG) {
-    /* a - -b == a + b  */
+  if (sa != sb) {
+    /* subtract a negative from a positive, OR */
+    /* subtract a positive from a negative. */
+    /* In either case, ADD their magnitudes, */
+    /* and use the sign of the first number. */
+    c->sign = sa;
     res = s_mp_add (a, b, c);
-    c->sign = MP_ZPOS;
-  } else if (sa == MP_NEG && sb == MP_ZPOS) {
-    /* -a - b == -(a + b) */
-    res = s_mp_add (a, b, c);
-    c->sign = MP_NEG;
   } else {
-    /* -a - -b == b - a, but if a>b == -(a - b) */
-    if (mp_cmp_mag (a, b) == MP_GT) {
+    /* subtract a positive from a positive, OR */
+    /* subtract a negative from a negative. */
+    /* First, take the difference between their */
+    /* magnitudes, then... */
+    if (mp_cmp_mag (a, b) != MP_LT) {
+      /* Copy the sign from the first */
+      c->sign = sa;
+      /* The first has a larger or equal magnitude */
       res = s_mp_sub (a, b, c);
-      c->sign = MP_NEG;
     } else {
+      /* The result has the *opposite* sign from */
+      /* the first number. */
+      c->sign = (sa == MP_ZPOS) ? MP_NEG : MP_ZPOS;
+      /* The second has a larger magnitude */
       res = s_mp_sub (b, a, c);
-      c->sign = MP_ZPOS;
     }
   }
-
   return res;
 }
+
diff --git a/bn_prime_tab.c b/bn_prime_tab.c
index e663578..83c5469 100644
--- a/bn_prime_tab.c
+++ b/bn_prime_tab.c
@@ -17,7 +17,9 @@ const mp_digit __prime_tab[] = {
   0x0002, 0x0003, 0x0005, 0x0007, 0x000B, 0x000D, 0x0011, 0x0013,
   0x0017, 0x001D, 0x001F, 0x0025, 0x0029, 0x002B, 0x002F, 0x0035,
   0x003B, 0x003D, 0x0043, 0x0047, 0x0049, 0x004F, 0x0053, 0x0059,
-  0x0061, 0x0065, 0x0067, 0x006B, 0x006D, 0x0071, 0x007F, 0x0083,
+  0x0061, 0x0065, 0x0067, 0x006B, 0x006D, 0x0071, 0x007F,
+#ifndef MP_8BIT
+  0x0083,
   0x0089, 0x008B, 0x0095, 0x0097, 0x009D, 0x00A3, 0x00A7, 0x00AD,
   0x00B3, 0x00B5, 0x00BF, 0x00C1, 0x00C5, 0x00C7, 0x00D3, 0x00DF,
   0x00E3, 0x00E5, 0x00E9, 0x00EF, 0x00F1, 0x00FB, 0x0101, 0x0107,
@@ -49,4 +51,5 @@ const mp_digit __prime_tab[] = {
   0x05BF, 0x05C9, 0x05CB, 0x05CF, 0x05D1, 0x05D5, 0x05DB, 0x05E7,
   0x05F3, 0x05FB, 0x0607, 0x060D, 0x0611, 0x0617, 0x061F, 0x0623,
   0x062B, 0x062F, 0x063D, 0x0641, 0x0647, 0x0649, 0x064D, 0x0653
+#endif
 };
diff --git a/bn_radix.c b/bn_radix.c
index 3b4b639..f586d46 100644
--- a/bn_radix.c
+++ b/bn_radix.c
@@ -135,3 +135,80 @@ mp_radix_size (mp_int * a, int radix)
   mp_clear (&t);
   return digs + 1;
 }
+
+/* read a bigint from a file stream in ASCII (optional leading '-', then digits of the given radix) */
+int mp_fread(mp_int *a, int radix, FILE *stream)
+{
+   int err, ch, neg, y;
+   /* returns MP_OKAY, or the error code of the first failing mp_mul_d/mp_add_d call */
+   /* clear a */
+   mp_zero(a);
+   /* NOTE(review): radix is not validated here; s_rmap[y] is read for y < radix, so radix must not exceed the s_rmap table size -- confirm callers */
+   /* if first digit is - then set negative */
+   ch = fgetc(stream);
+   if (ch == '-') {
+      neg = MP_NEG;
+      ch = fgetc(stream);
+   } else {
+      neg = MP_ZPOS;
+   }
+   /* consume digits until a character that is not one of the first radix entries of s_rmap */
+   for (;;) {
+      /* find y in the radix map */
+      for (y = 0; y < radix; y++) {
+          if (s_rmap[y] == ch) {
+             break;
+          }
+      }
+      if (y == radix) { /* ch is not a digit in this radix: stop (ch has already been consumed from the stream) */
+         break;
+      }
+      
+      /* shift up and add */
+      if ((err = mp_mul_d(a, radix, a)) != MP_OKAY) {
+         return err;
+      }
+      if ((err = mp_add_d(a, y, a)) != MP_OKAY) {
+         return err;
+      }
+      
+      ch = fgetc(stream);
+   }
+   if (mp_cmp_d(a, 0) != MP_EQ) { /* the parsed sign is only applied to a non-zero result */
+      a->sign = neg;
+   }
+   /* note: on an early error return above, a holds a partial value and neg has not been applied */
+   return MP_OKAY;
+}
+
+int mp_fwrite(mp_int *a, int radix, FILE *stream)
+{
+   /* writes the mp_toradix text of a to stream, one fputc per byte;
+    * exactly mp_radix_size(a, radix) bytes of the buffer are written.
+    * returns MP_OKAY, MP_VAL (zero size or fputc failure), MP_MEM
+    * (malloc failure) or the error code from mp_toradix.
+    */
+   int status, size, pos;
+   char *text;
+
+   size = mp_radix_size(a, radix);
+   if (size == 0) {
+      return MP_VAL;
+   }
+
+   text = malloc(size);
+   if (text == NULL) {
+      return MP_MEM;
+   }
+
+   status = mp_toradix(a, text, radix);
+   for (pos = 0; status == MP_OKAY && pos < size; pos++) {
+      if (fputc(text[pos], stream) == EOF) {
+         status = MP_VAL;
+      }
+   }
+
+   free(text);
+   return status;
+}
+
diff --git a/bn_reverse.c b/bn_reverse.c
index c24aa27..4e785c4 100644
--- a/bn_reverse.c
+++ b/bn_reverse.c
@@ -24,7 +24,7 @@ bn_reverse (unsigned char *s, int len)
   ix = 0;
   iy = len - 1;
   while (ix < iy) {
-    t = s[ix];
+    t     = s[ix];
     s[ix] = s[iy];
     s[iy] = t;
     ++ix;
diff --git a/bn_s_mp_add.c b/bn_s_mp_add.c
index ceb2702..87aab4e 100644
--- a/bn_s_mp_add.c
+++ b/bn_s_mp_add.c
@@ -28,13 +28,10 @@ s_mp_add (mp_int * a, mp_int * b, mp_int * c)
     min = b->used;
     max = a->used;
     x = a;
-  } else if (a->used < b->used) {
+  } else {
     min = a->used;
     max = b->used;
     x = b;
-  } else {
-    min = max = a->used;
-    x = NULL;
   }
 
   /* init result */
@@ -44,11 +41,10 @@ s_mp_add (mp_int * a, mp_int * b, mp_int * c)
     }
   }
 
+  /* get old used digit count and set new one */
   olduse = c->used;
   c->used = max + 1;
 
-  /* add digits from lower part */
-
   /* set the carry to zero */
   {
     register mp_digit u, *tmpa, *tmpb, *tmpc;
@@ -65,36 +61,39 @@ s_mp_add (mp_int * a, mp_int * b, mp_int * c)
     /* destination */
     tmpc = c->dp;
 
+    /* zero the carry */
     u = 0;
     for (i = 0; i < min; i++) {
       /* Compute the sum at one digit, T[i] = A[i] + B[i] + U */
       *tmpc = *tmpa++ + *tmpb++ + u;
 
       /* U = carry bit of T[i] */
-      u = *tmpc >> DIGIT_BIT;
+      u = *tmpc >> ((mp_digit)DIGIT_BIT);
 
       /* take away carry bit from T[i] */
       *tmpc++ &= MP_MASK;
     }
 
-    /* now copy higher words if any, that is in A+B if A or B has more digits add those in */
+    /* now copy higher words if any, that is in A+B 
+     * if A or B has more digits add those in 
+     */
     if (min != max) {
       for (; i < max; i++) {
-	/* T[i] = X[i] + U */
-	*tmpc = x->dp[i] + u;
+        /* T[i] = X[i] + U */
+        *tmpc = x->dp[i] + u;
 
-	/* U = carry bit of T[i] */
-	u = *tmpc >> DIGIT_BIT;
+        /* U = carry bit of T[i] */
+        u = *tmpc >> ((mp_digit)DIGIT_BIT);
 
-	/* take away carry bit from T[i] */
-	*tmpc++ &= MP_MASK;
+        /* take away carry bit from T[i] */
+        *tmpc++ &= MP_MASK;
       }
     }
 
     /* add carry */
     *tmpc++ = u;
 
-    /* clear digits above used (since we may not have grown result above) */
+    /* clear digits above oldused */
     for (i = c->used; i < olduse; i++) {
       *tmpc++ = 0;
     }
diff --git a/bn_s_mp_mul_digs.c b/bn_s_mp_mul_digs.c
index 0243449..c126a0c 100644
--- a/bn_s_mp_mul_digs.c
+++ b/bn_s_mp_mul_digs.c
@@ -15,8 +15,8 @@
 #include <tommath.h>
 
 /* multiplies |a| * |b| and only computes upto digs digits of result
- * HAC pp. 595, Algorithm 14.12  Modified so you can control how many digits of 
- * output are created.  
+ * HAC pp. 595, Algorithm 14.12  Modified so you can control how 
+ * many digits of output are created.
  */
 int
 s_mp_mul_digs (mp_int * a, mp_int * b, mp_int * c, int digs)
@@ -27,6 +27,13 @@ s_mp_mul_digs (mp_int * a, mp_int * b, mp_int * c, int digs)
   mp_word r;
   mp_digit tmpx, *tmpt, *tmpy;
 
+  /* can we use the fast multiplier? */
+  if (((digs) < MP_WARRAY) &&
+      MIN (a->used, b->used) < 
+          (1 << ((CHAR_BIT * sizeof (mp_word)) - (2 * DIGIT_BIT)))) {
+    return fast_s_mp_mul_digs (a, b, c, digs);
+  }
+
   if ((res = mp_init_size (&t, digs)) != MP_OKAY) {
     return res;
   }
@@ -42,14 +49,21 @@ s_mp_mul_digs (mp_int * a, mp_int * b, mp_int * c, int digs)
     pb = MIN (b->used, digs - ix);
 
     /* setup some aliases */
+    /* copy of the digit from a used within the nested loop */
     tmpx = a->dp[ix];
-    tmpt = &(t.dp[ix]);
+    
+    /* an alias for the destination shifted ix places */
+    tmpt = t.dp + ix;
+    
+    /* an alias for the digits of b */
     tmpy = b->dp;
 
     /* compute the columns of the output and propagate the carry */
     for (iy = 0; iy < pb; iy++) {
       /* compute the column as a mp_word */
-      r = ((mp_word) * tmpt) + ((mp_word) tmpx) * ((mp_word) * tmpy++) + ((mp_word) u);
+      r = ((mp_word) *tmpt) + 
+          ((mp_word) tmpx) * ((mp_word) * tmpy++) + 
+          ((mp_word) u);
 
       /* the new column is the lower part of the result */
       *tmpt++ = (mp_digit) (r & ((mp_word) MP_MASK));
@@ -57,8 +71,10 @@ s_mp_mul_digs (mp_int * a, mp_int * b, mp_int * c, int digs)
       /* get the carry word from the result */
       u = (mp_digit) (r >> ((mp_word) DIGIT_BIT));
     }
-    if (ix + iy < digs)
+    /* set carry if it is placed below digs */
+    if (ix + iy < digs) {
       *tmpt = u;
+    }
   }
 
   mp_clamp (&t);
diff --git a/bn_s_mp_mul_high_digs.c b/bn_s_mp_mul_high_digs.c
index ba52d11..bbe7378 100644
--- a/bn_s_mp_mul_high_digs.c
+++ b/bn_s_mp_mul_high_digs.c
@@ -14,7 +14,7 @@
  */
 #include <tommath.h>
 
-/* multiplies |a| * |b| and does not compute the lower digs digits 
+/* multiplies |a| * |b| and does not compute the lower digs digits
  * [meant to get the higher part of the product]
  */
 int
@@ -28,8 +28,8 @@ s_mp_mul_high_digs (mp_int * a, mp_int * b, mp_int * c, int digs)
 
 
   /* can we use the fast multiplier? */
-  if (((a->used + b->used + 1) < 512)
-      && MAX (a->used, b->used) < (1 << ((CHAR_BIT * sizeof (mp_word)) - (2 * DIGIT_BIT)))) {
+  if (((a->used + b->used + 1) < MP_WARRAY)
+      && MIN (a->used, b->used) < (1 << ((CHAR_BIT * sizeof (mp_word)) - (2 * DIGIT_BIT)))) {
     return fast_s_mp_mul_high_digs (a, b, c, digs);
   }
 
diff --git a/bn_s_mp_sub.c b/bn_s_mp_sub.c
index a5683dd..5f22999 100644
--- a/bn_s_mp_sub.c
+++ b/bn_s_mp_sub.c
@@ -14,7 +14,7 @@
  */
 #include <tommath.h>
 
-/* low level subtraction (assumes a > b), HAC pp.595 Algorithm 14.9 */
+/* low level subtraction (assumes |a| >= |b|), HAC pp.595 Algorithm 14.9 */
 int
 s_mp_sub (mp_int * a, mp_int * b, mp_int * c)
 {
@@ -34,7 +34,6 @@ s_mp_sub (mp_int * a, mp_int * b, mp_int * c)
   c->used = max;
 
   /* sub digits from lower part */
-
   {
     register mp_digit u, *tmpa, *tmpb, *tmpc;
     register int i;
@@ -50,12 +49,12 @@ s_mp_sub (mp_int * a, mp_int * b, mp_int * c)
       /* T[i] = A[i] - B[i] - U */
       *tmpc = *tmpa++ - *tmpb++ - u;
 
-      /* U = carry bit of T[i] 
-       * Note this saves performing an AND operation since 
+      /* U = carry bit of T[i]
+       * Note this saves performing an AND operation since
        * if a carry does occur it will propagate all the way to the
        * MSB.  As a result a single shift is required to get the carry
        */
-      u = *tmpc >> (CHAR_BIT * sizeof (mp_digit) - 1);
+      u = *tmpc >> ((mp_digit)(CHAR_BIT * sizeof (mp_digit) - 1));
 
       /* Clear carry from T[i] */
       *tmpc++ &= MP_MASK;
@@ -67,7 +66,7 @@ s_mp_sub (mp_int * a, mp_int * b, mp_int * c)
       *tmpc = *tmpa++ - u;
 
       /* U = carry bit of T[i] */
-      u = *tmpc >> (CHAR_BIT * sizeof (mp_digit) - 1);
+      u = *tmpc >> ((mp_digit)(CHAR_BIT * sizeof (mp_digit) - 1));
 
       /* Clear carry from T[i] */
       *tmpc++ &= MP_MASK;
diff --git a/bncore.c b/bncore.c
index 3660c6d..7e7ac50 100644
--- a/bncore.c
+++ b/bncore.c
@@ -14,7 +14,15 @@
  */
 #include <tommath.h>
 
-/* configured for a AMD Duron Morgan core with etc/tune.c */
-int     KARATSUBA_MUL_CUTOFF = 73,	/* Min. number of digits before Karatsuba multiplication is used. */
-        KARATSUBA_SQR_CUTOFF = 121,	/* Min. number of digits before Karatsuba squaring is used. */
-        MONTGOMERY_EXPT_CUTOFF = 128;	/* max. number of digits that montgomery reductions will help for */
+/* Known optimal configurations
+
+ CPU                    /Compiler     /MUL CUTOFF/SQR CUTOFF
+-------------------------------------------------------------
+ Intel P4               /GCC v3.2     /        81/       110
+ AMD Athlon XP          /GCC v3.2     /       109/       127
+
+*/
+
+/* configured for an AMD XP Thoroughbred core with etc/tune.c */
+int     KARATSUBA_MUL_CUTOFF = 109,      /* Min. number of digits before Karatsuba multiplication is used. */
+        KARATSUBA_SQR_CUTOFF = 127;      /* Min. number of digits before Karatsuba squaring is used. */
diff --git a/booker.pl b/booker.pl
new file mode 100644
index 0000000..5bc6645
--- /dev/null
+++ b/booker.pl
@@ -0,0 +1,261 @@
+#!/bin/perl
+#
+#Used to prepare the book "tommath.src" for LaTeX by pre-processing it into a .tex file
+#
+#Essentially you write the "tommath.src" as normal LaTex except where you want code snippets you put
+#
+#EXAM,file
+#
+#This preprocessor will then open "file" and insert it as a verbatim copy.
+#
+#Tom St Denis
+
+#get graphics type
+if (shift =~ /PDF/) {
+   $graph = "";
+} else {
+   $graph = ".ps";
+}   
+
+open(IN,"<tommath.src") or die "Can't open source file";
+open(OUT,">tommath.tex") or die "Can't open destination file";
+
+print "Scanning for sections\n";
+$chapter = $section = $subsection = 0;
+$x = 0;
+while (<IN>) {   # pass 1: count headings and record the counters seen at each MARK tag
+   print ".";
+   if (!(++$x % 80)) { print "\n"; }
+   # update the chapter/section/subsection counters
+   if (~($_ =~ /\*/)) {   # NOTE(review): "~" is bitwise NOT, so this test looks always-true; "!" may have been intended -- confirm
+      if ($_ =~ /\\chapter{.+}/) {
+          ++$chapter;
+          $section = $subsection = 0;
+      } elsif ($_ =~ /\\section{.+}/) {
+          ++$section;
+          $subsection = 0;
+      } elsif ($_ =~ /\\subsection{.+}/) {
+          ++$subsection;
+      }
+   }      
+
+   if ($_ =~ m/MARK/) {   # MARK,name: store the current counters under "name"
+      @m = split(",",$_);
+      chomp(@m[1]);
+      $index1{@m[1]} = $chapter;
+      $index2{@m[1]} = $section;
+      $index3{@m[1]} = $subsection;
+   }
+}
+close(IN);
+
+open(IN,"<tommath.src") or die "Can't open source file";
+$readline = $wroteline = 0;
+$srcline = 0;
+
+while (<IN>) {
+   ++$readline;
+   ++$srcline;
+   
+   if ($_ =~ m/MARK/) {
+   } elsif ($_ =~ m/EXAM/ || $_ =~ m/LIST/) {
+      if ($_ =~ m/EXAM/) {
+         $skipheader = 1;
+      } else {
+         $skipheader = 0;
+      }
+      
+      # EXAM,file
+      chomp($_);
+      @m = split(",",$_);
+      open(SRC,"<$m[1]") or die "Error:$srcline:Can't open source file $m[1]";
+      
+      print "$srcline:Inserting $m[1]:";
+      
+      $line = 0;
+      $tmp = $m[1];
+      $tmp =~ s/_/"\\_"/ge;
+      print OUT "\\index{$tmp}\n\\vspace{+3mm}\\begin{small}\n\\hspace{-5.1mm}{\\bf File}: $tmp\n\\vspace{-3mm}\n\\begin{alltt}\n";
+      $wroteline += 5;
+      
+      if ($skipheader == 1) {
+         # scan till next end of comment, e.g. skip license 
+         while (<SRC>) {
+            $text[$line++] = $_;
+            last if ($_ =~ /tommath\.h/);
+         }
+      }
+      
+      $inline = 0;
+      while (<SRC>) {
+         $text[$line++] = $_;
+         ++$inline;
+         chomp($_);
+         $_ =~ s/\t/"    "/ge;
+         $_ =~ s/{/"^{"/ge;
+         $_ =~ s/}/"^}"/ge;
+         $_ =~ s/\\/'\symbol{92}'/ge;
+         $_ =~ s/\^/"\\"/ge;
+           
+         printf OUT ("%03d   ", $line);
+         for ($x = 0; $x < length($_); $x++) {
+             print OUT chr(vec($_, $x, 8));
+             if ($x == 75) { 
+                 print OUT "\n      ";
+                 ++$wroteline;
+             }
+         }
+         print OUT "\n";
+         ++$wroteline;
+      }
+      $totlines = $line;
+      print OUT "\\end{alltt}\n\\end{small}\n";
+      close(SRC);
+      print "$inline lines\n";
+      $wroteline += 2;
+   } elsif ($_ =~ m/@\d+,.+@/) {
+     # line contains [number,text]
+     # e.g. @14,for (ix = 0)@
+     $txt = $_;
+     while ($txt =~ m/@\d+,.+@/) {
+        @m = split("@",$txt);      # splits into text, one, two
+        @parms = split(",",$m[1]);  # splits one,two into two elements 
+                
+        # now search from $parms[0] down for $parms[1] 
+        $found1 = 0;
+        $found2 = 0;
+        for ($i = $parms[0]; $i < $totlines && $found1 == 0; $i++) {
+           if ($text[$i] =~ m/\Q$parms[1]\E/) {
+              $foundline1 = $i + 1;
+              $found1 = 1;
+           }
+        }
+        
+        # now search backwards
+        for ($i = $parms[0] - 1; $i >= 0 && $found2 == 0; $i--) {
+           if ($text[$i] =~ m/\Q$parms[1]\E/) {
+              $foundline2 = $i + 1;
+              $found2 = 1;
+           }
+        }
+        
+        # now use the closest match or the first if tied
+        if ($found1 == 1 && $found2 == 0) {
+           $found = 1;
+           $foundline = $foundline1;
+        } elsif ($found1 == 0 && $found2 == 1) {
+           $found = 1;
+           $foundline = $foundline2;
+        } elsif ($found1 == 1 && $found2 == 1) {
+           $found = 1;
+           if (($foundline1 - $parms[0]) <= ($parms[0] - $foundline2)) {
+              $foundline = $foundline1;
+           } else {
+              $foundline = $foundline2;
+           }
+        } else {
+           $found = 0;
+        }
+                      
+        # if found replace 
+        if ($found == 1) {
+           $delta = $parms[0] - $foundline;
+           print "Found replacement tag for \"$parms[1]\" on line $srcline which refers to line $foundline (delta $delta)\n";
+           $_ =~ s/@\Q$m[1]\E@/$foundline/;
+        } else {
+           print "ERROR:  The tag \"$parms[1]\" on line $srcline was not found in the most recently parsed source!\n";
+        }
+        
+        # remake the rest of the line 
+        $cnt = @m;
+        $txt = "";
+        for ($i = 2; $i < $cnt; $i++) {
+            $txt = $txt . $m[$i] . "@";
+        }
+     }
+     print OUT $_;
+     ++$wroteline;
+   } elsif ($_ =~ /~.+~/) {
+      # line contains a ~text~ pair used to refer to indexing :-)
+      $txt = $_;
+      while ($txt =~ /~.+~/) {
+         @m = split("~", $txt);
+         
+         # word is the second position
+         $word = @m[1];
+         $a = $index1{$word};
+         $b = $index2{$word};
+         $c = $index3{$word};
+         
+         # if chapter (a) is zero it wasn't found
+         if ($a == 0) {
+            print "ERROR: the tag \"$word\" on line $srcline was not found previously marked.\n";
+         } else {
+            # format the tag as x, x.y or x.y.z depending on the values
+            $str = $a;
+            $str = $str . ".$b" if ($b != 0);
+            $str = $str . ".$c" if ($c != 0);
+            
+            if ($b == 0 && $c == 0) {
+               # its a chapter
+               if ($a <= 10) {   # chapters 1..10 are spelled out in words
+                  if ($a == 1) {
+                     $str = "chapter one";
+                  } elsif ($a == 2) {
+                     $str = "chapter two";
+                  } elsif ($a == 3) {
+                     $str = "chapter three";
+                  } elsif ($a == 4) {
+                     $str = "chapter four";
+                  } elsif ($a == 5) {
+                     $str = "chapter five";
+                  } elsif ($a == 6) {
+                     $str = "chapter six";
+                  } elsif ($a == 7) {
+                     $str = "chapter seven";
+                  } elsif ($a == 8) {
+                     $str = "chapter eight";
+                  } elsif ($a == 9) {
+                     $str = "chapter nine";
+                  } elsif ($a == 10) {   # must test 10; a second "== 2" test here could never match
+                     $str = "chapter ten";
+                  }
+               } else {   # higher chapters keep the numeric form
+                  $str = "chapter " . $str;
+               }
+            } else {
+               $str = "section " . $str     if ($b != 0 && $c == 0);            
+               $str = "sub-section " . $str if ($b != 0 && $c != 0);
+            }
+            
+            #substitute
+            $_ =~ s/~\Q$word\E~/$str/;
+            
+            print "Found replacement tag for marker \"$word\" on line $srcline which refers to $str\n";
+         }
+         
+         # remake rest of the line
+         $cnt = @m;
+         $txt = "";
+         for ($i = 2; $i < $cnt; $i++) {
+             $txt = $txt . $m[$i] . "~";
+         }
+      }
+      print OUT $_;
+      ++$wroteline;
+   } elsif ($_ =~ m/FIGU/) {
+      # FIGU,file,caption
+      chomp($_);
+      @m = split(",", $_);
+      print OUT "\\begin{center}\n\\begin{figure}[here]\n\\includegraphics{pics/$m[1]$graph}\n";
+      print OUT "\\caption{$m[2]}\n\\end{figure}\n\\end{center}\n";
+      $wroteline += 4;
+   } else {
+      print OUT $_;
+      ++$wroteline;
+   }
+}
+print "Read $readline lines, wrote $wroteline lines\n";
+
+close (OUT);
+close (IN);
diff --git a/changes.txt b/changes.txt
index 6833bdc..997774e 100644
--- a/changes.txt
+++ b/changes.txt
@@ -1,3 +1,37 @@
+May 17th, 2003
+v0.17  -- Benjamin Goldberg submitted optimized mp_add and mp_sub routines.  A new gen.pl as well
+          as several smaller suggestions.  Thanks!
+       -- removed call to mp_cmp in inner loop of mp_div and put mp_cmp_mag in its place :-)
+       -- Fixed bug in mp_exptmod that would cause it to fail for odd moduli when DIGIT_BIT != 28
+       -- mp_exptmod now also returns errors if the modulus is negative and will handle negative exponents
+       -- mp_prime_is_prime will now return true if the input is one of the primes in the prime table
+       -- Damian M Gryski (dgryski@uwaterloo.ca) found an index out of bounds error in the
+          fast_s_mp_mul_high_digs function which didn't come up before.  (fixed)
+       -- Refactored the DR reduction code so there is only one function per file.
+       -- Fixed bug in the mp_mul() which would erroneously avoid the faster multiplier [comba] when it was
+          allowed.  The bug would not cause the incorrect value to be produced just less efficient (fixed)
+       -- Fixed similar bug in the Montgomery reduction code.
+       -- Added tons of (mp_digit) casts so the 7/15/28/31 bit digit code will work flawlessly out of the box. 
+          Also added limited support for 64-bit machines with a 60-bit digit.  Both thanks to Tom Wu (tom@arcot.com)
+       -- Added new comments here and there, cleaned up some code [style stuff]
+       -- Fixed a lingering typo in mp_exptmod* that would set bitcnt to zero then one.  Very silly stuff :-)
+       -- Fixed up mp_exptmod_fast so it would set "redux" to the comba Montgomery reduction if allowed.  This
+          saves quite a few calls and if statements.
+       -- Added etc/mont.c a test of the Montgomery reduction [assuming all else works :-| ]
+       -- Fixed up etc/tune.c to use a wider test range [more appropriate] also added an x86-based addition which
+          uses RDTSC for high precision timing.
+       -- Updated demo/demo.c to remove MPI stuff [won't work anyways], made the tests run for 2 seconds each so it's
+          not so insanely slow.  Also made the output space delimited [and fixed up various errors]
+       -- Added logs directory, logs/graph.dem which will use gnuplot to make a series of PNG files 
+          that go with the pre-made index.html.  You have to build [via make timing] and run ltmtest first in the 
+          root of the package.
+       -- Fixed a bug in mp_sub and mp_add where "-a - -a" or "-a + a" would produce -0 as the result [obviously invalid].  
+       -- Fixed a bug in mp_rshd.  If the count == a.used it should zero/return [instead of shifting]
+       -- Fixed an "off-by-one" bug in mp_mul_2d.  The initial size check on alloc would be off by one if the residue
+          shifting caused a carry.
+       -- Fixed a bug where s_mp_mul_digs() would not call the Comba based routine if allowed.  This made Barrett reduction
+          slower than it had to be.
+          
 Mar 29th, 2003
 v0.16  -- Sped up mp_div by making normalization one shift call
        -- Sped up mp_mul_2d/mp_div_2d by aliasing pointers :-)
diff --git a/demo/demo.c b/demo/demo.c
index ff85903..ab8794d 100644
--- a/demo/demo.c
+++ b/demo/demo.c
@@ -1,21 +1,6 @@
 #include <time.h>
 
-
-#ifdef U_MPI
-#include <stdio.h>
-#include <string.h>
-#include <stdlib.h>
-#include <ctype.h>
-#include <limits.h>
-   #include "mpi.h"
-   #ifdef _MSC_VER
-      typedef __int64            ulong64;
-   #else
-      typedef unsigned long long ulong64;
-   #endif   
-#else   
-   #include "tommath.h"
-#endif
+#include "tommath.h"
 
 #ifdef TIMER
 ulong64 _tt;
@@ -23,19 +8,11 @@ void reset(void) { _tt = clock(); }
 ulong64 rdtsc(void) { return clock() - _tt; }
 #endif
 
-#ifndef DEBUG
-int _ifuncs;
-#else
-extern int _ifuncs;
-extern void dump_timings(void);
-extern void reset_timings(void);
-#endif
-   
 void ndraw(mp_int *a, char *name)
 {
    char buf[4096];
    printf("%s: ", name);
-   mp_toradix(a, buf, 10);
+   mp_toradix(a, buf, 64);
    printf("%s\n", buf);
 }
 
@@ -56,31 +33,13 @@ int lbit(void)
       lfsr <<= 1;
       return 0;
    }
-}   
-     
-#ifdef U_MPI
-int mp_reduce_setup(mp_int *a, mp_int *b)
-{
-   int res;
-   
-   mp_set(a, 1);
-   if ((res = s_mp_lshd(a, b->used * 2)) != MP_OKAY) {
-      return res;
-   }
-   return mp_div(a, b, a, NULL);
 }
 
-int mp_rand(mp_int *a, int c)
-{
-   long z = abs(rand()) & 65535;
-   mp_set(a, z?z:1);
-   while (c--) {
-      s_mp_lshd(a, 1);
-      mp_add_d(a, abs(rand()), a);
-   }
-   return MP_OKAY;
-}
-#endif
+
+#define DO2(x) x; x;
+#define DO4(x) DO2(x); DO2(x);
+#define DO8(x) DO4(x); DO4(x);
+#define DO(x)  DO8(x); DO8(x);
 
    char cmd[4096], buf[4096];
 int main(void)
@@ -89,12 +48,12 @@ int main(void)
    unsigned long expt_n, add_n, sub_n, mul_n, div_n, sqr_n, mul2d_n, div2d_n, gcd_n, lcm_n, inv_n,
                  div2_n, mul2_n;
    unsigned rr;
-   int cnt, ix;
+   int cnt, ix, old_kara_m, old_kara_s;
 
 #ifdef TIMER
    int n;
    ulong64 tt;
-   FILE *log;
+   FILE *log, *logb;
 #endif
 
    mp_init(&a);
@@ -102,11 +61,11 @@ int main(void)
    mp_init(&c);
    mp_init(&d);
    mp_init(&e);
-   mp_init(&f);
-   
+   mp_init(&f);   
+
 /* test the DR reduction */
 #if 0
-   
+
    srand(time(NULL));
    for (cnt = 2; cnt < 32; cnt++) {
        printf("%d digit modulus\n", cnt);
@@ -117,89 +76,103 @@ int main(void)
        }
        a.used = cnt;
        mp_prime_next_prime(&a, 3);
-       
+
        mp_rand(&b, cnt - 1);
        mp_copy(&b, &c);
-   
+
       rr = 0;
       do {
          if (!(rr & 127)) { printf("%9lu\r", rr); fflush(stdout); }
          mp_sqr(&b, &b); mp_add_d(&b, 1, &b);
          mp_copy(&b, &c);
-      
+
          mp_mod(&b, &a, &b);
          mp_dr_reduce(&c, &a, (1<<DIGIT_BIT)-a.dp[0]);
-      
+
          if (mp_cmp(&b, &c) != MP_EQ) {
             printf("Failed on trial %lu\n", rr); exit(-1);
          }
-      } while (++rr < 1000000); 
+      } while (++rr < 1000000);
       printf("Passed DR test for %d digits\n", cnt);
    }
-#endif   
+#endif
 
 #ifdef TIMER
       printf("CLOCKS_PER_SEC == %lu\n", CLOCKS_PER_SEC);
-goto sqrtime;      
 
-      log = fopen("add.log", "w");
-      for (cnt = 4; cnt <= 128; cnt += 4) {
+      log = fopen("logs/add.log", "w");
+      for (cnt = 8; cnt <= 128; cnt += 8) {
          mp_rand(&a, cnt);
          mp_rand(&b, cnt);
          reset();
-         for (rr = 0; rr < 10000000; rr++) {
-             mp_add(&a, &b, &c);
-         }
+         rr = 0;
+         do { 
+            DO(mp_add(&a,&b,&c));
+            rr += 16;
+         } while (rdtsc() < (CLOCKS_PER_SEC * 2));
          tt = rdtsc();
          printf("Adding\t\t%4d-bit => %9llu/sec, %9llu ticks\n", mp_count_bits(&a), (((unsigned long long)rr)*CLOCKS_PER_SEC)/tt, tt);
-         fprintf(log, "%d,%9llu\n", cnt, (((unsigned long long)rr)*CLOCKS_PER_SEC)/tt);
+         fprintf(log, "%d %9llu\n", cnt*DIGIT_BIT, (((unsigned long long)rr)*CLOCKS_PER_SEC)/tt);
       }
       fclose(log);
- 
-      log = fopen("sub.log", "w");
-      for (cnt = 4; cnt <= 128; cnt += 4) {
+
+      log = fopen("logs/sub.log", "w");
+      for (cnt = 8; cnt <= 128; cnt += 8) {
          mp_rand(&a, cnt);
          mp_rand(&b, cnt);
          reset();
-         for (rr = 0; rr < 10000000; rr++) {
-             mp_sub(&a, &b, &c);
-         }
+         rr = 0;
+         do { 
+            DO(mp_sub(&a,&b,&c));
+            rr += 16;
+         } while (rdtsc() < (CLOCKS_PER_SEC * 2));
          tt = rdtsc();
          printf("Subtracting\t\t%4d-bit => %9llu/sec, %9llu ticks\n", mp_count_bits(&a), (((unsigned long long)rr)*CLOCKS_PER_SEC)/tt, tt);
-         fprintf(log, "%d,%9llu\n", cnt, (((unsigned long long)rr)*CLOCKS_PER_SEC)/tt);
+         fprintf(log, "%d %9llu\n", cnt*DIGIT_BIT, (((unsigned long long)rr)*CLOCKS_PER_SEC)/tt);
       }
       fclose(log);
-      
 
-sqrtime:   
-   log = fopen("sqr.log", "w");
-   for (cnt = 4; cnt <= 128; cnt += 4) {
-      mp_rand(&a, cnt);
-      reset();
-      for (rr = 0; rr < 250000; rr++) {
-          mp_sqr(&a, &b);
-      }
-      tt = rdtsc();
-      printf("Squaring\t%4d-bit => %9llu/sec, %9llu ticks\n", mp_count_bits(&a), (((unsigned long long)rr)*CLOCKS_PER_SEC)/tt, tt);
-      fprintf(log, "%d,%9llu\n", cnt, (((unsigned long long)rr)*CLOCKS_PER_SEC)/tt);
-   }
-   fclose(log);
-   
-   log = fopen("mult.log", "w");
-   for (cnt = 4; cnt <= 128; cnt += 4) {
-      mp_rand(&a, cnt);
-      mp_rand(&b, cnt);
-      reset();
-      for (rr = 0; rr < 250000; rr++) {
-          mp_mul(&a, &b, &c);
-      }
-      tt = rdtsc();
-      printf("Multiplying\t%4d-bit => %9llu/sec, %9llu ticks\n", mp_count_bits(&a), (((unsigned long long)rr)*CLOCKS_PER_SEC)/tt, tt);
-      fprintf(log, "%d,%9llu\n", cnt, (((unsigned long long)rr)*CLOCKS_PER_SEC)/tt);
-   }
-   fclose(log);
+   /* do mult/square twice, first without karatsuba and second with */
+   old_kara_m = KARATSUBA_MUL_CUTOFF;
+   old_kara_s = KARATSUBA_SQR_CUTOFF;
+   for (ix = 0; ix < 2; ix++) {
+      printf("With%s Karatsuba\n", (ix==0)?"out":"");
+
+      KARATSUBA_MUL_CUTOFF = (ix==0)?9999:old_kara_m;
+      KARATSUBA_SQR_CUTOFF = (ix==0)?9999:old_kara_s;
+
+      log = fopen((ix==0)?"logs/sqr.log":"logs/sqr_kara.log", "w");
+      for (cnt = 32; cnt <= 288; cnt += 16) {
+         mp_rand(&a, cnt);
+         reset();
+         rr = 0;
+         do {
+            DO(mp_sqr(&a, &b));
+            rr += 16;
+         } while (rdtsc() < (CLOCKS_PER_SEC * 2));
+         tt = rdtsc();
+         printf("Squaring\t%4d-bit => %9llu/sec, %9llu ticks\n", mp_count_bits(&a), (((unsigned long long)rr)*CLOCKS_PER_SEC)/tt, tt);
+         fprintf(log, "%d %9llu\n", cnt*DIGIT_BIT, (((unsigned long long)rr)*CLOCKS_PER_SEC)/tt);
+      }
+      fclose(log);
+
+      log = fopen((ix==0)?"logs/mult.log":"logs/mult_kara.log", "w");
+      for (cnt = 32; cnt <= 288; cnt += 16) {
+         mp_rand(&a, cnt);
+         mp_rand(&b, cnt);
+         reset();
+         rr = 0;
+         do {
+            DO(mp_mul(&a, &b, &c));
+            rr += 16;
+         } while (rdtsc() < (CLOCKS_PER_SEC * 2));
+         tt = rdtsc();
+         printf("Multiplying\t%4d-bit => %9llu/sec, %9llu ticks\n", mp_count_bits(&a), (((unsigned long long)rr)*CLOCKS_PER_SEC)/tt, tt);
+         fprintf(log, "%d %9llu\n", cnt*DIGIT_BIT, (((unsigned long long)rr)*CLOCKS_PER_SEC)/tt);
+      }
+      fclose(log);
+   }
 
-expttime:
    {
       char *primes[] = {
          /* DR moduli */
@@ -210,7 +183,7 @@ expttime:
          "542189391331696172661670440619180536749994166415993334151601745392193484590296600979602378676624808129613777993466242203025054573692562689251250471628358318743978285860720148446448885701001277560572526947619392551574490839286458454994488665744991822837769918095117129546414124448777033941223565831420390846864429504774477949153794689948747680362212954278693335653935890352619041936727463717926744868338358149568368643403037768649616778526013610493696186055899318268339432671541328195724261329606699831016666359440874843103020666106568222401047720269951530296879490444224546654729111504346660859907296364097126834834235287147",
          "1487259134814709264092032648525971038895865645148901180585340454985524155135260217788758027400478312256339496385275012465661575576202252063145698732079880294664220579764848767704076761853197216563262660046602703973050798218246170835962005598561669706844469447435461092542265792444947706769615695252256130901271870341005768912974433684521436211263358097522726462083917939091760026658925757076733484173202927141441492573799914240222628795405623953109131594523623353044898339481494120112723445689647986475279242446083151413667587008191682564376412347964146113898565886683139407005941383669325997475076910488086663256335689181157957571445067490187939553165903773554290260531009121879044170766615232300936675369451260747671432073394867530820527479172464106442450727640226503746586340279816318821395210726268291535648506190714616083163403189943334431056876038286530365757187367147446004855912033137386225053275419626102417236133948503",
          "1095121115716677802856811290392395128588168592409109494900178008967955253005183831872715423151551999734857184538199864469605657805519106717529655044054833197687459782636297255219742994736751541815269727940751860670268774903340296040006114013971309257028332849679096824800250742691718610670812374272414086863715763724622797509437062518082383056050144624962776302147890521249477060215148275163688301275847155316042279405557632639366066847442861422164832655874655824221577849928863023018366835675399949740429332468186340518172487073360822220449055340582568461568645259954873303616953776393853174845132081121976327462740354930744487429617202585015510744298530101547706821590188733515880733527449780963163909830077616357506845523215289297624086914545378511082534229620116563260168494523906566709418166011112754529766183554579321224940951177394088465596712620076240067370589036924024728375076210477267488679008016579588696191194060127319035195370137160936882402244399699172017835144537488486396906144217720028992863941288217185353914991583400421682751000603596655790990815525126154394344641336397793791497068253936771017031980867706707490224041075826337383538651825493679503771934836094655802776331664261631740148281763487765852746577808019633679",
-         
+
          /* generic unrestricted moduli */
          "17933601194860113372237070562165128350027320072176844226673287945873370751245439587792371960615073855669274087805055507977323024886880985062002853331424203",
          "2893527720709661239493896562339544088620375736490408468011883030469939904368086092336458298221245707898933583190713188177399401852627749210994595974791782790253946539043962213027074922559572312141181787434278708783207966459019479487",
@@ -219,9 +192,10 @@ expttime:
          "436463808505957768574894870394349739623346440601945961161254440072143298152040105676491048248110146278752857839930515766167441407021501229924721335644557342265864606569000117714935185566842453630868849121480179691838399545644365571106757731317371758557990781880691336695584799313313687287468894148823761785582982549586183756806449017542622267874275103877481475534991201849912222670102069951687572917937634467778042874315463238062009202992087620963771759666448266532858079402669920025224220613419441069718482837399612644978839925207109870840278194042158748845445131729137117098529028886770063736487420613144045836803985635654192482395882603511950547826439092832800532152534003936926017612446606135655146445620623395788978726744728503058670046885876251527122350275750995227",
          "11424167473351836398078306042624362277956429440521137061889702611766348760692206243140413411077394583180726863277012016602279290144126785129569474909173584789822341986742719230331946072730319555984484911716797058875905400999504305877245849119687509023232790273637466821052576859232452982061831009770786031785669030271542286603956118755585683996118896215213488875253101894663403069677745948305893849505434201763745232895780711972432011344857521691017896316861403206449421332243658855453435784006517202894181640562433575390821384210960117518650374602256601091379644034244332285065935413233557998331562749140202965844219336298970011513882564935538704289446968322281451907487362046511461221329799897350993370560697505809686438782036235372137015731304779072430260986460269894522159103008260495503005267165927542949439526272736586626709581721032189532726389643625590680105784844246152702670169304203783072275089194754889511973916207",
          "1214855636816562637502584060163403830270705000634713483015101384881871978446801224798536155406895823305035467591632531067547890948695117172076954220727075688048751022421198712032848890056357845974246560748347918630050853933697792254955890439720297560693579400297062396904306270145886830719309296352765295712183040773146419022875165382778007040109957609739589875590885701126197906063620133954893216612678838507540777138437797705602453719559017633986486649523611975865005712371194067612263330335590526176087004421363598470302731349138773205901447704682181517904064735636518462452242791676541725292378925568296858010151852326316777511935037531017413910506921922450666933202278489024521263798482237150056835746454842662048692127173834433089016107854491097456725016327709663199738238442164843147132789153725513257167915555162094970853584447993125488607696008169807374736711297007473812256272245489405898470297178738029484459690836250560495461579533254473316340608217876781986188705928270735695752830825527963838355419762516246028680280988020401914551825487349990306976304093109384451438813251211051597392127491464898797406789175453067960072008590614886532333015881171367104445044718144312416815712216611576221546455968770801413440778423979",
-         NULL         
+         NULL
       };
-   log = fopen("expt.log", "w");
+   log = fopen("logs/expt.log", "w");
+   logb = fopen("logs/expt_dr.log", "w");
    for (n = 0; primes[n]; n++) {
       mp_read_radix(&a, primes[n], 10);
       mp_zero(&b);
@@ -234,9 +208,11 @@ expttime:
       mp_mod(&b, &c, &b);
       mp_set(&c, 3);
       reset();
-      for (rr = 0; rr < 50; rr++) {
-          mp_exptmod(&c, &b, &a, &d);
-      }
+      rr = 0;
+      do {
+         DO(mp_exptmod(&c, &b, &a, &d));
+         rr += 16;
+      } while (rdtsc() < (CLOCKS_PER_SEC * 2));
       tt = rdtsc();
       mp_sub_d(&a, 1, &e);
       mp_sub(&e, &b, &b);
@@ -248,25 +224,28 @@ expttime:
          exit(0);
       }
       printf("Exponentiating\t%4d-bit => %9llu/sec, %9llu ticks\n", mp_count_bits(&a), (((unsigned long long)rr)*CLOCKS_PER_SEC)/tt, tt);
-      fprintf(log, "%d,%9llu\n", cnt, (((unsigned long long)rr)*CLOCKS_PER_SEC)/tt);
+      fprintf((n < 7) ? logb : log, "%d %9llu\n", mp_count_bits(&a), (((unsigned long long)rr)*CLOCKS_PER_SEC)/tt);
+   }
    }
-   }   
    fclose(log);
+   fclose(logb);
 
-   log = fopen("invmod.log", "w");
+   log = fopen("logs/invmod.log", "w");
    for (cnt = 4; cnt <= 128; cnt += 4) {
       mp_rand(&a, cnt);
       mp_rand(&b, cnt);
-      
+
       do {
          mp_add_d(&b, 1, &b);
          mp_gcd(&a, &b, &c);
       } while (mp_cmp_d(&c, 1) != MP_EQ);
-      
+
       reset();
-      for (rr = 0; rr < 10000; rr++) {
-          mp_invmod(&b, &a, &c);
-      }
+      rr = 0;
+      do {
+         DO(mp_invmod(&b, &a, &c));
+         rr += 16;
+      } while (rdtsc() < (CLOCKS_PER_SEC * 2));
       tt = rdtsc();
       mp_mulmod(&b, &c, &a, &d);
       if (mp_cmp_d(&d, 1) != MP_EQ) {
@@ -274,18 +253,18 @@ expttime:
          return 0;
       }
       printf("Inverting mod\t%4d-bit => %9llu/sec, %9llu ticks\n", mp_count_bits(&a), (((unsigned long long)rr)*CLOCKS_PER_SEC)/tt, tt);
-      fprintf(log, "%d,%9llu\n", cnt, (((unsigned long long)rr)*CLOCKS_PER_SEC)/tt);
+      fprintf(log, "%d %9llu\n", cnt*DIGIT_BIT, (((unsigned long long)rr)*CLOCKS_PER_SEC)/tt);
    }
    fclose(log);
-   
+
    return 0;
-  
+
 #endif
 
-   div2_n = mul2_n = inv_n = expt_n = lcm_n = gcd_n = add_n = 
+   div2_n = mul2_n = inv_n = expt_n = lcm_n = gcd_n = add_n =
    sub_n = mul_n = div_n = sqr_n = mul2d_n = div2d_n = cnt = 0;
+
    for (;;) {
- 
        /* randomly clear and re-init one variable, this has the affect of triming the alloc space */
        switch (abs(rand()) % 7) {
            case 0:  mp_clear(&a); mp_init(&a); break;
@@ -296,17 +275,17 @@ expttime:
            case 5:  mp_clear(&f); mp_init(&f); break;
            case 6:  break; /* don't clear any */
        }
-   
-   
+
+
        printf("%7lu/%7lu/%7lu/%7lu/%7lu/%7lu/%7lu/%7lu/%7lu/%7lu/%7lu/%7lu/%7lu ", add_n, sub_n, mul_n, div_n, sqr_n, mul2d_n, div2d_n, gcd_n, lcm_n, expt_n, inv_n, div2_n, mul2_n);
        fgets(cmd, 4095, stdin);
        cmd[strlen(cmd)-1] = 0;
        printf("%s  ]\r",cmd); fflush(stdout);
-       if (!strcmp(cmd, "mul2d")) { ++mul2d_n; 
-          fgets(buf, 4095, stdin); mp_read_radix(&a, buf, 10);
+       if (!strcmp(cmd, "mul2d")) { ++mul2d_n;
+          fgets(buf, 4095, stdin); mp_read_radix(&a, buf, 64);
           fgets(buf, 4095, stdin); sscanf(buf, "%d", &rr);
-          fgets(buf, 4095, stdin); mp_read_radix(&b, buf, 10);
-          
+          fgets(buf, 4095, stdin); mp_read_radix(&b, buf, 64);
+
           mp_mul_2d(&a, rr, &a);
           a.sign = b.sign;
           if (mp_cmp(&a, &b) != MP_EQ) {
@@ -315,11 +294,11 @@ expttime:
              draw(&b);
              return 0;
           }
-       } else if (!strcmp(cmd, "div2d")) { ++div2d_n; 
-          fgets(buf, 4095, stdin); mp_read_radix(&a, buf, 10);
+       } else if (!strcmp(cmd, "div2d")) { ++div2d_n;
+          fgets(buf, 4095, stdin); mp_read_radix(&a, buf, 64);
           fgets(buf, 4095, stdin); sscanf(buf, "%d", &rr);
-          fgets(buf, 4095, stdin); mp_read_radix(&b, buf, 10);
-          
+          fgets(buf, 4095, stdin); mp_read_radix(&b, buf, 64);
+
           mp_div_2d(&a, rr, &a, &e);
           a.sign = b.sign;
           if (a.used == b.used && a.used == 0) { a.sign = b.sign = MP_ZPOS; }
@@ -330,19 +309,19 @@ expttime:
              return 0;
           }
        } else if (!strcmp(cmd, "add")) { ++add_n;
-          fgets(buf, 4095, stdin);  mp_read_radix(&a, buf, 10);
-          fgets(buf, 4095, stdin);  mp_read_radix(&b, buf, 10);
-          fgets(buf, 4095, stdin);  mp_read_radix(&c, buf, 10);
+          fgets(buf, 4095, stdin);  mp_read_radix(&a, buf, 64);
+          fgets(buf, 4095, stdin);  mp_read_radix(&b, buf, 64);
+          fgets(buf, 4095, stdin);  mp_read_radix(&c, buf, 64);
           mp_copy(&a, &d);
           mp_add(&d, &b, &d);
           if (mp_cmp(&c, &d) != MP_EQ) {
-             printf("add %lu failure!\n", add_n); 
-draw(&a);draw(&b);draw(&c);draw(&d);             
+             printf("add %lu failure!\n", add_n);
+draw(&a);draw(&b);draw(&c);draw(&d);
              return 0;
           }
-          
+
           /* test the sign/unsigned storage functions */
-          
+
           rr = mp_signed_bin_size(&c);
           mp_to_signed_bin(&c, (unsigned char *)cmd);
           memset(cmd+rr, rand()&255, sizeof(cmd)-rr);
@@ -353,8 +332,8 @@ draw(&a);draw(&b);draw(&c);draw(&d);
              draw(&d);
              return 0;
           }
-                    
-          
+
+
           rr = mp_unsigned_bin_size(&c);
           mp_to_unsigned_bin(&c, (unsigned char *)cmd);
           memset(cmd+rr, rand()&255, sizeof(cmd)-rr);
@@ -367,90 +346,90 @@ draw(&a);draw(&b);draw(&c);draw(&d);
           }
 
        } else if (!strcmp(cmd, "sub")) { ++sub_n;
-          fgets(buf, 4095, stdin);  mp_read_radix(&a, buf, 10);
-          fgets(buf, 4095, stdin);  mp_read_radix(&b, buf, 10);
-          fgets(buf, 4095, stdin);  mp_read_radix(&c, buf, 10);
+          fgets(buf, 4095, stdin);  mp_read_radix(&a, buf, 64);
+          fgets(buf, 4095, stdin);  mp_read_radix(&b, buf, 64);
+          fgets(buf, 4095, stdin);  mp_read_radix(&c, buf, 64);
           mp_copy(&a, &d);
           mp_sub(&d, &b, &d);
           if (mp_cmp(&c, &d) != MP_EQ) {
-             printf("sub %lu failure!\n", sub_n); 
-draw(&a);draw(&b);draw(&c);draw(&d);             
+             printf("sub %lu failure!\n", sub_n);
+draw(&a);draw(&b);draw(&c);draw(&d);
              return 0;
           }
        } else if (!strcmp(cmd, "mul")) { ++mul_n;
-          fgets(buf, 4095, stdin);  mp_read_radix(&a, buf, 10);
-          fgets(buf, 4095, stdin);  mp_read_radix(&b, buf, 10);
-          fgets(buf, 4095, stdin);  mp_read_radix(&c, buf, 10);
+          fgets(buf, 4095, stdin);  mp_read_radix(&a, buf, 64);
+          fgets(buf, 4095, stdin);  mp_read_radix(&b, buf, 64);
+          fgets(buf, 4095, stdin);  mp_read_radix(&c, buf, 64);
           mp_copy(&a, &d);
           mp_mul(&d, &b, &d);
           if (mp_cmp(&c, &d) != MP_EQ) {
-             printf("mul %lu failure!\n", mul_n); 
-draw(&a);draw(&b);draw(&c);draw(&d);             
+             printf("mul %lu failure!\n", mul_n);
+draw(&a);draw(&b);draw(&c);draw(&d);
              return 0;
           }
        } else if (!strcmp(cmd, "div")) { ++div_n;
-          fgets(buf, 4095, stdin); mp_read_radix(&a, buf, 10);
-          fgets(buf, 4095, stdin); mp_read_radix(&b, buf, 10);
-          fgets(buf, 4095, stdin); mp_read_radix(&c, buf, 10);
-          fgets(buf, 4095, stdin); mp_read_radix(&d, buf, 10);
-          
+          fgets(buf, 4095, stdin); mp_read_radix(&a, buf, 64);
+          fgets(buf, 4095, stdin); mp_read_radix(&b, buf, 64);
+          fgets(buf, 4095, stdin); mp_read_radix(&c, buf, 64);
+          fgets(buf, 4095, stdin); mp_read_radix(&d, buf, 64);
+
           mp_div(&a, &b, &e, &f);
           if (mp_cmp(&c, &e) != MP_EQ || mp_cmp(&d, &f) != MP_EQ) {
-             printf("div %lu failure!\n", div_n); 
+             printf("div %lu failure!\n", div_n);
 draw(&a);draw(&b);draw(&c);draw(&d); draw(&e); draw(&f);
              return 0;
           }
-          
+
        } else if (!strcmp(cmd, "sqr")) { ++sqr_n;
-          fgets(buf, 4095, stdin);  mp_read_radix(&a, buf, 10);
-          fgets(buf, 4095, stdin);  mp_read_radix(&b, buf, 10);
+          fgets(buf, 4095, stdin);  mp_read_radix(&a, buf, 64);
+          fgets(buf, 4095, stdin);  mp_read_radix(&b, buf, 64);
           mp_copy(&a, &c);
           mp_sqr(&c, &c);
           if (mp_cmp(&b, &c) != MP_EQ) {
-             printf("sqr %lu failure!\n", sqr_n); 
+             printf("sqr %lu failure!\n", sqr_n);
 draw(&a);draw(&b);draw(&c);
              return 0;
           }
        } else if (!strcmp(cmd, "gcd")) { ++gcd_n;
-          fgets(buf, 4095, stdin);  mp_read_radix(&a, buf, 10);
-          fgets(buf, 4095, stdin);  mp_read_radix(&b, buf, 10);
-          fgets(buf, 4095, stdin);  mp_read_radix(&c, buf, 10);
+          fgets(buf, 4095, stdin);  mp_read_radix(&a, buf, 64);
+          fgets(buf, 4095, stdin);  mp_read_radix(&b, buf, 64);
+          fgets(buf, 4095, stdin);  mp_read_radix(&c, buf, 64);
           mp_copy(&a, &d);
           mp_gcd(&d, &b, &d);
           d.sign = c.sign;
           if (mp_cmp(&c, &d) != MP_EQ) {
-             printf("gcd %lu failure!\n", gcd_n); 
+             printf("gcd %lu failure!\n", gcd_n);
 draw(&a);draw(&b);draw(&c);draw(&d);
              return 0;
           }
        } else if (!strcmp(cmd, "lcm")) { ++lcm_n;
-             fgets(buf, 4095, stdin);  mp_read_radix(&a, buf, 10);
-             fgets(buf, 4095, stdin);  mp_read_radix(&b, buf, 10);
-             fgets(buf, 4095, stdin);  mp_read_radix(&c, buf, 10);
+             fgets(buf, 4095, stdin);  mp_read_radix(&a, buf, 64);
+             fgets(buf, 4095, stdin);  mp_read_radix(&b, buf, 64);
+             fgets(buf, 4095, stdin);  mp_read_radix(&c, buf, 64);
              mp_copy(&a, &d);
              mp_lcm(&d, &b, &d);
              d.sign = c.sign;
              if (mp_cmp(&c, &d) != MP_EQ) {
-                printf("lcm %lu failure!\n", lcm_n); 
+                printf("lcm %lu failure!\n", lcm_n);
    draw(&a);draw(&b);draw(&c);draw(&d);
                 return 0;
              }
        } else if (!strcmp(cmd, "expt")) {  ++expt_n;
-             fgets(buf, 4095, stdin);  mp_read_radix(&a, buf, 10);
-             fgets(buf, 4095, stdin);  mp_read_radix(&b, buf, 10);
-             fgets(buf, 4095, stdin);  mp_read_radix(&c, buf, 10);
-             fgets(buf, 4095, stdin);  mp_read_radix(&d, buf, 10);
+             fgets(buf, 4095, stdin);  mp_read_radix(&a, buf, 64);
+             fgets(buf, 4095, stdin);  mp_read_radix(&b, buf, 64);
+             fgets(buf, 4095, stdin);  mp_read_radix(&c, buf, 64);
+             fgets(buf, 4095, stdin);  mp_read_radix(&d, buf, 64);
              mp_copy(&a, &e);
              mp_exptmod(&e, &b, &c, &e);
              if (mp_cmp(&d, &e) != MP_EQ) {
-                printf("expt %lu failure!\n", expt_n); 
+                printf("expt %lu failure!\n", expt_n);
    draw(&a);draw(&b);draw(&c);draw(&d); draw(&e);
                 return 0;
              }
        } else if (!strcmp(cmd, "invmod")) {  ++inv_n;
-             fgets(buf, 4095, stdin);  mp_read_radix(&a, buf, 10);
-             fgets(buf, 4095, stdin);  mp_read_radix(&b, buf, 10);
-             fgets(buf, 4095, stdin);  mp_read_radix(&c, buf, 10);
+             fgets(buf, 4095, stdin);  mp_read_radix(&a, buf, 64);
+             fgets(buf, 4095, stdin);  mp_read_radix(&b, buf, 64);
+             fgets(buf, 4095, stdin);  mp_read_radix(&c, buf, 64);
              mp_invmod(&a, &b, &d);
              mp_mulmod(&d,&a,&b,&e);
              if (mp_cmp_d(&e, 1) != MP_EQ) {
@@ -460,10 +439,10 @@ draw(&a);draw(&b);draw(&c);draw(&d);
                 draw(&e);
                 return 0;
              }
-                
+
        } else if (!strcmp(cmd, "div2")) { ++div2_n;
-             fgets(buf, 4095, stdin);  mp_read_radix(&a, buf, 10);
-             fgets(buf, 4095, stdin);  mp_read_radix(&b, buf, 10);
+             fgets(buf, 4095, stdin);  mp_read_radix(&a, buf, 64);
+             fgets(buf, 4095, stdin);  mp_read_radix(&b, buf, 64);
              mp_div_2(&a, &c);
              if (mp_cmp(&c, &b) != MP_EQ) {
                  printf("div_2 %lu failure\n", div2_n);
@@ -473,8 +452,8 @@ draw(&a);draw(&b);draw(&c);draw(&d);
                  return 0;
              }
        } else if (!strcmp(cmd, "mul2")) { ++mul2_n;
-             fgets(buf, 4095, stdin);  mp_read_radix(&a, buf, 10);
-             fgets(buf, 4095, stdin);  mp_read_radix(&b, buf, 10);
+             fgets(buf, 4095, stdin);  mp_read_radix(&a, buf, 64);
+             fgets(buf, 4095, stdin);  mp_read_radix(&b, buf, 64);
              mp_mul_2(&a, &c);
              if (mp_cmp(&c, &b) != MP_EQ) {
                  printf("mul_2 %lu failure\n", mul2_n);
@@ -483,9 +462,9 @@ draw(&a);draw(&b);draw(&c);draw(&d);
                  draw(&c);
                  return 0;
              }
-       }             
-       
+       }
+
    }
-   return 0;   
+   return 0;
 }
 
diff --git a/demo/test.c b/demo/test.c
new file mode 100644
index 0000000..e69de29
diff --git a/etc/makefile b/etc/makefile
index 261cd1c..dce98da 100644
--- a/etc/makefile
+++ b/etc/makefile
@@ -1,23 +1,40 @@
 CFLAGS += -Wall -W -Wshadow -O3 -fomit-frame-pointer -funroll-loops -I../
 
-
 # default lib name (requires install with root)
 # LIBNAME=-ltommath
 
 # libname when you can't install the lib with install
 LIBNAME=../libtommath.a
 
+#provable primes
 pprime: pprime.o
 	$(CC) pprime.o $(LIBNAME) -o pprime
 
+# portable [well requires clock()] tuning app
 tune: tune.o
 	$(CC) tune.o $(LIBNAME) -o tune
+	
+# same app but using RDTSC for higher precision [requires 80586+], coff based gcc installs [e.g. ming, cygwin, djgpp]
+tune86: tune.c
+	nasm -f coff timer.asm
+	$(CC) -DX86_TIMER $(CFLAGS) tune.c timer.o  $(LIBNAME) -o tune86
+	
+#make tune86 for linux or any ELF format
+tune86l: tune.c
+	nasm -f elf -DUSE_ELF timer.asm
+	$(CC) -DX86_TIMER $(CFLAGS) tune.c timer.o $(LIBNAME) -o tune86l
         
+# spits out mersenne primes
 mersenne: mersenne.o
 	$(CC) mersenne.o $(LIBNAME) -o mersenne
 
+# finds DR safe primes for the given config
 drprime: drprime.o
 	$(CC) drprime.o $(LIBNAME) -o drprime
+	
+mont: mont.o
+	$(CC) mont.o $(LIBNAME) -o mont
+
         
 clean:
-	rm -f *.log *.o *.obj *.exe pprime tune mersenne drprime
\ No newline at end of file
+	rm -f *.log *.o *.obj *.exe pprime tune mersenne drprime tune86 tune86l mont
\ No newline at end of file
diff --git a/etc/mont.c b/etc/mont.c
new file mode 100644
index 0000000..af6fd7a
--- /dev/null
+++ b/etc/mont.c
@@ -0,0 +1,45 @@
+/* tests the montgomery routines */
+#include <tommath.h>
+
+int main(void)
+{
+   mp_int modulus, R, p, pp;
+   mp_digit mp;
+   long x, y;
+
+   mp_init_multi(&modulus, &R, &p, &pp, NULL);
+
+   /* loop through various sizes */
+   for (x = 4; x < 128; x++) {
+       printf("DIGITS == %3ld...", x); fflush(stdout);
+       
+       /* make up the odd modulus */
+       mp_rand(&modulus, x);
+       modulus.dp[0] |= 1;
+       
+       /* now find the R value */
+       mp_montgomery_calc_normalization(&R, &modulus);
+       mp_montgomery_setup(&modulus, &mp);
+       
+       /* now run through a bunch tests */
+       for (y = 0; y < 100000; y++) {
+           mp_rand(&p, x/2);        /* p = random */
+           mp_mul(&p, &R, &pp);     /* pp = R * p */
+           mp_montgomery_reduce(&pp, &modulus, mp);
+           
+           /* should be equal to p */
+           if (mp_cmp(&pp, &p) != MP_EQ) {
+              printf("FAILURE!\n");
+              exit(-1);
+           }
+       }
+       printf("PASSED\n");
+    }
+    
+    return 0;
+}
+
+
+
+
+
diff --git a/etc/timer.asm b/etc/timer.asm
new file mode 100644
index 0000000..35890d9
--- /dev/null
+++ b/etc/timer.asm
@@ -0,0 +1,37 @@
+; x86 timer in NASM
+;
+; Tom St Denis, tomstdenis@iahu.ca
+[bits 32]
+[section .data]
+time dd 0, 0
+
+[section .text]
+
+%ifdef USE_ELF
+[global t_start]
+t_start:
+%else
+[global _t_start]
+_t_start:
+%endif
+   push edx
+   push eax
+   rdtsc
+   mov [time+0],edx
+   mov [time+4],eax
+   pop eax
+   pop edx
+   ret
+   
+%ifdef USE_ELF
+[global t_read]
+t_read:
+%else
+[global _t_read]
+_t_read:
+%endif
+   rdtsc
+   sub eax,[time+4]
+   sbb edx,[time+0]
+   ret
+   
\ No newline at end of file
diff --git a/etc/tune.c b/etc/tune.c
index 0346677..5648496 100644
--- a/etc/tune.c
+++ b/etc/tune.c
@@ -5,10 +5,21 @@
 #include <tommath.h>
 #include <time.h>
 
-clock_t
+#ifndef X86_TIMER
+
+/* generic ISO C timer */
+unsigned long long __T;
+void t_start(void) { __T = clock(); }
+unsigned long long t_read(void) { return clock() - __T; }
+
+#else
+extern void t_start(void);
+extern unsigned long long t_read(void);
+#endif
+
+unsigned long long
 time_mult (void)
 {
-  clock_t t1;
   int     x, y;
   mp_int  a, b, c;
 
@@ -16,137 +27,83 @@ time_mult (void)
   mp_init (&b);
   mp_init (&c);
 
-  t1 = clock ();
-  for (x = 4; x <= 144; x += 4) {
+  t_start();
+  for (x = 32; x <= 288; x += 4) {
     mp_rand (&a, x);
     mp_rand (&b, x);
-    for (y = 0; y < 10000; y++) {
+    for (y = 0; y < 100; y++) {
       mp_mul (&a, &b, &c);
     }
   }
   mp_clear (&a);
   mp_clear (&b);
   mp_clear (&c);
-  return clock () - t1;
+  return t_read();
 }
 
-clock_t
+unsigned long long
 time_sqr (void)
 {
-  clock_t t1;
   int     x, y;
   mp_int  a, b;
 
   mp_init (&a);
   mp_init (&b);
 
-  t1 = clock ();
-  for (x = 4; x <= 144; x += 4) {
+  t_start();
+  for (x = 32; x <= 288; x += 4) {
     mp_rand (&a, x);
-    for (y = 0; y < 10000; y++) {
+    for (y = 0; y < 100; y++) {
       mp_sqr (&a, &b);
     }
   }
   mp_clear (&a);
   mp_clear (&b);
-  return clock () - t1;
-}
-
-clock_t
-time_expt (void)
-{
-  clock_t t1;
-  int     x, y;
-  mp_int  a, b, c, d;
-
-  mp_init (&a);
-  mp_init (&b);
-  mp_init (&c);
-  mp_init (&d);
-
-  t1 = clock ();
-  for (x = 4; x <= 144; x += 4) {
-    mp_rand (&a, x);
-    mp_rand (&b, x);
-    mp_rand (&c, x);
-    if (mp_iseven (&c) != 0) {
-      mp_add_d (&c, 1, &c);
-    }
-    for (y = 0; y < 10; y++) {
-      mp_exptmod (&a, &b, &c, &d);
-    }
-  }
-  mp_clear (&d);
-  mp_clear (&c);
-  mp_clear (&b);
-  mp_clear (&a);
-
-  return clock () - t1;
+  return t_read();
 }
 
 int
 main (void)
 {
-  int     best_mult, best_square, best_exptmod;
-  clock_t best, ti;
+  int     best_mult, best_square;
+  unsigned long long best, ti;
   FILE   *log;
 
-  best_mult = best_square = best_exptmod = 0;
-
+  best_mult = best_square = 0;
   /* tune multiplication first */
   log = fopen ("mult.log", "w");
-  best = CLOCKS_PER_SEC * 1000;
-  for (KARATSUBA_MUL_CUTOFF = 8; KARATSUBA_MUL_CUTOFF <= 144; KARATSUBA_MUL_CUTOFF++) {
+  best = -1;
+  for (KARATSUBA_MUL_CUTOFF = 8; KARATSUBA_MUL_CUTOFF <= 200; KARATSUBA_MUL_CUTOFF++) {
     ti = time_mult ();
-    printf ("%4d : %9lu\r", KARATSUBA_MUL_CUTOFF, ti);
-    fprintf (log, "%d, %lu\n", KARATSUBA_MUL_CUTOFF, ti);
+    printf ("%4d : %9llu\r", KARATSUBA_MUL_CUTOFF, ti);
+    fprintf (log, "%d, %llu\n", KARATSUBA_MUL_CUTOFF, ti);
     fflush (stdout);
     if (ti < best) {
-      printf ("New best: %lu, %d         \n", ti, KARATSUBA_MUL_CUTOFF);
+      printf ("New best: %llu, %d         \n", ti, KARATSUBA_MUL_CUTOFF);
       best = ti;
       best_mult = KARATSUBA_MUL_CUTOFF;
     }
   }
   fclose (log);
-
   /* tune squaring */
   log = fopen ("sqr.log", "w");
-  best = CLOCKS_PER_SEC * 1000;
-  for (KARATSUBA_SQR_CUTOFF = 8; KARATSUBA_SQR_CUTOFF <= 144; KARATSUBA_SQR_CUTOFF++) {
+  best = -1;
+  for (KARATSUBA_SQR_CUTOFF = 8; KARATSUBA_SQR_CUTOFF <= 200; KARATSUBA_SQR_CUTOFF++) {
     ti = time_sqr ();
-    printf ("%4d : %9lu\r", KARATSUBA_SQR_CUTOFF, ti);
-    fprintf (log, "%d, %lu\n", KARATSUBA_SQR_CUTOFF, ti);
+    printf ("%4d : %9llu\r", KARATSUBA_SQR_CUTOFF, ti);
+    fprintf (log, "%d, %llu\n", KARATSUBA_SQR_CUTOFF, ti);
     fflush (stdout);
     if (ti < best) {
-      printf ("New best: %lu, %d         \n", ti, KARATSUBA_SQR_CUTOFF);
+      printf ("New best: %llu, %d         \n", ti, KARATSUBA_SQR_CUTOFF);
       best = ti;
       best_square = KARATSUBA_SQR_CUTOFF;
     }
   }
   fclose (log);
 
-  /* tune exptmod */
-  KARATSUBA_MUL_CUTOFF = best_mult;
-  KARATSUBA_SQR_CUTOFF = best_square;
-
-  log = fopen ("expt.log", "w");
-  best = CLOCKS_PER_SEC * 1000;
-  for (MONTGOMERY_EXPT_CUTOFF = 8; MONTGOMERY_EXPT_CUTOFF <= 144; MONTGOMERY_EXPT_CUTOFF++) {
-    ti = time_expt ();
-    printf ("%4d : %9lu\r", MONTGOMERY_EXPT_CUTOFF, ti);
-    fflush (stdout);
-    fprintf (log, "%d : %lu\r", MONTGOMERY_EXPT_CUTOFF, ti);
-    if (ti < best) {
-      printf ("New best: %lu, %d\n", ti, MONTGOMERY_EXPT_CUTOFF);
-      best = ti;
-      best_exptmod = MONTGOMERY_EXPT_CUTOFF;
-    }
-  }
-  fclose (log);
-
   printf
-    ("\n\n\nKaratsuba Multiplier Cutoff: %d\nKaratsuba Squaring Cutoff: %d\nMontgomery exptmod Cutoff: %d\n",
-     best_mult, best_square, best_exptmod);
+    ("\n\n\nKaratsuba Multiplier Cutoff: %d\nKaratsuba Squaring Cutoff: %d\n",
+     best_mult, best_square);
 
   return 0;
 }
diff --git a/gen.pl b/gen.pl
index fcfd57d..e6009d9 100644
--- a/gen.pl
+++ b/gen.pl
@@ -1,27 +1,18 @@
-#!/usr/bin/perl
+#!/usr/bin/perl -w
 #
-#Generates a "single file" you can use to quickly add the whole source 
-#without any makefile troubles
+# Generates a "single file" you can use to quickly
+# add the whole source without any makefile troubles
 #
+use strict;
 
-opendir(DIR,".");
-@files = readdir(DIR);
-closedir(DIR);
-
-open(OUT,">mpi.c");
-print OUT "/* File Generated Automatically by gen.pl */\n\n";
-for (@files) {
-   if ($_ =~ /\.c/ && !($_ =~ /mpi\.c/)) {
-      $fname = $_;
-      open(SRC,"<$fname");
-      print OUT "/* Start: $fname */\n";
-      while (<SRC>) {
-         print OUT $_;
-      }
-      close(SRC);
-      print OUT "\n/* End: $fname */\n\n";
-   }
+open( OUT, ">mpi.c" ) or die "Couldn't open mpi.c for writing: $!";
+foreach my $filename (glob "bn_*.c") {
+   open( SRC, "<$filename" ) or die "Couldn't open $filename for reading: $!";
+   print OUT "/* Start: $filename */\n";
+   print OUT qq[#line 0 "$filename"\n];
+   print OUT while <SRC>;
+   print OUT "\n/* End: $filename */\n\n";
+   close SRC or die "Error closing $filename after reading: $!";
 }
-print OUT "\n/* EOF */\n";
-close(OUT);
-   
\ No newline at end of file
+print OUT "\n/* EOF */\n";   # blank line, then the end-of-file marker comment
+close OUT or die "Error closing mpi.c after writing: $!";
\ No newline at end of file
diff --git a/logs/README b/logs/README
new file mode 100644
index 0000000..ea20c81
--- /dev/null
+++ b/logs/README
@@ -0,0 +1,13 @@
+To use the pretty graphs you have to first build/run the ltmtest from the root directory of the package.  
+To do this, type
+
+make timing ; ltmtest
+
+in the root.  It will run for a while [about ten minutes on most PCs] and produce a series of .log files in logs/.
+
+After doing that, run "gnuplot graphs.dem" to make the PNGs.  If you managed to do all that, just open index.html to view
+them all :-)
+
+Have fun
+
+Tom
\ No newline at end of file
diff --git a/logs/add.log b/logs/add.log
new file mode 100644
index 0000000..1e144e8
--- /dev/null
+++ b/logs/add.log
@@ -0,0 +1,16 @@
+224  11039864
+448   9206336
+672   8178200
+896   7432176
+1120   6433264
+1344   5847056
+1568   5270184
+1792   4943416
+2016   4520016
+2240   4256168
+2464   3999224
+2688   3714896
+2912   3572720
+3136   3340176
+3360   3222584
+3584   3036336
diff --git a/logs/addsub.png b/logs/addsub.png
new file mode 100644
index 0000000..1113ed3
Binary files /dev/null and b/logs/addsub.png differ
diff --git a/logs/expt.log b/logs/expt.log
new file mode 100644
index 0000000..fb0b718
--- /dev/null
+++ b/logs/expt.log
@@ -0,0 +1,7 @@
+14364       666
+21532       253
+28700       117
+57372        17
+71708         9
+86044         5
+114716         2
diff --git a/logs/expt.png b/logs/expt.png
new file mode 100644
index 0000000..b534a9b
Binary files /dev/null and b/logs/expt.png differ
diff --git a/logs/expt_dr.log b/logs/expt_dr.log
new file mode 100644
index 0000000..f80a9ee
--- /dev/null
+++ b/logs/expt_dr.log
@@ -0,0 +1,7 @@
+14896      1088
+21952       468
+29008       244
+43120        91
+58016        43
+86240        15
+115248         6
diff --git a/logs/graphs.dem b/logs/graphs.dem
new file mode 100644
index 0000000..4441c0d
--- /dev/null
+++ b/logs/graphs.dem
@@ -0,0 +1,17 @@
+set terminal png color
+set size 1.5
+set ylabel "Operations per Second"
+set xlabel "Operand size (bits)"
+
+set output "addsub.png"
+plot 'add.log' smooth bezier title "Addition", 'sub.log' smooth bezier title "Subtraction"
+
+set output "mult.png"
+plot 'sqr.log' smooth bezier title "Squaring (without Karatsuba)", 'sqr_kara.log' smooth bezier title "Squaring (Karatsuba)", 'mult.log' smooth bezier title "Multiplication (without Karatsuba)", 'mult_kara.log' smooth bezier title "Multiplication (Karatsuba)"
+
+set output "expt.png"
+plot 'expt.log' smooth bezier title "Exptmod (Montgomery)", 'expt_dr.log' smooth bezier title "Exptmod (Dimminished Radix)"
+
+set output "invmod.png"
+plot 'invmod.log' smooth bezier title "Modular Inverse"
+
diff --git a/logs/index.html b/logs/index.html
new file mode 100644
index 0000000..f3a5562
--- /dev/null
+++ b/logs/index.html
@@ -0,0 +1,24 @@
+<html>
+<head>
+<title>LibTomMath Log Plots</title>
+</head>
+<body>
+
+<h1>Addition and Subtraction</h1>
+<center><img src=addsub.png></center>
+<hr>
+
+<h1>Multipliers</h1>
+<center><img src=mult.png></center>
+<hr>
+
+<h1>Exptmod</h1>
+<center><img src=expt.png></center>
+<hr>
+
+<h1>Modular Inverse</h1>
+<center><img src=invmod.png></center>
+<hr>
+
+</body>
+</html>
\ No newline at end of file
diff --git a/logs/invmod.log b/logs/invmod.log
new file mode 100644
index 0000000..e84ba9f
--- /dev/null
+++ b/logs/invmod.log
@@ -0,0 +1,32 @@
+112     15608
+224      7840
+336      5104
+448      3376
+560      2616
+672      1984
+784      1640
+896      2056
+1008      1136
+1120       936
+1232      1240
+1344      1112
+1456       608
+1568       873
+1680       492
+1792       444
+1904       640
+2016       584
+2128       328
+2240       307
+2352       283
+2464       256
+2576       393
+2688       365
+2800       344
+2912       196
+3024       301
+3136       170
+3248       160
+3360       250
+3472       144
+3584       224
diff --git a/logs/invmod.png b/logs/invmod.png
new file mode 100644
index 0000000..a38bfd5
Binary files /dev/null and b/logs/invmod.png differ
diff --git a/logs/mult.log b/logs/mult.log
new file mode 100644
index 0000000..835dc52
--- /dev/null
+++ b/logs/mult.log
@@ -0,0 +1,17 @@
+896    321504
+1344    150784
+1792     90288
+2240     59760
+2688     42480
+3136     32056
+3584     24600
+4032     19656
+4480     16024
+4928     13328
+5376     11280
+5824      9624
+6272      8336
+6720      7280
+7168      1648
+7616      1464
+8064      1296
diff --git a/logs/mult.png b/logs/mult.png
new file mode 100644
index 0000000..c49a434
Binary files /dev/null and b/logs/mult.png differ
diff --git a/logs/mult_kara.log b/logs/mult_kara.log
new file mode 100644
index 0000000..0babf2e
--- /dev/null
+++ b/logs/mult_kara.log
@@ -0,0 +1,17 @@
+896    321928
+1344    150752
+1792     90136
+2240     59888
+2688     42480
+3136     32080
+3584     25744
+4032     21216
+4480     17912
+4928     14896
+5376     12936
+5824     11216
+6272      9848
+6720      8896
+7168      7968
+7616      7248
+8064      6600
diff --git a/logs/sqr.log b/logs/sqr.log
new file mode 100644
index 0000000..2ed78eb
--- /dev/null
+++ b/logs/sqr.log
@@ -0,0 +1,17 @@
+896    416968
+1344    223672
+1792    141552
+2240     97280
+2688     71304
+3136     54648
+3584     16264
+4032     13000
+4480     10528
+4928      8776
+5376      7464
+5824      6440
+6272      5520
+6720      4808
+7168      4264
+7616      3784
+8064      3368
diff --git a/logs/sqr_kara.log b/logs/sqr_kara.log
new file mode 100644
index 0000000..b890211
--- /dev/null
+++ b/logs/sqr_kara.log
@@ -0,0 +1,17 @@
+896    416656
+1344    223728
+1792    141288
+2240     97456
+2688     71152
+3136     54392
+3584     38552
+4032     32216
+4480     27384
+4928     23792
+5376     20728
+5824     18232
+6272     16160
+6720     14408
+7168     11696
+7616     10768
+8064      9920
diff --git a/logs/sub.log b/logs/sub.log
new file mode 100644
index 0000000..14c519d
--- /dev/null
+++ b/logs/sub.log
@@ -0,0 +1,16 @@
+224   9862520
+448   8562344
+672   7661400
+896   6838128
+1120   5911144
+1344   5394040
+1568   4993760
+1792   4624240
+2016   4332024
+2240   4029312
+2464   3790784
+2688   3587216
+2912   3397952
+3136   3239736
+3360   3080616
+3584   2933104
diff --git a/makefile b/makefile
index 8466163..4f5a627 100644
--- a/makefile
+++ b/makefile
@@ -1,6 +1,6 @@
 CFLAGS  +=  -I./ -Wall -W -Wshadow -O3 -fomit-frame-pointer -funroll-loops
 
-VERSION=0.16
+VERSION=0.17
 
 default: libtommath.a
 
@@ -32,7 +32,8 @@ bn_mp_count_bits.o bn_mp_read_unsigned_bin.o bn_mp_read_signed_bin.o bn_mp_to_un
 bn_mp_to_signed_bin.o bn_mp_unsigned_bin_size.o bn_mp_signed_bin_size.o bn_radix.o \
 bn_mp_xor.o bn_mp_and.o bn_mp_or.o bn_mp_rand.o bn_mp_montgomery_calc_normalization.o \
 bn_mp_prime_is_divisible.o bn_prime_tab.o bn_mp_prime_fermat.o bn_mp_prime_miller_rabin.o \
-bn_mp_prime_is_prime.o bn_mp_prime_next_prime.o bn_mp_dr_reduce.o 
+bn_mp_prime_is_prime.o bn_mp_prime_next_prime.o bn_mp_dr_reduce.o bn_mp_multi.o \
+bn_mp_dr_is_modulus.o bn_mp_dr_setup.o
 
 libtommath.a:  $(OBJECTS)
 	$(AR) $(ARFLAGS) libtommath.a $(OBJECTS)
@@ -52,21 +53,46 @@ test: libtommath.a demo/demo.o
         
 timing: libtommath.a
 	$(CC) $(CFLAGS) -DTIMER demo/demo.c libtommath.a -o ltmtest -s
-	$(CC) $(CFLAGS) -DTIMER -DU_MPI -I./mtest/ demo/demo.c mtest/mpi.c -o mpitest -s
 
-docdvi: bn.tex
-	latex bn
+# makes the LTM book DVI file, requires tetex, perl and makeindex [part of tetex I think]
+docdvi: tommath.src
+	cd pics ; make 
+	echo "hello" > tommath.ind
+	perl booker.pl
+	latex tommath > /dev/null
+	makeindex tommath
+	latex tommath > /dev/null
+		
+# makes the LTM book PS/PDF file, requires tetex, cleans up the LaTeX temp files
+docs:	
+	cd pics ; make pdfes
+	echo "hello" > tommath.ind
+	perl booker.pl
+	latex tommath > /dev/null
+	makeindex tommath
+	latex tommath > /dev/null
+	dvips -tB5 -D600 tommath
+	echo "hello" > tommath.ind
+	perl booker.pl PDF
+	latex tommath > /dev/null
+	makeindex tommath
+	latex tommath > /dev/null
+	pdflatex tommath
+	rm -f tommath.log tommath.aux tommath.dvi tommath.idx tommath.toc tommath.lof tommath.ind tommath.ilg
 	
-docs:	docdvi
+#the old manual being phased out
+manual:	
+	latex bn
 	pdflatex bn
-	rm -f bn.log bn.aux bn.dvi
+	rm -f bn.aux bn.dvi bn.log 	
 	
 clean:
 	rm -f *.pdf *.o *.a *.obj *.lib *.exe etclib/*.o demo/demo.o test ltmtest mpitest mtest/mtest mtest/mtest.exe \
-        bn.log bn.aux bn.dvi *.log *.s mpi.c 
+        tommath.idx tommath.toc tommath.log tommath.aux tommath.dvi tommath.lof tommath.ind tommath.ilg *.ps *.pdf *.log *.s mpi.c 
 	cd etc ; make clean
+	cd pics ; make clean
 
-zipup: clean docs
+zipup: clean manual
 	perl gen.pl ; mv mpi.c pre_gen/ ; \
 	cd .. ; rm -rf ltm* libtommath-$(VERSION) ; mkdir libtommath-$(VERSION) ; \
 	cp -R ./libtommath/* ./libtommath-$(VERSION)/ ; tar -c libtommath-$(VERSION)/* > ltm-$(VERSION).tar ; \
diff --git a/makefile.msvc b/makefile.msvc
index 4daf310..dcc14b1 100644
--- a/makefile.msvc
+++ b/makefile.msvc
@@ -22,7 +22,8 @@ bn_mp_count_bits.obj bn_mp_read_unsigned_bin.obj bn_mp_read_signed_bin.obj bn_mp
 bn_mp_to_signed_bin.obj bn_mp_unsigned_bin_size.obj bn_mp_signed_bin_size.obj bn_radix.obj \
 bn_mp_xor.obj bn_mp_and.obj bn_mp_or.obj bn_mp_rand.obj bn_mp_montgomery_calc_normalization.obj \
 bn_mp_prime_is_divisible.obj bn_prime_tab.obj bn_mp_prime_fermat.obj bn_mp_prime_miller_rabin.obj \
-bn_mp_prime_is_prime.obj bn_mp_prime_next_prime.obj bn_mp_dr_reduce.obj
+bn_mp_prime_is_prime.obj bn_mp_prime_next_prime.obj bn_mp_dr_reduce.obj bn_mp_multi.obj \
+bn_mp_dr_is_modulus.obj bn_mp_dr_setup.obj
 
 
 library: $(OBJECTS)
diff --git a/mtest/mtest.c b/mtest/mtest.c
index fe02906..086e7bc 100644
--- a/mtest/mtest.c
+++ b/mtest/mtest.c
@@ -10,7 +10,7 @@ result1
 result2
 [... resultN]
 
-So for example "a * b mod n" would be 
+So for example "a * b mod n" would be
 
 mulmod
 a
@@ -18,7 +18,7 @@ b
 n
 a*b mod n
 
-e.g. if a=3, b=4 n=11 then 
+e.g. if a=3, b=4 n=11 then
 
 mulmod
 3
@@ -38,10 +38,10 @@ FILE *rng;
 void rand_num(mp_int *a)
 {
    int n, size;
-   unsigned char buf[512];
+   unsigned char buf[2048];
 
 top:
-   size = 1 + ((fgetc(rng)*fgetc(rng)) % 96);
+   size = 1 + ((fgetc(rng)*fgetc(rng)) % 1024);
    buf[0] = (fgetc(rng)&1)?1:0;
    fread(buf+1, 1, size, rng);
    for (n = 0; n < size; n++) {
@@ -54,7 +54,7 @@ top:
 void rand_num2(mp_int *a)
 {
    int n, size;
-   unsigned char buf[512];
+   unsigned char buf[2048];
 
 top:
    size = 1 + ((fgetc(rng)*fgetc(rng)) % 96);
@@ -67,18 +67,38 @@ top:
    mp_read_raw(a, buf, 1+size);
 }
 
+#define mp_to64(a, b) mp_toradix(a, b, 64)
+
 int main(void)
 {
    int n;
    mp_int a, b, c, d, e;
    char buf[4096];
-   
+
    mp_init(&a);
    mp_init(&b);
    mp_init(&c);
    mp_init(&d);
    mp_init(&e);
 
+
+   /* initial (2^n - 1)^2 testing, makes sure the comba multiplier works [it has the new carry code] */
+/*
+   mp_set(&a, 1);
+   for (n = 1; n < 8192; n++) {
+       mp_mul(&a, &a, &c);
+       printf("mul\n");
+       mp_to64(&a, buf);
+       printf("%s\n%s\n", buf, buf);
+       mp_to64(&c, buf);
+       printf("%s\n", buf);
+
+       mp_add_d(&a, 1, &a);
+       mp_mul_2(&a, &a);
+       mp_sub_d(&a, 1, &a);
+   }
+*/
+
    rng = fopen("/dev/urandom", "rb");
    if (rng == NULL) {
       rng = fopen("/dev/random", "rb");
@@ -97,11 +117,11 @@ int main(void)
        rand_num(&b);
        mp_add(&a, &b, &c);
        printf("add\n");
-       mp_todecimal(&a, buf);
+       mp_to64(&a, buf);
        printf("%s\n", buf);
-       mp_todecimal(&b, buf);
+       mp_to64(&b, buf);
        printf("%s\n", buf);
-       mp_todecimal(&c, buf);
+       mp_to64(&c, buf);
        printf("%s\n", buf);
    } else if (n == 1) {
       /* sub tests */
@@ -109,11 +129,11 @@ int main(void)
        rand_num(&b);
        mp_sub(&a, &b, &c);
        printf("sub\n");
-       mp_todecimal(&a, buf);
+       mp_to64(&a, buf);
        printf("%s\n", buf);
-       mp_todecimal(&b, buf);
+       mp_to64(&b, buf);
        printf("%s\n", buf);
-       mp_todecimal(&c, buf);
+       mp_to64(&c, buf);
        printf("%s\n", buf);
    } else if (n == 2) {
        /* mul tests */
@@ -121,11 +141,11 @@ int main(void)
        rand_num(&b);
        mp_mul(&a, &b, &c);
        printf("mul\n");
-       mp_todecimal(&a, buf);
+       mp_to64(&a, buf);
        printf("%s\n", buf);
-       mp_todecimal(&b, buf);
+       mp_to64(&b, buf);
        printf("%s\n", buf);
-       mp_todecimal(&c, buf);
+       mp_to64(&c, buf);
        printf("%s\n", buf);
    } else if (n == 3) {
       /* div tests */
@@ -133,22 +153,22 @@ int main(void)
        rand_num(&b);
        mp_div(&a, &b, &c, &d);
        printf("div\n");
-       mp_todecimal(&a, buf);
+       mp_to64(&a, buf);
        printf("%s\n", buf);
-       mp_todecimal(&b, buf);
+       mp_to64(&b, buf);
        printf("%s\n", buf);
-       mp_todecimal(&c, buf);
+       mp_to64(&c, buf);
        printf("%s\n", buf);
-       mp_todecimal(&d, buf);
+       mp_to64(&d, buf);
        printf("%s\n", buf);
    } else if (n == 4) {
       /* sqr tests */
        rand_num(&a);
        mp_sqr(&a, &b);
        printf("sqr\n");
-       mp_todecimal(&a, buf);
+       mp_to64(&a, buf);
        printf("%s\n", buf);
-       mp_todecimal(&b, buf);
+       mp_to64(&b, buf);
        printf("%s\n", buf);
    } else if (n == 5) {
       /* mul_2d test */
@@ -156,11 +176,11 @@ int main(void)
       mp_copy(&a, &b);
       n = fgetc(rng) & 63;
       mp_mul_2d(&b, n, &b);
-      mp_todecimal(&a, buf);
+      mp_to64(&a, buf);
       printf("mul2d\n");
       printf("%s\n", buf);
       printf("%d\n", n);
-      mp_todecimal(&b, buf);
+      mp_to64(&b, buf);
       printf("%s\n", buf);
    } else if (n == 6) {
       /* div_2d test */
@@ -168,11 +188,11 @@ int main(void)
       mp_copy(&a, &b);
       n = fgetc(rng) & 63;
       mp_div_2d(&b, n, &b, NULL);
-      mp_todecimal(&a, buf);
+      mp_to64(&a, buf);
       printf("div2d\n");
       printf("%s\n", buf);
       printf("%d\n", n);
-      mp_todecimal(&b, buf);
+      mp_to64(&b, buf);
       printf("%s\n", buf);
    } else if (n == 7) {
       /* gcd test */
@@ -182,12 +202,12 @@ int main(void)
       b.sign = MP_ZPOS;
       mp_gcd(&a, &b, &c);
       printf("gcd\n");
-      mp_todecimal(&a, buf);
-      printf("%s\n", buf);      
-      mp_todecimal(&b, buf);
-      printf("%s\n", buf);      
-      mp_todecimal(&c, buf);
-      printf("%s\n", buf);      
+      mp_to64(&a, buf);
+      printf("%s\n", buf);
+      mp_to64(&b, buf);
+      printf("%s\n", buf);
+      mp_to64(&c, buf);
+      printf("%s\n", buf);
    } else if (n == 8) {
       /* lcm test */
       rand_num(&a);
@@ -196,12 +216,12 @@ int main(void)
       b.sign = MP_ZPOS;
       mp_lcm(&a, &b, &c);
       printf("lcm\n");
-      mp_todecimal(&a, buf);
-      printf("%s\n", buf);      
-      mp_todecimal(&b, buf);
-      printf("%s\n", buf);      
-      mp_todecimal(&c, buf);
-      printf("%s\n", buf);      
+      mp_to64(&a, buf);
+      printf("%s\n", buf);
+      mp_to64(&b, buf);
+      printf("%s\n", buf);
+      mp_to64(&c, buf);
+      printf("%s\n", buf);
    } else if (n == 9) {
       /* exptmod test */
       rand_num2(&a);
@@ -210,14 +230,14 @@ int main(void)
       a.sign = b.sign = c.sign = 0;
       mp_exptmod(&a, &b, &c, &d);
       printf("expt\n");
-      mp_todecimal(&a, buf);
-      printf("%s\n", buf);      
-      mp_todecimal(&b, buf);
-      printf("%s\n", buf);      
-      mp_todecimal(&c, buf);
-      printf("%s\n", buf);      
-      mp_todecimal(&d, buf);
-      printf("%s\n", buf);      
+      mp_to64(&a, buf);
+      printf("%s\n", buf);
+      mp_to64(&b, buf);
+      printf("%s\n", buf);
+      mp_to64(&c, buf);
+      printf("%s\n", buf);
+      mp_to64(&d, buf);
+      printf("%s\n", buf);
    } else if (n == 10) {
       /* invmod test */
       rand_num2(&a);
@@ -229,28 +249,28 @@ int main(void)
       if (mp_cmp_d(&b, 1) == 0) continue;
       mp_invmod(&a, &b, &c);
       printf("invmod\n");
-      mp_todecimal(&a, buf);
-      printf("%s\n", buf);      
-      mp_todecimal(&b, buf);
-      printf("%s\n", buf);      
-      mp_todecimal(&c, buf);
-      printf("%s\n", buf);      
+      mp_to64(&a, buf);
+      printf("%s\n", buf);
+      mp_to64(&b, buf);
+      printf("%s\n", buf);
+      mp_to64(&c, buf);
+      printf("%s\n", buf);
    } else if (n == 11) {
       rand_num(&a);
       mp_mul_2(&a, &a);
       mp_div_2(&a, &b);
       printf("div2\n");
-      mp_todecimal(&a, buf);
-      printf("%s\n", buf);      
-      mp_todecimal(&b, buf);
+      mp_to64(&a, buf);
+      printf("%s\n", buf);
+      mp_to64(&b, buf);
       printf("%s\n", buf);
    } else if (n == 12) {
       rand_num2(&a);
       mp_mul_2(&a, &b);
       printf("mul2\n");
-      mp_todecimal(&a, buf);
-      printf("%s\n", buf);      
-      mp_todecimal(&b, buf);
+      mp_to64(&a, buf);
+      printf("%s\n", buf);
+      mp_to64(&b, buf);
       printf("%s\n", buf);
    }
    }
diff --git a/pics/makefile b/pics/makefile
new file mode 100644
index 0000000..4be4899
--- /dev/null
+++ b/pics/makefile
@@ -0,0 +1,17 @@
+# makes the images... yeah
+
+default:  pses
+
+
+sliding_window.ps: sliding_window.tif
+	tiff2ps -c -e sliding_window.tif > sliding_window.ps
+
+sliding_window.pdf: sliding_window.ps
+	epstopdf sliding_window.ps
+
+pses: sliding_window.ps 
+pdfes: sliding_window.pdf
+
+clean:
+	rm -rf *.ps *.pdf .xvpics
+   
\ No newline at end of file
diff --git a/pics/sliding_window.TIF b/pics/sliding_window.TIF
new file mode 100644
index 0000000..bb4cb96
Binary files /dev/null and b/pics/sliding_window.TIF differ
diff --git a/pics/sliding_window.sxd b/pics/sliding_window.sxd
new file mode 100644
index 0000000..91e7c0d
Binary files /dev/null and b/pics/sliding_window.sxd differ
diff --git a/pre_gen/mpi.c b/pre_gen/mpi.c
index 3921dc4..bd6f2ce 100644
--- a/pre_gen/mpi.c
+++ b/pre_gen/mpi.c
@@ -1,6051 +1,6356 @@
-/* File Generated Automatically by gen.pl */
-
-/* Start: bncore.c */
-/* LibTomMath, multiple-precision integer library -- Tom St Denis
- *
- * LibTomMath is library that provides for multiple-precision
- * integer arithmetic as well as number theoretic functionality.
- *
- * The library is designed directly after the MPI library by
- * Michael Fromberger but has been written from scratch with
- * additional optimizations in place.
- *
- * The library is free for all purposes without any express
- * guarantee it works.
- *
- * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
- */
-#include <tommath.h>
-
-/* configured for a AMD Duron Morgan core with etc/tune.c */
-int     KARATSUBA_MUL_CUTOFF = 73,	/* Min. number of digits before Karatsuba multiplication is used. */
-        KARATSUBA_SQR_CUTOFF = 121,	/* Min. number of digits before Karatsuba squaring is used. */
-        MONTGOMERY_EXPT_CUTOFF = 128;	/* max. number of digits that montgomery reductions will help for */
-
-/* End: bncore.c */
-
-/* Start: bn_fast_mp_invmod.c */
-/* LibTomMath, multiple-precision integer library -- Tom St Denis
- *
- * LibTomMath is library that provides for multiple-precision
- * integer arithmetic as well as number theoretic functionality.
- *
- * The library is designed directly after the MPI library by
- * Michael Fromberger but has been written from scratch with
- * additional optimizations in place.
- *
- * The library is free for all purposes without any express
- * guarantee it works.
- *
- * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
- */
-#include <tommath.h>
-
-/* computes the modular inverse via binary extended euclidean algorithm, 
- * that is c = 1/a mod b 
- *
- * Based on mp_invmod except this is optimized for the case where b is 
- * odd as per HAC Note 14.64 on pp. 610
- */
-int
-fast_mp_invmod (mp_int * a, mp_int * b, mp_int * c)
-{
-  mp_int  x, y, u, v, B, D;
-  int     res, neg;
-
-  /* init all our temps */
-  if ((res = mp_init (&x)) != MP_OKAY) {
-    goto __ERR;
-  }
-
-  if ((res = mp_init (&y)) != MP_OKAY) {
-    goto __X;
-  }
-
-  if ((res = mp_init (&u)) != MP_OKAY) {
-    goto __Y;
-  }
-
-  if ((res = mp_init (&v)) != MP_OKAY) {
-    goto __U;
-  }
-
-  if ((res = mp_init (&B)) != MP_OKAY) {
-    goto __V;
-  }
-
-  if ((res = mp_init (&D)) != MP_OKAY) {
-    goto __B;
-  }
-
-  /* x == modulus, y == value to invert */
-  if ((res = mp_copy (b, &x)) != MP_OKAY) {
-    goto __D;
-  }
-  if ((res = mp_copy (a, &y)) != MP_OKAY) {
-    goto __D;
-  }
-
-  /* we need |y| */
-  if ((res = mp_abs (&y, &y)) != MP_OKAY) {
-    goto __D;
-  }
-
-  /* 2. [modified] if x,y are both even then return an error! 
-   * 
-   * That is if gcd(x,y) = 2 * k then obviously there is no inverse.
-   */
-  if (mp_iseven (&x) == 1 && mp_iseven (&y) == 1) {
-    res = MP_VAL;
-    goto __D;
-  }
-
-  /* 3. u=x, v=y, A=1, B=0, C=0,D=1 */
-  if ((res = mp_copy (&x, &u)) != MP_OKAY) {
-    goto __D;
-  }
-  if ((res = mp_copy (&y, &v)) != MP_OKAY) {
-    goto __D;
-  }
-  mp_set (&D, 1);
-
-top:
-  /* 4.  while u is even do */
-  while (mp_iseven (&u) == 1) {
-    /* 4.1 u = u/2 */
-    if ((res = mp_div_2 (&u, &u)) != MP_OKAY) {
-      goto __D;
-    }
-    /* 4.2 if A or B is odd then */
-    if (mp_iseven (&B) == 0) {
-      if ((res = mp_sub (&B, &x, &B)) != MP_OKAY) {
-	goto __D;
-      }
-    }
-    /* B = B/2 */
-    if ((res = mp_div_2 (&B, &B)) != MP_OKAY) {
-      goto __D;
-    }
-  }
-
-  /* 5.  while v is even do */
-  while (mp_iseven (&v) == 1) {
-    /* 5.1 v = v/2 */
-    if ((res = mp_div_2 (&v, &v)) != MP_OKAY) {
-      goto __D;
-    }
-    /* 5.2 if C,D are even then */
-    if (mp_iseven (&D) == 0) {
-      /* D = (D-x)/2 */
-      if ((res = mp_sub (&D, &x, &D)) != MP_OKAY) {
-	goto __D;
-      }
-    }
-    /* D = D/2 */
-    if ((res = mp_div_2 (&D, &D)) != MP_OKAY) {
-      goto __D;
-    }
-  }
-
-  /* 6.  if u >= v then */
-  if (mp_cmp (&u, &v) != MP_LT) {
-    /* u = u - v, B = B - D */
-    if ((res = mp_sub (&u, &v, &u)) != MP_OKAY) {
-      goto __D;
-    }
-
-    if ((res = mp_sub (&B, &D, &B)) != MP_OKAY) {
-      goto __D;
-    }
-  } else {
-    /* v - v - u, D = D - B */
-    if ((res = mp_sub (&v, &u, &v)) != MP_OKAY) {
-      goto __D;
-    }
-
-    if ((res = mp_sub (&D, &B, &D)) != MP_OKAY) {
-      goto __D;
-    }
-  }
-
-  /* if not zero goto step 4 */
-  if (mp_iszero (&u) == 0) {
-    goto top;
-  }
-
-  /* now a = C, b = D, gcd == g*v */
-
-  /* if v != 1 then there is no inverse */
-  if (mp_cmp_d (&v, 1) != MP_EQ) {
-    res = MP_VAL;
-    goto __D;
-  }
-
-  /* b is now the inverse */
-  neg = a->sign;
-  while (D.sign == MP_NEG) {
-    if ((res = mp_add (&D, b, &D)) != MP_OKAY) {
-      goto __D;
-    }
-  }
-  mp_exch (&D, c);
-  c->sign = neg;
-  res = MP_OKAY;
-
-__D:mp_clear (&D);
-__B:mp_clear (&B);
-__V:mp_clear (&v);
-__U:mp_clear (&u);
-__Y:mp_clear (&y);
-__X:mp_clear (&x);
-__ERR:
-  return res;
-}
-
-/* End: bn_fast_mp_invmod.c */
-
-/* Start: bn_fast_mp_montgomery_reduce.c */
-/* LibTomMath, multiple-precision integer library -- Tom St Denis
- *
- * LibTomMath is library that provides for multiple-precision
- * integer arithmetic as well as number theoretic functionality.
- *
- * The library is designed directly after the MPI library by
- * Michael Fromberger but has been written from scratch with
- * additional optimizations in place.
- *
- * The library is free for all purposes without any express
- * guarantee it works.
- *
- * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
- */
-#include <tommath.h>
-
-/* computes xR^-1 == x (mod N) via Montgomery Reduction 
- * 
- * This is an optimized implementation of mp_montgomery_reduce 
- * which uses the comba method to quickly calculate the columns of the
- * reduction.  
- *
- * Based on Algorithm 14.32 on pp.601 of HAC.
-*/
-int
-fast_mp_montgomery_reduce (mp_int * a, mp_int * m, mp_digit mp)
-{
-  int     ix, res, olduse;
-  mp_word W[512];
-
-  /* get old used count */
-  olduse = a->used;
-
-  /* grow a as required */
-  if (a->alloc < m->used + 1) {
-    if ((res = mp_grow (a, m->used + 1)) != MP_OKAY) {
-      return res;
-    }
-  }
-
-  {
-    register mp_word *_W;
-    register mp_digit *tmpa;
-
-    _W = W;
-    tmpa = a->dp;
-
-    /* copy the digits of a into W[0..a->used-1] */
-    for (ix = 0; ix < a->used; ix++) {
-      *_W++ = *tmpa++;
-    }
-
-    /* zero the high words of W[a->used..m->used*2] */
-    for (; ix < m->used * 2 + 1; ix++) {
-      *_W++ = 0;
-    }
-  }
-
-  for (ix = 0; ix < m->used; ix++) {
-    /* ui = ai * m' mod b
-     *
-     * We avoid a double precision multiplication (which isn't required)
-     * by casting the value down to a mp_digit.  Note this requires that W[ix-1] have
-     * the carry cleared (see after the inner loop)
-     */
-    register mp_digit ui;
-    ui = (((mp_digit) (W[ix] & MP_MASK)) * mp) & MP_MASK;
-
-    /* a = a + ui * m * b^i
-     *
-     * This is computed in place and on the fly.  The multiplication
-     * by b^i is handled by offseting which columns the results
-     * are added to.
-     *
-     * Note the comba method normally doesn't handle carries in the inner loop
-     * In this case we fix the carry from the previous column since the Montgomery
-     * reduction requires digits of the result (so far) [see above] to work.  This is
-     * handled by fixing up one carry after the inner loop.  The carry fixups are done
-     * in order so after these loops the first m->used words of W[] have the carries
-     * fixed
-     */
-    {
-      register int iy;
-      register mp_digit *tmpx;
-      register mp_word *_W;
-
-      /* alias for the digits of the modulus */
-      tmpx = m->dp;
-
-      /* Alias for the columns set by an offset of ix */
-      _W = W + ix;
-
-      /* inner loop */
-      for (iy = 0; iy < m->used; iy++) {
-	*_W++ += ((mp_word) ui) * ((mp_word) * tmpx++);
-      }
-    }
-
-    /* now fix carry for next digit, W[ix+1] */
-    W[ix + 1] += W[ix] >> ((mp_word) DIGIT_BIT);
-  }
-
-
-  {
-    register mp_digit *tmpa;
-    register mp_word *_W, *_W1;
-
-    /* nox fix rest of carries */
-    _W1 = W + ix;
-    _W = W + ++ix;
-
-    for (; ix <= m->used * 2 + 1; ix++) {
-      *_W++ += *_W1++ >> ((mp_word) DIGIT_BIT);
-    }
-
-    /* copy out, A = A/b^n
-     *
-     * The result is A/b^n but instead of converting from an array of mp_word
-     * to mp_digit than calling mp_rshd we just copy them in the right
-     * order
-     */
-    tmpa = a->dp;
-    _W = W + m->used;
-
-    for (ix = 0; ix < m->used + 1; ix++) {
-      *tmpa++ = *_W++ & ((mp_word) MP_MASK);
-    }
-
-    /* zero oldused digits, if the input a was larger than
-     * m->used+1 we'll have to clear the digits */
-    for (; ix < olduse; ix++) {
-      *tmpa++ = 0;
-    }
-  }
-
-  /* set the max used and clamp */
-  a->used = m->used + 1;
-  mp_clamp (a);
-
-  /* if A >= m then A = A - m */
-  if (mp_cmp_mag (a, m) != MP_LT) {
-    return s_mp_sub (a, m, a);
-  }
-  return MP_OKAY;
-}
-
-/* End: bn_fast_mp_montgomery_reduce.c */
-
-/* Start: bn_fast_s_mp_mul_digs.c */
-/* LibTomMath, multiple-precision integer library -- Tom St Denis
- *
- * LibTomMath is library that provides for multiple-precision
- * integer arithmetic as well as number theoretic functionality.
- *
- * The library is designed directly after the MPI library by
- * Michael Fromberger but has been written from scratch with
- * additional optimizations in place.
- *
- * The library is free for all purposes without any express
- * guarantee it works.
- *
- * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
- */
-#include <tommath.h>
-
-/* Fast (comba) multiplier
- *
- * This is the fast column-array [comba] multiplier.  It is designed to compute
- * the columns of the product first then handle the carries afterwards.  This
- * has the effect of making the nested loops that compute the columns very
- * simple and schedulable on super-scalar processors.
- *
- * This has been modified to produce a variable number of digits of output so
- * if say only a half-product is required you don't have to compute the upper half
- * (a feature required for fast Barrett reduction).
- *
- * Based on Algorithm 14.12 on pp.595 of HAC.
- *
- */
-int
-fast_s_mp_mul_digs (mp_int * a, mp_int * b, mp_int * c, int digs)
-{
-  int     olduse, res, pa, ix;
-  mp_word W[512];
-
-  /* grow the destination as required */
-  if (c->alloc < digs) {
-    if ((res = mp_grow (c, digs)) != MP_OKAY) {
-      return res;
-    }
-  }
-
-  /* clear temp buf (the columns) */
-  memset (W, 0, sizeof (mp_word) * digs);
-
-  /* calculate the columns */
-  pa = a->used;
-  for (ix = 0; ix < pa; ix++) {
-
-    /* this multiplier has been modified to allow you to control how many digits 
-     * of output are produced.  So at most we want to make upto "digs" digits
-     * of output.
-     *
-     * this adds products to distinct columns (at ix+iy) of W
-     * note that each step through the loop is not dependent on
-     * the previous which means the compiler can easily unroll
-     * the loop without scheduling problems
-     */
-    {
-      register mp_digit tmpx, *tmpy;
-      register mp_word *_W;
-      register int iy, pb;
-
-      /* alias for the the word on the left e.g. A[ix] * A[iy] */
-      tmpx = a->dp[ix];
-
-      /* alias for the right side */
-      tmpy = b->dp;
-
-      /* alias for the columns, each step through the loop adds a new
-         term to each column
-       */
-      _W = W + ix;
-
-      /* the number of digits is limited by their placement.  E.g. 
-         we avoid multiplying digits that will end up above the # of
-         digits of precision requested
-       */
-      pb = MIN (b->used, digs - ix);
-
-      for (iy = 0; iy < pb; iy++) {
-	*_W++ += ((mp_word) tmpx) * ((mp_word) * tmpy++);
-      }
-    }
-
-  }
-
-  /* setup dest */
-  olduse = c->used;
-  c->used = digs;
-
-  {
-    register mp_digit *tmpc;
-
-    /* At this point W[] contains the sums of each column.  To get the
-     * correct result we must take the extra bits from each column and
-     * carry them down
-     *
-     * Note that while this adds extra code to the multiplier it saves time
-     * since the carry propagation is removed from the above nested loop.
-     * This has the effect of reducing the work from N*(N+N*c)==N^2 + c*N^2 to
-     * N^2 + N*c where c is the cost of the shifting.  On very small numbers
-     * this is slower but on most cryptographic size numbers it is faster.
-     */
-    tmpc = c->dp;
-    for (ix = 1; ix < digs; ix++) {
-      W[ix] += (W[ix - 1] >> ((mp_word) DIGIT_BIT));
-      *tmpc++ = (mp_digit) (W[ix - 1] & ((mp_word) MP_MASK));
-    }
-    *tmpc++ = (mp_digit) (W[digs - 1] & ((mp_word) MP_MASK));
-
-    /* clear unused */
-    for (; ix < olduse; ix++) {
-      *tmpc++ = 0;
-    }
-  }
-
-  mp_clamp (c);
-  return MP_OKAY;
-}
-
-/* End: bn_fast_s_mp_mul_digs.c */
-
-/* Start: bn_fast_s_mp_mul_high_digs.c */
-/* LibTomMath, multiple-precision integer library -- Tom St Denis
- *
- * LibTomMath is library that provides for multiple-precision
- * integer arithmetic as well as number theoretic functionality.
- *
- * The library is designed directly after the MPI library by
- * Michael Fromberger but has been written from scratch with
- * additional optimizations in place.
- *
- * The library is free for all purposes without any express
- * guarantee it works.
- *
- * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
- */
-#include <tommath.h>
-
-/* this is a modified version of fast_s_mp_mul_digs that only produces
- * output digits *above* digs.  See the comments for fast_s_mp_mul_digs
- * to see how it works.
- *
- * This is used in the Barrett reduction since for one of the multiplications
- * only the higher digits were needed.  This essentially halves the work.
- *
- * Based on Algorithm 14.12 on pp.595 of HAC.
- */
-int
-fast_s_mp_mul_high_digs (mp_int * a, mp_int * b, mp_int * c, int digs)
-{
-  int     oldused, newused, res, pa, pb, ix;
-  mp_word W[512];
-
-  /* calculate size of product and allocate more space if required */
-  newused = a->used + b->used + 1;
-  if (c->alloc < newused) {
-    if ((res = mp_grow (c, newused)) != MP_OKAY) {
-      return res;
-    }
-  }
-
-  /* like the other comba method we compute the columns first */
-  pa = a->used;
-  pb = b->used;
-  memset (W + digs, 0, (pa + pb + 1 - digs) * sizeof (mp_word));
-  for (ix = 0; ix < pa; ix++) {
-    {
-      register mp_digit tmpx, *tmpy;
-      register int iy;
-      register mp_word *_W;
-
-      /* work todo, that is we only calculate digits that are at "digs" or above  */
-      iy = digs - ix;
-
-      /* copy of word on the left of A[ix] * B[iy] */
-      tmpx = a->dp[ix];
-
-      /* alias for right side */
-      tmpy = b->dp + iy;
-
-      /* alias for the columns of output.  Offset to be equal to or above the 
-       * smallest digit place requested 
-       */
-      _W = &(W[digs]);
-
-      /* compute column products for digits above the minimum */
-      for (; iy < pb; iy++) {
-	*_W++ += ((mp_word) tmpx) * ((mp_word) * tmpy++);
-      }
-    }
-  }
-
-  /* setup dest */
-  oldused = c->used;
-  c->used = newused;
-
-  /* now convert the array W downto what we need */
-  for (ix = digs + 1; ix < newused; ix++) {
-    W[ix] += (W[ix - 1] >> ((mp_word) DIGIT_BIT));
-    c->dp[ix - 1] = (mp_digit) (W[ix - 1] & ((mp_word) MP_MASK));
-  }
-  c->dp[(pa + pb + 1) - 1] = (mp_digit) (W[(pa + pb + 1) - 1] & ((mp_word) MP_MASK));
-
-  for (; ix < oldused; ix++) {
-    c->dp[ix] = 0;
-  }
-  mp_clamp (c);
-  return MP_OKAY;
-}
-
-/* End: bn_fast_s_mp_mul_high_digs.c */
-
-/* Start: bn_fast_s_mp_sqr.c */
-/* LibTomMath, multiple-precision integer library -- Tom St Denis
- *
- * LibTomMath is library that provides for multiple-precision
- * integer arithmetic as well as number theoretic functionality.
- *
- * The library is designed directly after the MPI library by
- * Michael Fromberger but has been written from scratch with
- * additional optimizations in place.
- *
- * The library is free for all purposes without any express
- * guarantee it works.
- *
- * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
- */
-#include <tommath.h>
-
-/* fast squaring
- *
- * This is the comba method where the columns of the product are computed first
- * then the carries are computed.  This has the effect of making a very simple
- * inner loop that is executed the most
- *
- * W2 represents the outer products and W the inner.  
- *
- * A further optimizations is made because the inner products are of the form
- * "A * B * 2".  The *2 part does not need to be computed until the end which is
- * good because 64-bit shifts are slow!
- *
- * Based on Algorithm 14.16 on pp.597 of HAC.
- *
- */
-int
-fast_s_mp_sqr (mp_int * a, mp_int * b)
-{
-  int     olduse, newused, res, ix, pa;
-  mp_word W2[512], W[512];
-
-  /* calculate size of product and allocate as required */
-  pa = a->used;
-  newused = pa + pa + 1;
-  if (b->alloc < newused) {
-    if ((res = mp_grow (b, newused)) != MP_OKAY) {
-      return res;
-    }
-  }
-
-  /* zero temp buffer (columns) 
-   * Note that there are two buffers.  Since squaring requires
-   * a outter and inner product and the inner product requires 
-   * computing a product and doubling it (a relatively expensive
-   * op to perform n^2 times if you don't have to) the inner and
-   * outer products are computed in different buffers.  This way
-   * the inner product can be doubled using n doublings instead of
-   * n^2
-   */
-  memset (W, 0, newused * sizeof (mp_word));
-  memset (W2, 0, newused * sizeof (mp_word));
-
-/* note optimization
- * values in W2 are only written in even locations which means
- * we can collapse the array to 256 words [and fixup the memset above]
- * provided we also fix up the summations below.  Ideally
- * the fixup loop should be unrolled twice to handle the even/odd 
- * cases, and then a final step to handle odd cases [e.g. newused == odd]
- *
- * This will not only save ~8*256 = 2KB of stack but lower the number of
- * operations required to finally fix up the columns
- */
-
-  /* This computes the inner product.  To simplify the inner N^2 loop
-   * the multiplication by two is done afterwards in the N loop.
-   */
-  for (ix = 0; ix < pa; ix++) {
-    /* compute the outer product 
-     *
-     * Note that every outer product is computed 
-     * for a particular column only once which means that 
-     * there is no need todo a double precision addition
-     */
-    W2[ix + ix] = ((mp_word) a->dp[ix]) * ((mp_word) a->dp[ix]);
-
-    {
-      register mp_digit tmpx, *tmpy;
-      register mp_word *_W;
-      register int iy;
-
-      /* copy of left side */
-      tmpx = a->dp[ix];
-
-      /* alias for right side */
-      tmpy = a->dp + (ix + 1);
-
-      /* the column to store the result in */
-      _W = W + (ix + ix + 1);
-
-      /* inner products */
-      for (iy = ix + 1; iy < pa; iy++) {
-	*_W++ += ((mp_word) tmpx) * ((mp_word) * tmpy++);
-      }
-    }
-  }
-
-  /* setup dest */
-  olduse = b->used;
-  b->used = newused;
-
-  /* double first value, since the inner products are half of what they should be */
-  W[0] += W[0] + W2[0];
-
-  /* now compute digits */
-  {
-    register mp_digit *tmpb;
-
-    tmpb = b->dp;
-
-    for (ix = 1; ix < newused; ix++) {
-      /* double/add next digit */
-      W[ix] += W[ix] + W2[ix];
-
-      W[ix] = W[ix] + (W[ix - 1] >> ((mp_word) DIGIT_BIT));
-      *tmpb++ = (mp_digit) (W[ix - 1] & ((mp_word) MP_MASK));
-    }
-    *tmpb++ = (mp_digit) (W[(newused) - 1] & ((mp_word) MP_MASK));
-
-    /* clear high */
-    for (; ix < olduse; ix++) {
-      *tmpb++ = 0;
-    }
-  }
-
-  mp_clamp (b);
-  return MP_OKAY;
-}
-
-/* End: bn_fast_s_mp_sqr.c */
-
-/* Start: bn_mp_2expt.c */
-/* LibTomMath, multiple-precision integer library -- Tom St Denis
- *
- * LibTomMath is library that provides for multiple-precision
- * integer arithmetic as well as number theoretic functionality.
- *
- * The library is designed directly after the MPI library by
- * Michael Fromberger but has been written from scratch with
- * additional optimizations in place.
- *
- * The library is free for all purposes without any express
- * guarantee it works.
- *
- * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
- */
-#include <tommath.h>
-
-/* computes a = 2^b 
- *
- * Simple algorithm which zeroes the int, grows it then just sets one bit
- * as required.
- */
-int
-mp_2expt (mp_int * a, int b)
-{
-  int     res;
-
-  mp_zero (a);
-  if ((res = mp_grow (a, b / DIGIT_BIT + 1)) != MP_OKAY) {
-    return res;
-  }
-  a->used = b / DIGIT_BIT + 1;
-  a->dp[b / DIGIT_BIT] = 1 << (b % DIGIT_BIT);
-
-  return MP_OKAY;
-}
-
-/* End: bn_mp_2expt.c */
-
-/* Start: bn_mp_abs.c */
-/* LibTomMath, multiple-precision integer library -- Tom St Denis
- *
- * LibTomMath is library that provides for multiple-precision
- * integer arithmetic as well as number theoretic functionality.
- *
- * The library is designed directly after the MPI library by
- * Michael Fromberger but has been written from scratch with
- * additional optimizations in place.
- *
- * The library is free for all purposes without any express
- * guarantee it works.
- *
- * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
- */
-#include <tommath.h>
-
-/* b = |a| 
- *
- * Simple function copies the input and fixes the sign to positive
- */
-int
-mp_abs (mp_int * a, mp_int * b)
-{
-  int     res;
-  if ((res = mp_copy (a, b)) != MP_OKAY) {
-    return res;
-  }
-  b->sign = MP_ZPOS;
-  return MP_OKAY;
-}
-
-/* End: bn_mp_abs.c */
-
-/* Start: bn_mp_add.c */
-/* LibTomMath, multiple-precision integer library -- Tom St Denis
- *
- * LibTomMath is library that provides for multiple-precision
- * integer arithmetic as well as number theoretic functionality.
- *
- * The library is designed directly after the MPI library by
- * Michael Fromberger but has been written from scratch with
- * additional optimizations in place.
- *
- * The library is free for all purposes without any express
- * guarantee it works.
- *
- * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
- */
-#include <tommath.h>
-
-/* high level addition (handles signs) */
-int
-mp_add (mp_int * a, mp_int * b, mp_int * c)
-{
-  int     sa, sb, res;
-
-  /* get sign of both inputs */
-  sa = a->sign;
-  sb = b->sign;
-
-  /* handle four cases */
-  if (sa == MP_ZPOS && sb == MP_ZPOS) {
-    /* both positive */
-    res = s_mp_add (a, b, c);
-    c->sign = MP_ZPOS;
-  } else if (sa == MP_ZPOS && sb == MP_NEG) {
-    /* a + -b == a - b, but if b>a then we do it as -(b-a) */
-    if (mp_cmp_mag (a, b) == MP_LT) {
-      res = s_mp_sub (b, a, c);
-      c->sign = MP_NEG;
-    } else {
-      res = s_mp_sub (a, b, c);
-      c->sign = MP_ZPOS;
-    }
-  } else if (sa == MP_NEG && sb == MP_ZPOS) {
-    /* -a + b == b - a, but if a>b then we do it as -(a-b) */
-    if (mp_cmp_mag (a, b) == MP_GT) {
-      res = s_mp_sub (a, b, c);
-      c->sign = MP_NEG;
-    } else {
-      res = s_mp_sub (b, a, c);
-      c->sign = MP_ZPOS;
-    }
-  } else {
-    /* -a + -b == -(a + b) */
-    res = s_mp_add (a, b, c);
-    c->sign = MP_NEG;
-  }
-  return res;
-}
-
-/* End: bn_mp_add.c */
-
-/* Start: bn_mp_addmod.c */
-/* LibTomMath, multiple-precision integer library -- Tom St Denis
- *
- * LibTomMath is library that provides for multiple-precision
- * integer arithmetic as well as number theoretic functionality.
- *
- * The library is designed directly after the MPI library by
- * Michael Fromberger but has been written from scratch with
- * additional optimizations in place.
- *
- * The library is free for all purposes without any express
- * guarantee it works.
- *
- * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
- */
-#include <tommath.h>
-
-/* d = a + b (mod c) */
-int
-mp_addmod (mp_int * a, mp_int * b, mp_int * c, mp_int * d)
-{
-  int     res;
-  mp_int  t;
-
-  if ((res = mp_init (&t)) != MP_OKAY) {
-    return res;
-  }
-
-  if ((res = mp_add (a, b, &t)) != MP_OKAY) {
-    mp_clear (&t);
-    return res;
-  }
-  res = mp_mod (&t, c, d);
-  mp_clear (&t);
-  return res;
-}
-
-/* End: bn_mp_addmod.c */
-
-/* Start: bn_mp_add_d.c */
-/* LibTomMath, multiple-precision integer library -- Tom St Denis
- *
- * LibTomMath is library that provides for multiple-precision
- * integer arithmetic as well as number theoretic functionality.
- *
- * The library is designed directly after the MPI library by
- * Michael Fromberger but has been written from scratch with
- * additional optimizations in place.
- *
- * The library is free for all purposes without any express
- * guarantee it works.
- *
- * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
- */
-#include <tommath.h>
-
-/* single digit addition */
-int
-mp_add_d (mp_int * a, mp_digit b, mp_int * c)
-{
-  mp_int  t;
-  int     res;
-
-  if ((res = mp_init_size(&t, 1)) != MP_OKAY) {
-    return res;
-  }
-  mp_set (&t, b);
-  res = mp_add (a, &t, c);
-
-  mp_clear (&t);
-  return res;
-}
-
-/* End: bn_mp_add_d.c */
-
-/* Start: bn_mp_and.c */
-/* LibTomMath, multiple-precision integer library -- Tom St Denis
- *
- * LibTomMath is library that provides for multiple-precision
- * integer arithmetic as well as number theoretic functionality.
- *
- * The library is designed directly after the MPI library by
- * Michael Fromberger but has been written from scratch with
- * additional optimizations in place.
- *
- * The library is free for all purposes without any express
- * guarantee it works.
- *
- * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
- */
-#include <tommath.h>
-
-/* AND two ints together */
-int
-mp_and (mp_int * a, mp_int * b, mp_int * c)
-{
-  int     res, ix, px;
-  mp_int  t, *x;
-
-  if (a->used > b->used) {
-    if ((res = mp_init_copy (&t, a)) != MP_OKAY) {
-      return res;
-    }
-    px = b->used;
-    x = b;
-  } else {
-    if ((res = mp_init_copy (&t, b)) != MP_OKAY) {
-      return res;
-    }
-    px = a->used;
-    x = a;
-  }
-
-  for (ix = 0; ix < px; ix++) {
-    t.dp[ix] &= x->dp[ix];
-  }
-
-  /* zero digits above the last from the smallest mp_int */
-  for (; ix < t.used; ix++) {
-    t.dp[ix] = 0;
-  }
-
-  mp_clamp (&t);
-  mp_exch (c, &t);
-  mp_clear (&t);
-  return MP_OKAY;
-}
-
-/* End: bn_mp_and.c */
-
-/* Start: bn_mp_clamp.c */
-/* LibTomMath, multiple-precision integer library -- Tom St Denis
- *
- * LibTomMath is library that provides for multiple-precision
- * integer arithmetic as well as number theoretic functionality.
- *
- * The library is designed directly after the MPI library by
- * Michael Fromberger but has been written from scratch with
- * additional optimizations in place.
- *
- * The library is free for all purposes without any express
- * guarantee it works.
- *
- * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
- */
-#include <tommath.h>
-
-/* trim unused digits 
- *
- * This is used to ensure that leading zero digits are
- * trimed and the leading "used" digit will be non-zero
- * Typically very fast.  Also fixes the sign if there
- * are no more leading digits
- */
-void
-mp_clamp (mp_int * a)
-{
-  while (a->used > 0 && a->dp[a->used - 1] == 0) {
-    --(a->used);
-  }
-  if (a->used == 0) {
-    a->sign = MP_ZPOS;
-  }
-}
-
-/* End: bn_mp_clamp.c */
-
-/* Start: bn_mp_clear.c */
-/* LibTomMath, multiple-precision integer library -- Tom St Denis
- *
- * LibTomMath is library that provides for multiple-precision
- * integer arithmetic as well as number theoretic functionality.
- *
- * The library is designed directly after the MPI library by
- * Michael Fromberger but has been written from scratch with 
- * additional optimizations in place.
- *
- * The library is free for all purposes without any express
- * guarantee it works.
- *
- * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
- */
-#include <tommath.h>
-
-/* clear one (frees)  */
-void
-mp_clear (mp_int * a)
-{
-  if (a->dp != NULL) {
-
-    /* first zero the digits */
-    memset (a->dp, 0, sizeof (mp_digit) * a->used);
-
-    /* free ram */
-    free (a->dp);
-
-    /* reset members to make debugging easier */
-    a->dp = NULL;
-    a->alloc = a->used = 0;
-  }
-}
-
-/* End: bn_mp_clear.c */
-
-/* Start: bn_mp_cmp.c */
-/* LibTomMath, multiple-precision integer library -- Tom St Denis
- *
- * LibTomMath is library that provides for multiple-precision
- * integer arithmetic as well as number theoretic functionality.
- *
- * The library is designed directly after the MPI library by
- * Michael Fromberger but has been written from scratch with
- * additional optimizations in place.
- *
- * The library is free for all purposes without any express
- * guarantee it works.
- *
- * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
- */
-#include <tommath.h>
-
-/* compare two ints (signed)*/
-int
-mp_cmp (mp_int * a, mp_int * b)
-{
-  /* compare based on sign */
-  if (a->sign == MP_NEG && b->sign == MP_ZPOS) {
-    return MP_LT;
-  } else if (a->sign == MP_ZPOS && b->sign == MP_NEG) {
-    return MP_GT;
-  }
-  return mp_cmp_mag (a, b);
-}
-
-/* End: bn_mp_cmp.c */
-
-/* Start: bn_mp_cmp_d.c */
-/* LibTomMath, multiple-precision integer library -- Tom St Denis
- *
- * LibTomMath is library that provides for multiple-precision
- * integer arithmetic as well as number theoretic functionality.
- *
- * The library is designed directly after the MPI library by
- * Michael Fromberger but has been written from scratch with
- * additional optimizations in place.
- *
- * The library is free for all purposes without any express
- * guarantee it works.
- *
- * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
- */
-#include <tommath.h>
-
-/* compare a digit */
-int
-mp_cmp_d (mp_int * a, mp_digit b)
-{
-
-  if (a->sign == MP_NEG) {
-    return MP_LT;
-  }
-
-  if (a->used > 1) {
-    return MP_GT;
-  }
-
-  if (a->dp[0] > b) {
-    return MP_GT;
-  } else if (a->dp[0] < b) {
-    return MP_LT;
-  } else {
-    return MP_EQ;
-  }
-}
-
-/* End: bn_mp_cmp_d.c */
-
-/* Start: bn_mp_cmp_mag.c */
-/* LibTomMath, multiple-precision integer library -- Tom St Denis
- *
- * LibTomMath is library that provides for multiple-precision
- * integer arithmetic as well as number theoretic functionality.
- *
- * The library is designed directly after the MPI library by
- * Michael Fromberger but has been written from scratch with
- * additional optimizations in place.
- *
- * The library is free for all purposes without any express
- * guarantee it works.
- *
- * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
- */
-#include <tommath.h>
-
-/* compare maginitude of two ints (unsigned) */
-int
-mp_cmp_mag (mp_int * a, mp_int * b)
-{
-  int     n;
-
-  /* compare based on # of non-zero digits */
-  if (a->used > b->used) {
-    return MP_GT;
-  } else if (a->used < b->used) {
-    return MP_LT;
-  }
-
-  /* compare based on digits  */
-  for (n = a->used - 1; n >= 0; n--) {
-    if (a->dp[n] > b->dp[n]) {
-      return MP_GT;
-    } else if (a->dp[n] < b->dp[n]) {
-      return MP_LT;
-    }
-  }
-  return MP_EQ;
-}
-
-/* End: bn_mp_cmp_mag.c */
-
-/* Start: bn_mp_copy.c */
-/* LibTomMath, multiple-precision integer library -- Tom St Denis
- *
- * LibTomMath is library that provides for multiple-precision
- * integer arithmetic as well as number theoretic functionality.
- *
- * The library is designed directly after the MPI library by
- * Michael Fromberger but has been written from scratch with
- * additional optimizations in place.
- *
- * The library is free for all purposes without any express
- * guarantee it works.
- *
- * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
- */
-#include <tommath.h>
-
-/* copy, b = a */
-int
-mp_copy (mp_int * a, mp_int * b)
-{
-  int     res, n;
-
-  /* if dst == src do nothing */
-  if (a == b || a->dp == b->dp) {
-    return MP_OKAY;
-  }
-
-  /* grow dest */
-  if ((res = mp_grow (b, a->used)) != MP_OKAY) {
-    return res;
-  }
-
-  /* zero b and copy the parameters over */
-  b->used = a->used;
-  b->sign = a->sign;
-
-  {
-    register mp_digit *tmpa, *tmpb;
-
-    /* point aliases */
-    tmpa = a->dp;
-    tmpb = b->dp;
-
-    /* copy all the digits */
-    for (n = 0; n < a->used; n++) {
-      *tmpb++ = *tmpa++;
-    }
-
-    /* clear high digits */
-    for (; n < b->alloc; n++) {
-      *tmpb++ = 0;
-    }
-  }
-  return MP_OKAY;
-}
-
-/* End: bn_mp_copy.c */
-
-/* Start: bn_mp_count_bits.c */
-/* LibTomMath, multiple-precision integer library -- Tom St Denis
- *
- * LibTomMath is library that provides for multiple-precision
- * integer arithmetic as well as number theoretic functionality.
- *
- * The library is designed directly after the MPI library by
- * Michael Fromberger but has been written from scratch with
- * additional optimizations in place.
- *
- * The library is free for all purposes without any express
- * guarantee it works.
- *
- * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
- */
-#include <tommath.h>
-
-/* returns the number of bits in an int */
-int
-mp_count_bits (mp_int * a)
-{
-  int     r;
-  mp_digit q;
-
-  if (a->used == 0) {
-    return 0;
-  }
-
-  r = (a->used - 1) * DIGIT_BIT;
-  q = a->dp[a->used - 1];
-  while (q > ((mp_digit) 0)) {
-    ++r;
-    q >>= ((mp_digit) 1);
-  }
-  return r;
-}
-
-/* End: bn_mp_count_bits.c */
-
-/* Start: bn_mp_div.c */
-/* LibTomMath, multiple-precision integer library -- Tom St Denis
- *
- * LibTomMath is library that provides for multiple-precision
- * integer arithmetic as well as number theoretic functionality.
- *
- * The library is designed directly after the MPI library by
- * Michael Fromberger but has been written from scratch with
- * additional optimizations in place.
- *
- * The library is free for all purposes without any express
- * guarantee it works.
- *
- * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
- */
-#include <tommath.h>
-
-/* integer signed division. c*b + d == a [e.g. a/b, c=quotient, d=remainder]
- * HAC pp.598 Algorithm 14.20
- *
- * Note that the description in HAC is horribly incomplete.  For example,
- * it doesn't consider the case where digits are removed from 'x' in the inner
- * loop.  It also doesn't consider the case that y has fewer than three digits, etc..
- *
- * The overall algorithm is as described as 14.20 from HAC but fixed to treat these cases.
-*/
-int
-mp_div (mp_int * a, mp_int * b, mp_int * c, mp_int * d)
-{
-  mp_int  q, x, y, t1, t2;
-  int     res, n, t, i, norm, neg;
-
-
-  /* is divisor zero ? */
-  if (mp_iszero (b) == 1) {
-    return MP_VAL;
-  }
-
-  /* if a < b then q=0, r = a */
-  if (mp_cmp_mag (a, b) == MP_LT) {
-    if (d != NULL) {
-      res = mp_copy (a, d);
-    } else {
-      res = MP_OKAY;
-    }
-    if (c != NULL) {
-      mp_zero (c);
-    }
-    return res;
-  }
-
-  if ((res = mp_init_size (&q, a->used + 2)) != MP_OKAY) {
-    return res;
-  }
-  q.used = a->used + 2;
-
-  if ((res = mp_init (&t1)) != MP_OKAY) {
-    goto __Q;
-  }
-
-  if ((res = mp_init (&t2)) != MP_OKAY) {
-    goto __T1;
-  }
-
-  if ((res = mp_init_copy (&x, a)) != MP_OKAY) {
-    goto __T2;
-  }
-
-  if ((res = mp_init_copy (&y, b)) != MP_OKAY) {
-    goto __X;
-  }
-
-  /* fix the sign */
-  neg = (a->sign == b->sign) ? MP_ZPOS : MP_NEG;
-  x.sign = y.sign = MP_ZPOS;
-
-  /* normalize both x and y, ensure that y >= b/2, [b == 2^DIGIT_BIT] */
-  norm = mp_count_bits(&y) % DIGIT_BIT;
-  if (norm < (DIGIT_BIT-1)) {
-     norm = (DIGIT_BIT-1) - norm;
-     if ((res = mp_mul_2d (&x, norm, &x)) != MP_OKAY) {
-       goto __Y;
-     }
-     if ((res = mp_mul_2d (&y, norm, &y)) != MP_OKAY) {
-       goto __Y;
-     }
-  } else {
-     norm = 0;
-  }
-     
-  /* note hac does 0 based, so if used==5 then its 0,1,2,3,4, e.g. use 4 */
-  n = x.used - 1;
-  t = y.used - 1;
-
-  /* step 2. while (x >= y*b^n-t) do { q[n-t] += 1; x -= y*b^{n-t} } */
-  if ((res = mp_lshd (&y, n - t)) != MP_OKAY) {	/* y = y*b^{n-t} */
-    goto __Y;
-  }
-
-  while (mp_cmp (&x, &y) != MP_LT) {
-    ++(q.dp[n - t]);
-    if ((res = mp_sub (&x, &y, &x)) != MP_OKAY) {
-      goto __Y;
-    }
-  }
-
-  /* reset y by shifting it back down */
-  mp_rshd (&y, n - t);
-
-  /* step 3. for i from n down to (t + 1) */
-  for (i = n; i >= (t + 1); i--) {
-    if (i > x.used)
-      continue;
-
-    /* step 3.1 if xi == yt then set q{i-t-1} to b-1, otherwise set q{i-t-1} to (xi*b + x{i-1})/yt */
-    if (x.dp[i] == y.dp[t]) {
-      q.dp[i - t - 1] = ((1UL << DIGIT_BIT) - 1UL);
-    } else {
-      mp_word tmp;
-      tmp = ((mp_word) x.dp[i]) << ((mp_word) DIGIT_BIT);
-      tmp |= ((mp_word) x.dp[i - 1]);
-      tmp /= ((mp_word) y.dp[t]);
-      if (tmp > (mp_word) MP_MASK)
-	tmp = MP_MASK;
-      q.dp[i - t - 1] = (mp_digit) (tmp & (mp_word) (MP_MASK));
-    }
-
-    /* step 3.2 while (q{i-t-1} * (yt * b + y{t-1})) > xi * b^2 + xi-1 * b + xi-2 do q{i-t-1} -= 1; */
-    q.dp[i - t - 1] = (q.dp[i - t - 1] + 1) & MP_MASK;
-    do {
-      q.dp[i - t - 1] = (q.dp[i - t - 1] - 1) & MP_MASK;
-
-      /* find left hand */
-      mp_zero (&t1);
-      t1.dp[0] = (t - 1 < 0) ? 0 : y.dp[t - 1];
-      t1.dp[1] = y.dp[t];
-      t1.used = 2;
-      if ((res = mp_mul_d (&t1, q.dp[i - t - 1], &t1)) != MP_OKAY) {
-	goto __Y;
-      }
-
-      /* find right hand */
-      t2.dp[0] = (i - 2 < 0) ? 0 : x.dp[i - 2];
-      t2.dp[1] = (i - 1 < 0) ? 0 : x.dp[i - 1];
-      t2.dp[2] = x.dp[i];
-      t2.used = 3;
-    } while (mp_cmp (&t1, &t2) == MP_GT);
-
-    /* step 3.3 x = x - q{i-t-1} * y * b^{i-t-1} */
-    if ((res = mp_mul_d (&y, q.dp[i - t - 1], &t1)) != MP_OKAY) {
-      goto __Y;
-    }
-
-    if ((res = mp_lshd (&t1, i - t - 1)) != MP_OKAY) {
-      goto __Y;
-    }
-
-    if ((res = mp_sub (&x, &t1, &x)) != MP_OKAY) {
-      goto __Y;
-    }
-
-    /* step 3.4 if x < 0 then { x = x + y*b^{i-t-1}; q{i-t-1} -= 1; } */
-    if (x.sign == MP_NEG) {
-      if ((res = mp_copy (&y, &t1)) != MP_OKAY) {
-	goto __Y;
-      }
-      if ((res = mp_lshd (&t1, i - t - 1)) != MP_OKAY) {
-	goto __Y;
-      }
-      if ((res = mp_add (&x, &t1, &x)) != MP_OKAY) {
-	goto __Y;
-      }
-
-      q.dp[i - t - 1] = (q.dp[i - t - 1] - 1UL) & MP_MASK;
-    }
-  }
-  
-  /* now q is the quotient and x is the remainder [which we have to normalize] */
-  /* get sign before writing to c */
-  x.sign = a->sign;
-
-  if (c != NULL) {
-    mp_clamp (&q);
-    mp_exch (&q, c);
-    c->sign = neg;
-  }
-
-  if (d != NULL) {
-    mp_div_2d (&x, norm, &x, NULL);
-    mp_exch (&x, d);
-  }
-
-  res = MP_OKAY;
-
-__Y:mp_clear (&y);
-__X:mp_clear (&x);
-__T2:mp_clear (&t2);
-__T1:mp_clear (&t1);
-__Q:mp_clear (&q);
-  return res;
-}
-
-/* End: bn_mp_div.c */
-
-/* Start: bn_mp_div_2.c */
-/* LibTomMath, multiple-precision integer library -- Tom St Denis
- *
- * LibTomMath is library that provides for multiple-precision
- * integer arithmetic as well as number theoretic functionality.
- *
- * The library is designed directly after the MPI library by
- * Michael Fromberger but has been written from scratch with
- * additional optimizations in place.
- *
- * The library is free for all purposes without any express
- * guarantee it works.
- *
- * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
- */
-#include <tommath.h>
-
-/* b = a/2 */
-int
-mp_div_2 (mp_int * a, mp_int * b)
-{
-  int     x, res, oldused;
-
-  /* copy */
-  if (b->alloc < a->used) {
-    if ((res = mp_grow (b, a->used)) != MP_OKAY) {
-      return res;
-    }
-  }
-
-  oldused = b->used;
-  b->used = a->used;
-  {
-    register mp_digit r, rr, *tmpa, *tmpb;
-
-    /* source alias */
-    tmpa = a->dp + b->used - 1;
-    
-    /* dest alias */
-    tmpb = b->dp + b->used - 1;
-    
-    /* carry */
-    r = 0;
-    for (x = b->used - 1; x >= 0; x--) {
-      /* get the carry for the next iteration */
-      rr = *tmpa & 1;
-      
-      /* shift the current digit, add in carry and store */
-      *tmpb-- = (*tmpa-- >> 1) | (r << (DIGIT_BIT - 1));
-      
-      /* forward carry to next iteration */
-      r = rr;
-    }
-
-    /* zero excess digits */
-    tmpb = b->dp + b->used;
-    for (x = b->used; x < oldused; x++) {
-      *tmpb++ = 0;
-    }
-  }
-  b->sign = a->sign;
-  mp_clamp (b);
-  return MP_OKAY;
-}
-
-/* End: bn_mp_div_2.c */
-
-/* Start: bn_mp_div_2d.c */
-/* LibTomMath, multiple-precision integer library -- Tom St Denis
- *
- * LibTomMath is library that provides for multiple-precision
- * integer arithmetic as well as number theoretic functionality.
- *
- * The library is designed directly after the MPI library by
- * Michael Fromberger but has been written from scratch with
- * additional optimizations in place.
- *
- * The library is free for all purposes without any express
- * guarantee it works.
- *
- * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
- */
-#include <tommath.h>
-
-/* shift right by a certain bit count (store quotient in c, remainder in d) */
-int
-mp_div_2d (mp_int * a, int b, mp_int * c, mp_int * d)
-{
-  mp_digit D, r, rr;
-  int     x, res;
-  mp_int  t;
-
-
-  /* if the shift count is <= 0 then we do no work */
-  if (b <= 0) {
-    res = mp_copy (a, c);
-    if (d != NULL) {
-      mp_zero (d);
-    }
-    return res;
-  }
-
-  if ((res = mp_init (&t)) != MP_OKAY) {
-    return res;
-  }
-
-  /* get the remainder */
-  if (d != NULL) {
-    if ((res = mp_mod_2d (a, b, &t)) != MP_OKAY) {
-      mp_clear (&t);
-      return res;
-    }
-  }
-
-  /* copy */
-  if ((res = mp_copy (a, c)) != MP_OKAY) {
-    mp_clear (&t);
-    return res;
-  }
-
-  /* shift by as many digits in the bit count */
-  if (b >= DIGIT_BIT) {
-    mp_rshd (c, b / DIGIT_BIT);
-  }
-
-  /* shift any bit count < DIGIT_BIT */
-  D = (mp_digit) (b % DIGIT_BIT);
-  if (D != 0) {
-    register mp_digit *tmpc, mask;
-    
-    /* mask */
-    mask = (1U << D) - 1U;
-    
-    /* alias */
-    tmpc = c->dp + (c->used - 1);
-    
-    /* carry */
-    r = 0;
-    for (x = c->used - 1; x >= 0; x--) {
-      /* get the lower  bits of this word in a temp */
-      rr = *tmpc & mask;
-
-      /* shift the current word and mix in the carry bits from the previous word */
-      *tmpc = (*tmpc >> D) | (r << (DIGIT_BIT - D));
-      --tmpc;
-
-      /* set the carry to the carry bits of the current word found above */
-      r = rr;
-    }
-  }
-  mp_clamp (c);
-  res = MP_OKAY;
-  if (d != NULL) {
-    mp_exch (&t, d);
-  }
-  mp_clear (&t);
-  return MP_OKAY;
-}
-
-/* End: bn_mp_div_2d.c */
-
-/* Start: bn_mp_div_d.c */
-/* LibTomMath, multiple-precision integer library -- Tom St Denis
- *
- * LibTomMath is library that provides for multiple-precision
- * integer arithmetic as well as number theoretic functionality.
- *
- * The library is designed directly after the MPI library by
- * Michael Fromberger but has been written from scratch with
- * additional optimizations in place.
- *
- * The library is free for all purposes without any express
- * guarantee it works.
- *
- * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
- */
-#include <tommath.h>
-
-/* single digit division */
-int
-mp_div_d (mp_int * a, mp_digit b, mp_int * c, mp_digit * d)
-{
-  mp_int  t, t2;
-  int     res;
-
-  if ((res = mp_init (&t)) != MP_OKAY) {
-    return res;
-  }
-
-  if ((res = mp_init (&t2)) != MP_OKAY) {
-    mp_clear (&t);
-    return res;
-  }
-
-  mp_set (&t, b);
-  res = mp_div (a, &t, c, &t2);
-
-  /* set remainder if not null */
-  if (d != NULL) {
-    *d = t2.dp[0];
-  }
-
-  mp_clear (&t);
-  mp_clear (&t2);
-  return res;
-}
-
-/* End: bn_mp_div_d.c */
-
-/* Start: bn_mp_dr_reduce.c */
-/* LibTomMath, multiple-precision integer library -- Tom St Denis
- *
- * LibTomMath is library that provides for multiple-precision
- * integer arithmetic as well as number theoretic functionality.
- *
- * The library is designed directly after the MPI library by
- * Michael Fromberger but has been written from scratch with
- * additional optimizations in place.
- *
- * The library is free for all purposes without any express
- * guarantee it works.
- *
- * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
- */
-#include <tommath.h>
-
-/* reduce "a" in place modulo "b" using the Diminished Radix algorithm.
- *
- * Based on algorithm from the paper 
- *
- * "Generating Efficient Primes for Discrete Log Cryptosystems"
- *                 Chae Hoon Lim, Pil Loong Lee,
- *          POSTECH Information Research Laboratories
- *
- * The modulus must be of a special format [see manual]
- */
-int
-mp_dr_reduce (mp_int * a, mp_int * b, mp_digit mp)
-{
-  int     err, i, j, k;
-  mp_word r;
-  mp_digit mu, *tmpj, *tmpi;
-
-  /* k = digits in modulus */
-  k = b->used;
-
-  /* ensure that "a" has at least 2k digits */
-  if (a->alloc < k + k) {
-    if ((err = mp_grow (a, k + k)) != MP_OKAY) {
-      return err;
-    }
-  }
- 
-  /* alias for a->dp[i] */
-  tmpi = a->dp + k + k - 1;
-
-  /* for (i = 2k - 1; i >= k; i = i - 1) 
-   *
-   * This is the main loop of the reduction.  Note that at the end
-   * the words above position k are not zeroed as expected.  The end
-   * result is that the digits from 0 to k-1 are the residue.  So 
-   * we have to clear those afterwards.
-   */
-  for (i = k + k - 1; i >= k; i = i - 1) {
-    /* x[i - 1 : i - k] += x[i]*mp */
-
-    /* x[i] * mp */
-    r = ((mp_word) *tmpi--) * ((mp_word) mp);
-
-    /* now add r to x[i-1:i-k] 
-     *
-     * First add it to the first digit x[i-k] then form the carry
-     * then enter the main loop 
-     */
-    j = i - k;
-
-    /* alias for a->dp[j] */
-    tmpj = a->dp + j;
-
-    /* add digit */
-    *tmpj += (mp_digit)(r & MP_MASK);
-
-    /* this is the carry */
-    mu = (r >> ((mp_word) DIGIT_BIT)) + (*tmpj >> DIGIT_BIT);
-
-    /* clear carry from a->dp[j]  */
-    *tmpj++ &= MP_MASK; 
-
-    /* now add rest of the digits 
-     * 
-     * Note this is basically a simple single digit addition to
-     * a larger multiple digit number.  This is optimized somewhat
-     * because the propagation of carries is not likely to move
-     * more than a few digits. 
-     *
-     */
-    for (++j; mu != 0 && j <= (i - 1); ++j) {
-      *tmpj   += mu;
-      mu       = *tmpj >> DIGIT_BIT;
-      *tmpj++ &= MP_MASK;
-    }
-
-    /* if final carry */
-    if (mu != 0) {
-      /* add mp to this to correct */
-      j = i - k;
-      tmpj = a->dp + j;
-
-      *tmpj += mp;
-      mu = *tmpj >> DIGIT_BIT;
-      *tmpj++ &= MP_MASK;
-      
-      /* now handle carries */
-      for (++j; mu != 0 && j <= (i - 1); j++) {
-	*tmpj   += mu;
-	mu       = *tmpj >> DIGIT_BIT;
-	*tmpj++ &= MP_MASK;
-      }
-    }
-  }
-  
-  /* zero words above k */
-  tmpi = a->dp + k;
-  for (i = k; i < a->used; i++) {
-      *tmpi++ = 0;
-  }
-
-  /* clamp, sub and return */
-  mp_clamp (a);
-  
-  if (mp_cmp_mag (a, b) != MP_LT) {
-    return s_mp_sub (a, b, a);
-  }
-  return MP_OKAY;
-}
-
-/* determines if a number is a valid DR modulus */
-int mp_dr_is_modulus(mp_int *a)
-{
-   int ix;
-   
-   /* must be at least two digits */
-   if (a->used < 2) {
-      return 0;
-   }      
-   
-   for (ix = 1; ix < a->used; ix++) {
-       if (a->dp[ix] != MP_MASK) {
-          return 0;
-       }
-   }
-   return 1;
-}
-
-/* determines the setup value */
-void mp_dr_setup(mp_int *a, mp_digit *d)
-{
-   *d = (1 << DIGIT_BIT) - a->dp[0];
-}
-
-
-/* End: bn_mp_dr_reduce.c */
-
-/* Start: bn_mp_exch.c */
-/* LibTomMath, multiple-precision integer library -- Tom St Denis
- *
- * LibTomMath is library that provides for multiple-precision
- * integer arithmetic as well as number theoretic functionality.
- *
- * The library is designed directly after the MPI library by
- * Michael Fromberger but has been written from scratch with
- * additional optimizations in place.
- *
- * The library is free for all purposes without any express
- * guarantee it works.
- *
- * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
- */
-#include <tommath.h>
-
-/* swap the elements of two integers, for cases where you can't simply swap the 
- * mp_int pointers around 
- */
-void
-mp_exch (mp_int * a, mp_int * b)
-{
-  mp_int  t;
-
-  t = *a;
-  *a = *b;
-  *b = t;
-}
-
-/* End: bn_mp_exch.c */
-
-/* Start: bn_mp_exptmod.c */
-/* LibTomMath, multiple-precision integer library -- Tom St Denis
- *
- * LibTomMath is library that provides for multiple-precision
- * integer arithmetic as well as number theoretic functionality.
- *
- * The library is designed directly after the MPI library by
- * Michael Fromberger but has been written from scratch with
- * additional optimizations in place.
- *
- * The library is free for all purposes without any express
- * guarantee it works.
- *
- * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
- */
-#include <tommath.h>
-
-static int f_mp_exptmod (mp_int * G, mp_int * X, mp_int * P, mp_int * Y);
-
-/* this is a shell function that calls either the normal or Montgomery
- * exptmod functions.  Originally the call to the montgomery code was 
- * embedded in the normal function but that wasted alot of stack space
- * for nothing (since 99% of the time the Montgomery code would be called)
- */
-int
-mp_exptmod (mp_int * G, mp_int * X, mp_int * P, mp_int * Y)
-{
-  int dr;
-  
-  dr = mp_dr_is_modulus(P);
-  /* if the modulus is odd use the fast method */
-  if (((mp_isodd (P) == 1 && P->used < MONTGOMERY_EXPT_CUTOFF) || dr == 1) && P->used > 4) {
-    return mp_exptmod_fast (G, X, P, Y, dr);
-  } else {
-    return f_mp_exptmod (G, X, P, Y);
-  }
-}
-
-static int
-f_mp_exptmod (mp_int * G, mp_int * X, mp_int * P, mp_int * Y)
-{
-  mp_int  M[256], res, mu;
-  mp_digit buf;
-  int     err, bitbuf, bitcpy, bitcnt, mode, digidx, x, y, winsize;
-
-  /* find window size */
-  x = mp_count_bits (X);
-  if (x <= 7) {
-    winsize = 2;
-  } else if (x <= 36) {
-    winsize = 3;
-  } else if (x <= 140) {
-    winsize = 4;
-  } else if (x <= 450) {
-    winsize = 5;
-  } else if (x <= 1303) {
-    winsize = 6;
-  } else if (x <= 3529) {
-    winsize = 7;
-  } else {
-    winsize = 8;
-  }
-
-  /* init G array */
-  for (x = 0; x < (1 << winsize); x++) {
-    if ((err = mp_init_size (&M[x], 1)) != MP_OKAY) {
-      for (y = 0; y < x; y++) {
-	mp_clear (&M[y]);
-      }
-      return err;
-    }
-  }
-
-  /* create mu, used for Barrett reduction */
-  if ((err = mp_init (&mu)) != MP_OKAY) {
-    goto __M;
-  }
-  if ((err = mp_reduce_setup (&mu, P)) != MP_OKAY) {
-    goto __MU;
-  }
-
-  /* create M table 
-   *
-   * The M table contains powers of the input base, e.g. M[x] = G^x mod P
-   *
-   * The first half of the table is not computed though accept for M[0] and M[1]
-   */
-  if ((err = mp_mod (G, P, &M[1])) != MP_OKAY) {
-    goto __MU;
-  }
-
-  /* compute the value at M[1<<(winsize-1)] by squaring M[1] (winsize-1) times */
-  if ((err = mp_copy (&M[1], &M[1 << (winsize - 1)])) != MP_OKAY) {
-    goto __MU;
-  }
-
-  for (x = 0; x < (winsize - 1); x++) {
-    if ((err = mp_sqr (&M[1 << (winsize - 1)], &M[1 << (winsize - 1)])) != MP_OKAY) {
-      goto __MU;
-    }
-    if ((err = mp_reduce (&M[1 << (winsize - 1)], P, &mu)) != MP_OKAY) {
-      goto __MU;
-    }
-  }
-
-  /* create upper table */
-  for (x = (1 << (winsize - 1)) + 1; x < (1 << winsize); x++) {
-    if ((err = mp_mul (&M[x - 1], &M[1], &M[x])) != MP_OKAY) {
-      goto __MU;
-    }
-    if ((err = mp_reduce (&M[x], P, &mu)) != MP_OKAY) {
-      goto __MU;
-    }
-  }
-
-  /* setup result */
-  if ((err = mp_init (&res)) != MP_OKAY) {
-    goto __MU;
-  }
-  mp_set (&res, 1);
-
-  /* set initial mode and bit cnt */
-  mode = 0;
-  bitcnt = 0;
-  buf = 0;
-  digidx = X->used - 1;
-  bitcpy = bitbuf = 0;
-
-  bitcnt = 1;
-  for (;;) {
-    /* grab next digit as required */
-    if (--bitcnt == 0) {
-      if (digidx == -1) {
-	break;
-      }
-      buf = X->dp[digidx--];
-      bitcnt = (int) DIGIT_BIT;
-    }
-
-    /* grab the next msb from the exponent */
-    y = (buf >> (DIGIT_BIT - 1)) & 1;
-    buf <<= 1;
-
-    /* if the bit is zero and mode == 0 then we ignore it 
-     * These represent the leading zero bits before the first 1 bit
-     * in the exponent.  Technically this opt is not required but it 
-     * does lower the # of trivial squaring/reductions used
-     */
-    if (mode == 0 && y == 0)
-      continue;
-
-    /* if the bit is zero and mode == 1 then we square */
-    if (mode == 1 && y == 0) {
-      if ((err = mp_sqr (&res, &res)) != MP_OKAY) {
-	goto __RES;
-      }
-      if ((err = mp_reduce (&res, P, &mu)) != MP_OKAY) {
-	goto __RES;
-      }
-      continue;
-    }
-
-    /* else we add it to the window */
-    bitbuf |= (y << (winsize - ++bitcpy));
-    mode = 2;
-
-    if (bitcpy == winsize) {
-      /* ok window is filled so square as required and multiply  */
-      /* square first */
-      for (x = 0; x < winsize; x++) {
-	if ((err = mp_sqr (&res, &res)) != MP_OKAY) {
-	  goto __RES;
-	}
-	if ((err = mp_reduce (&res, P, &mu)) != MP_OKAY) {
-	  goto __RES;
-	}
-      }
-
-      /* then multiply */
-      if ((err = mp_mul (&res, &M[bitbuf], &res)) != MP_OKAY) {
-	goto __MU;
-      }
-      if ((err = mp_reduce (&res, P, &mu)) != MP_OKAY) {
-	goto __MU;
-      }
-
-      /* empty window and reset */
-      bitcpy = bitbuf = 0;
-      mode = 1;
-    }
-  }
-
-  /* if bits remain then square/multiply */
-  if (mode == 2 && bitcpy > 0) {
-    /* square then multiply if the bit is set */
-    for (x = 0; x < bitcpy; x++) {
-      if ((err = mp_sqr (&res, &res)) != MP_OKAY) {
-	goto __RES;
-      }
-      if ((err = mp_reduce (&res, P, &mu)) != MP_OKAY) {
-	goto __RES;
-      }
-
-      bitbuf <<= 1;
-      if ((bitbuf & (1 << winsize)) != 0) {
-	/* then multiply */
-	if ((err = mp_mul (&res, &M[1], &res)) != MP_OKAY) {
-	  goto __RES;
-	}
-	if ((err = mp_reduce (&res, P, &mu)) != MP_OKAY) {
-	  goto __RES;
-	}
-      }
-    }
-  }
-
-  mp_exch (&res, Y);
-  err = MP_OKAY;
-__RES:mp_clear (&res);
-__MU:mp_clear (&mu);
-__M:
-  for (x = 0; x < (1 << winsize); x++) {
-    mp_clear (&M[x]);
-  }
-  return err;
-}
-
-/* End: bn_mp_exptmod.c */
-
-/* Start: bn_mp_exptmod_fast.c */
-/* LibTomMath, multiple-precision integer library -- Tom St Denis
- *
- * LibTomMath is library that provides for multiple-precision
- * integer arithmetic as well as number theoretic functionality.
- *
- * The library is designed directly after the MPI library by
- * Michael Fromberger but has been written from scratch with
- * additional optimizations in place.
- *
- * The library is free for all purposes without any express
- * guarantee it works.
- *
- * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
- */
-#include <tommath.h>
-
-/* computes Y == G^X mod P, HAC pp.616, Algorithm 14.85
- *
- * Uses a left-to-right k-ary sliding window to compute the modular exponentiation.
- * The value of k changes based on the size of the exponent.
- *
- * Uses Montgomery or Diminished Radix reduction [whichever appropriate] 
- */
-int
-mp_exptmod_fast (mp_int * G, mp_int * X, mp_int * P, mp_int * Y, int redmode)
-{
-  mp_int  M[256], res;
-  mp_digit buf, mp;
-  int     err, bitbuf, bitcpy, bitcnt, mode, digidx, x, y, winsize;
-  int     (*redux)(mp_int*,mp_int*,mp_digit);
-  
-  /* find window size */
-  x = mp_count_bits (X);
-  if (x <= 7) {
-    winsize = 2;
-  } else if (x <= 36) {
-    winsize = 3;
-  } else if (x <= 140) {
-    winsize = 4;
-  } else if (x <= 450) {
-    winsize = 5;
-  } else if (x <= 1303) {
-    winsize = 6;
-  } else if (x <= 3529) {
-    winsize = 7;
-  } else {
-    winsize = 8;
-  }
-
-  /* init G array */
-  for (x = 0; x < (1 << winsize); x++) {
-    if ((err = mp_init (&M[x])) != MP_OKAY) {
-      for (y = 0; y < x; y++) {
-	mp_clear (&M[y]);
-      }
-      return err;
-    }
-  }
-  
-  if (redmode == 0) {
-     /* now setup montgomery  */
-     if ((err = mp_montgomery_setup (P, &mp)) != MP_OKAY) {
-        goto __M;
-     }
-     redux = mp_montgomery_reduce;
-  } else {
-     /* setup DR reduction */
-     mp_dr_setup(P, &mp);
-     redux = mp_dr_reduce;
-  }
-
-  /* setup result */
-  if ((err = mp_init (&res)) != MP_OKAY) {
-    goto __RES;
-  }
-
-  /* create M table
-   *
-   * The M table contains powers of the input base, e.g. M[x] = G^x mod P
-   *
-   * The first half of the table is not computed though accept for M[0] and M[1]
-   */
-
-  if (redmode == 0) {
-     /* now we need R mod m */
-     if ((err = mp_montgomery_calc_normalization (&res, P)) != MP_OKAY) {
-       goto __RES;
-     }
-
-     /* now set M[1] to G * R mod m */
-     if ((err = mp_mulmod (G, &res, P, &M[1])) != MP_OKAY) {
-       goto __RES;
-     }
-  } else {
-     mp_set(&res, 1);
-     if ((err = mp_mod(G, P, &M[1])) != MP_OKAY) {
-        goto __RES;
-     }
-  }
-  
-  /* compute the value at M[1<<(winsize-1)] by squaring M[1] (winsize-1) times */
-  if ((err = mp_copy (&M[1], &M[1 << (winsize - 1)])) != MP_OKAY) {
-    goto __RES;
-  }
-
-  for (x = 0; x < (winsize - 1); x++) {
-    if ((err = mp_sqr (&M[1 << (winsize - 1)], &M[1 << (winsize - 1)])) != MP_OKAY) {
-      goto __RES;
-    }
-    if ((err = redux (&M[1 << (winsize - 1)], P, mp)) != MP_OKAY) {
-      goto __RES;
-    }
-  }
-
-  /* create upper table */
-  for (x = (1 << (winsize - 1)) + 1; x < (1 << winsize); x++) {
-    if ((err = mp_mul (&M[x - 1], &M[1], &M[x])) != MP_OKAY) {
-      goto __RES;
-    }
-    if ((err = redux (&M[x], P, mp)) != MP_OKAY) {
-      goto __RES;
-    }
-  }
-
-  /* set initial mode and bit cnt */
-  mode = 0;
-  bitcnt = 0;
-  buf = 0;
-  digidx = X->used - 1;
-  bitcpy = bitbuf = 0;
-
-  bitcnt = 1;
-  for (;;) {
-    /* grab next digit as required */
-    if (--bitcnt == 0) {
-      if (digidx == -1) {
-	break;
-      }
-      buf = X->dp[digidx--];
-      bitcnt = (int) DIGIT_BIT;
-    }
-
-    /* grab the next msb from the exponent */
-    y = (buf >> (DIGIT_BIT - 1)) & 1;
-    buf <<= 1;
-
-    /* if the bit is zero and mode == 0 then we ignore it
-     * These represent the leading zero bits before the first 1 bit
-     * in the exponent.  Technically this opt is not required but it
-     * does lower the # of trivial squaring/reductions used
-     */
-    if (mode == 0 && y == 0)
-      continue;
-
-    /* if the bit is zero and mode == 1 then we square */
-    if (mode == 1 && y == 0) {
-      if ((err = mp_sqr (&res, &res)) != MP_OKAY) {
-	goto __RES;
-      }
-      if ((err = redux (&res, P, mp)) != MP_OKAY) {
-	goto __RES;
-      }
-      continue;
-    }
-
-    /* else we add it to the window */
-    bitbuf |= (y << (winsize - ++bitcpy));
-    mode = 2;
-
-    if (bitcpy == winsize) {
-      /* ok window is filled so square as required and multiply  */
-      /* square first */
-      for (x = 0; x < winsize; x++) {
-	if ((err = mp_sqr (&res, &res)) != MP_OKAY) {
-	  goto __RES;
-	}
-	if ((err = redux (&res, P, mp)) != MP_OKAY) {
-	  goto __RES;
-	}
-      }
-
-      /* then multiply */
-      if ((err = mp_mul (&res, &M[bitbuf], &res)) != MP_OKAY) {
-	goto __RES;
-      }
-      if ((err = redux (&res, P, mp)) != MP_OKAY) {
-	goto __RES;
-      }
-
-      /* empty window and reset */
-      bitcpy = bitbuf = 0;
-      mode = 1;
-    }
-  }
-
-  /* if bits remain then square/multiply */
-  if (mode == 2 && bitcpy > 0) {
-    /* square then multiply if the bit is set */
-    for (x = 0; x < bitcpy; x++) {
-      if ((err = mp_sqr (&res, &res)) != MP_OKAY) {
-	goto __RES;
-      }
-      if ((err = redux (&res, P, mp)) != MP_OKAY) {
-	goto __RES;
-      }
-
-      bitbuf <<= 1;
-      if ((bitbuf & (1 << winsize)) != 0) {
-	/* then multiply */
-	if ((err = mp_mul (&res, &M[1], &res)) != MP_OKAY) {
-	  goto __RES;
-	}
-	if ((err = redux (&res, P, mp)) != MP_OKAY) {
-	  goto __RES;
-	}
-      }
-    }
-  }
-
-  if (redmode == 0) {
-     /* fixup result */
-     if ((err = mp_montgomery_reduce (&res, P, mp)) != MP_OKAY) {
-       goto __RES;
-     }
-  }     
-
-  mp_exch (&res, Y);
-  err = MP_OKAY;
-__RES:mp_clear (&res);
-__M:
-  for (x = 0; x < (1 << winsize); x++) {
-    mp_clear (&M[x]);
-  }
-  return err;
-}
-
-/* End: bn_mp_exptmod_fast.c */
-
-/* Start: bn_mp_expt_d.c */
-/* LibTomMath, multiple-precision integer library -- Tom St Denis
- *
- * LibTomMath is library that provides for multiple-precision
- * integer arithmetic as well as number theoretic functionality.
- *
- * The library is designed directly after the MPI library by
- * Michael Fromberger but has been written from scratch with
- * additional optimizations in place.
- *
- * The library is free for all purposes without any express
- * guarantee it works.
- *
- * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
- */
-#include <tommath.h>
-
-/* calculate c = a^b  using a square-multiply algorithm */
-int
-mp_expt_d (mp_int * a, mp_digit b, mp_int * c)
-{
-  int     res, x;
-  mp_int  g;
-
-  if ((res = mp_init_copy (&g, a)) != MP_OKAY) {
-    return res;
-  }
-
-  /* set initial result */
-  mp_set (c, 1);
-
-  for (x = 0; x < (int) DIGIT_BIT; x++) {
-    /* square */
-    if ((res = mp_sqr (c, c)) != MP_OKAY) {
-      mp_clear (&g);
-      return res;
-    }
-
-    /* if the bit is set multiply */    
-    if ((b & (mp_digit) (1 << (DIGIT_BIT - 1))) != 0) {
-      if ((res = mp_mul (c, &g, c)) != MP_OKAY) {
-	mp_clear (&g);
-	return res;
-      }
-    }
-
-    /* shift to next bit */
-    b <<= 1;
-  }
-
-  mp_clear (&g);
-  return MP_OKAY;
-}
-
-/* End: bn_mp_expt_d.c */
-
-/* Start: bn_mp_gcd.c */
-/* LibTomMath, multiple-precision integer library -- Tom St Denis
- *
- * LibTomMath is library that provides for multiple-precision
- * integer arithmetic as well as number theoretic functionality.
- *
- * The library is designed directly after the MPI library by
- * Michael Fromberger but has been written from scratch with
- * additional optimizations in place.
- *
- * The library is free for all purposes without any express
- * guarantee it works.
- *
- * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
- */
-#include <tommath.h>
-
-/* Greatest Common Divisor using the binary method [Algorithm B, page 338, vol2 of TAOCP]
- */
-int
-mp_gcd (mp_int * a, mp_int * b, mp_int * c)
-{
-  mp_int  u, v, t;
-  int     k, res, neg;
-
-  /* either zero than gcd is the largest */
-  if (mp_iszero (a) == 1 && mp_iszero (b) == 0) {
-    return mp_copy (b, c);
-  }
-  if (mp_iszero (a) == 0 && mp_iszero (b) == 1) {
-    return mp_copy (a, c);
-  }
-  if (mp_iszero (a) == 1 && mp_iszero (b) == 1) {
-    mp_set (c, 1);
-    return MP_OKAY;
-  }
-
-  /* if both are negative they share (-1) as a common divisor */
-  neg = (a->sign == b->sign) ? a->sign : MP_ZPOS;
-
-  if ((res = mp_init_copy (&u, a)) != MP_OKAY) {
-    return res;
-  }
-
-  if ((res = mp_init_copy (&v, b)) != MP_OKAY) {
-    goto __U;
-  }
-
-  /* must be positive for the remainder of the algorithm */
-  u.sign = v.sign = MP_ZPOS;
-
-  if ((res = mp_init (&t)) != MP_OKAY) {
-    goto __V;
-  }
-
-  /* B1.  Find power of two */
-  k = 0;
-  while (mp_iseven(&u) == 1 && mp_iseven(&v) == 1) {
-    ++k;
-    if ((res = mp_div_2 (&u, &u)) != MP_OKAY) {
-      goto __T;
-    }
-    if ((res = mp_div_2 (&v, &v)) != MP_OKAY) {
-      goto __T;
-    }
-  }
-
-  /* B2.  Initialize */
-  if (mp_isodd(&u) == 1) {
-    /* t = -v */
-    if ((res = mp_copy (&v, &t)) != MP_OKAY) {
-      goto __T;
-    }
-    t.sign = MP_NEG;
-  } else {
-    /* t = u */
-    if ((res = mp_copy (&u, &t)) != MP_OKAY) {
-      goto __T;
-    }
-  }
-
-  do {
-    /* B3 (and B4).  Halve t, if even */
-    while (t.used != 0 && mp_iseven(&t) == 1) {
-      if ((res = mp_div_2 (&t, &t)) != MP_OKAY) {
-	goto __T;
-      }
-    }
-
-    /* B5.  if t>0 then u=t otherwise v=-t */
-    if (t.used != 0 && t.sign != MP_NEG) {
-      if ((res = mp_copy (&t, &u)) != MP_OKAY) {
-	goto __T;
-      }
-    } else {
-      if ((res = mp_copy (&t, &v)) != MP_OKAY) {
-	goto __T;
-      }
-      v.sign = (v.sign == MP_ZPOS) ? MP_NEG : MP_ZPOS;
-    }
-
-    /* B6.  t = u - v, if t != 0 loop otherwise terminate */
-    if ((res = mp_sub (&u, &v, &t)) != MP_OKAY) {
-      goto __T;
-    }
-  }
-  while (t.used != 0);
-
-  if ((res = mp_mul_2d (&u, k, &u)) != MP_OKAY) {
-    goto __T;
-  }
-
-  mp_exch (&u, c);
-  c->sign = neg;
-  res = MP_OKAY;
-__T:mp_clear (&t);
-__V:mp_clear (&u);
-__U:mp_clear (&v);
-  return res;
-}
-
-/* End: bn_mp_gcd.c */
-
-/* Start: bn_mp_grow.c */
-/* LibTomMath, multiple-precision integer library -- Tom St Denis
- *
- * LibTomMath is library that provides for multiple-precision
- * integer arithmetic as well as number theoretic functionality.
- *
- * The library is designed directly after the MPI library by
- * Michael Fromberger but has been written from scratch with
- * additional optimizations in place.
- *
- * The library is free for all purposes without any express
- * guarantee it works.
- *
- * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
- */
-#include <tommath.h>
-
-/* grow as required */
-int
-mp_grow (mp_int * a, int size)
-{
-  int     i, n;
-
-  /* if the alloc size is smaller alloc more ram */
-  if (a->alloc < size) {
-    /* ensure there are always at least MP_PREC digits extra on top */
-    size += (MP_PREC * 2) - (size & (MP_PREC - 1));	
-
-    a->dp = OPT_CAST realloc (a->dp, sizeof (mp_digit) * size);
-    if (a->dp == NULL) {
-      return MP_MEM;
-    }
-
-    /* zero excess digits */
-    n = a->alloc;
-    a->alloc = size;
-    for (i = n; i < a->alloc; i++) {
-      a->dp[i] = 0;
-    }
-  }
-  return MP_OKAY;
-}
-
-/* End: bn_mp_grow.c */
-
-/* Start: bn_mp_init.c */
-/* LibTomMath, multiple-precision integer library -- Tom St Denis
- *
- * LibTomMath is library that provides for multiple-precision
- * integer arithmetic as well as number theoretic functionality.
- *
- * The library is designed directly after the MPI library by
- * Michael Fromberger but has been written from scratch with 
- * additional optimizations in place.
- *
- * The library is free for all purposes without any express
- * guarantee it works.
- *
- * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
- */
-#include <tommath.h>
-
-/* init a new bigint */
-int
-mp_init (mp_int * a)
-{
-
-  /* allocate ram required and clear it */
-  a->dp = OPT_CAST calloc (sizeof (mp_digit), MP_PREC);
-  if (a->dp == NULL) {
-    return MP_MEM;
-  }
-
-  /* set the used to zero, allocated digit to the default precision
-   * and sign to positive */
-  a->used  = 0;
-  a->alloc = MP_PREC;
-  a->sign  = MP_ZPOS;
-
-  return MP_OKAY;
-}
-
-/* End: bn_mp_init.c */
-
-/* Start: bn_mp_init_copy.c */
-/* LibTomMath, multiple-precision integer library -- Tom St Denis
- *
- * LibTomMath is library that provides for multiple-precision
- * integer arithmetic as well as number theoretic functionality.
- *
- * The library is designed directly after the MPI library by
- * Michael Fromberger but has been written from scratch with
- * additional optimizations in place.
- *
- * The library is free for all purposes without any express
- * guarantee it works.
- *
- * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
- */
-#include <tommath.h>
-
-/* creates "a" then copies b into it */
-int
-mp_init_copy (mp_int * a, mp_int * b)
-{
-  int     res;
-
-  if ((res = mp_init (a)) != MP_OKAY) {
-    return res;
-  }
-  return mp_copy (b, a);
-}
-
-/* End: bn_mp_init_copy.c */
-
-/* Start: bn_mp_init_size.c */
-/* LibTomMath, multiple-precision integer library -- Tom St Denis
- *
- * LibTomMath is library that provides for multiple-precision
- * integer arithmetic as well as number theoretic functionality.
- *
- * The library is designed directly after the MPI library by
- * Michael Fromberger but has been written from scratch with
- * additional optimizations in place.
- *
- * The library is free for all purposes without any express
- * guarantee it works.
- *
- * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
- */
-#include <tommath.h>
-
-/* init a mp_init and grow it to a given size */
-int
-mp_init_size (mp_int * a, int size)
-{
-
-  /* pad size so there are always extra digits */
-  size += (MP_PREC * 2) - (size & (MP_PREC - 1));	
-  
-  /* alloc mem */
-  a->dp = OPT_CAST calloc (sizeof (mp_digit), size);
-  if (a->dp == NULL) {
-    return MP_MEM;
-  }
-  a->used = 0;
-  a->alloc = size;
-  a->sign = MP_ZPOS;
-
-  return MP_OKAY;
-}
-
-/* End: bn_mp_init_size.c */
-
-/* Start: bn_mp_invmod.c */
-/* LibTomMath, multiple-precision integer library -- Tom St Denis
- *
- * LibTomMath is library that provides for multiple-precision
- * integer arithmetic as well as number theoretic functionality.
- *
- * The library is designed directly after the MPI library by
- * Michael Fromberger but has been written from scratch with
- * additional optimizations in place.
- *
- * The library is free for all purposes without any express
- * guarantee it works.
- *
- * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
- */
-#include <tommath.h>
-
-int
-mp_invmod (mp_int * a, mp_int * b, mp_int * c)
-{
-  mp_int  x, y, u, v, A, B, C, D;
-  int     res;
-
-  /* b cannot be negative */
-  if (b->sign == MP_NEG) {
-    return MP_VAL;
-  }
-
-  /* if the modulus is odd we can use a faster routine instead */
-  if (mp_iseven (b) == 0) {
-    return fast_mp_invmod (a, b, c);
-  }
-
-  if ((res = mp_init (&x)) != MP_OKAY) {
-    goto __ERR;
-  }
-
-  if ((res = mp_init (&y)) != MP_OKAY) {
-    goto __X;
-  }
-
-  if ((res = mp_init (&u)) != MP_OKAY) {
-    goto __Y;
-  }
-
-  if ((res = mp_init (&v)) != MP_OKAY) {
-    goto __U;
-  }
-
-  if ((res = mp_init (&A)) != MP_OKAY) {
-    goto __V;
-  }
-
-  if ((res = mp_init (&B)) != MP_OKAY) {
-    goto __A;
-  }
-
-  if ((res = mp_init (&C)) != MP_OKAY) {
-    goto __B;
-  }
-
-  if ((res = mp_init (&D)) != MP_OKAY) {
-    goto __C;
-  }
-
-  /* x = a, y = b */
-  if ((res = mp_copy (a, &x)) != MP_OKAY) {
-    goto __D;
-  }
-  if ((res = mp_copy (b, &y)) != MP_OKAY) {
-    goto __D;
-  }
-
-  if ((res = mp_abs (&x, &x)) != MP_OKAY) {
-    goto __D;
-  }
-
-  /* 2. [modified] if x,y are both even then return an error! */
-  if (mp_iseven (&x) == 1 && mp_iseven (&y) == 1) {
-    res = MP_VAL;
-    goto __D;
-  }
-
-  /* 3. u=x, v=y, A=1, B=0, C=0,D=1 */
-  if ((res = mp_copy (&x, &u)) != MP_OKAY) {
-    goto __D;
-  }
-  if ((res = mp_copy (&y, &v)) != MP_OKAY) {
-    goto __D;
-  }
-  mp_set (&A, 1);
-  mp_set (&D, 1);
-
-
-top:
-  /* 4.  while u is even do */
-  while (mp_iseven (&u) == 1) {
-    /* 4.1 u = u/2 */
-    if ((res = mp_div_2 (&u, &u)) != MP_OKAY) {
-      goto __D;
-    }
-    /* 4.2 if A or B is odd then */
-    if (mp_iseven (&A) == 0 || mp_iseven (&B) == 0) {
-      /* A = (A+y)/2, B = (B-x)/2 */
-      if ((res = mp_add (&A, &y, &A)) != MP_OKAY) {
-	goto __D;
-      }
-      if ((res = mp_sub (&B, &x, &B)) != MP_OKAY) {
-	goto __D;
-      }
-    }
-    /* A = A/2, B = B/2 */
-    if ((res = mp_div_2 (&A, &A)) != MP_OKAY) {
-      goto __D;
-    }
-    if ((res = mp_div_2 (&B, &B)) != MP_OKAY) {
-      goto __D;
-    }
-  }
-
-
-  /* 5.  while v is even do */
-  while (mp_iseven (&v) == 1) {
-    /* 5.1 v = v/2 */
-    if ((res = mp_div_2 (&v, &v)) != MP_OKAY) {
-      goto __D;
-    }
-    /* 5.2 if C,D are even then */
-    if (mp_iseven (&C) == 0 || mp_iseven (&D) == 0) {
-      /* C = (C+y)/2, D = (D-x)/2 */
-      if ((res = mp_add (&C, &y, &C)) != MP_OKAY) {
-	goto __D;
-      }
-      if ((res = mp_sub (&D, &x, &D)) != MP_OKAY) {
-	goto __D;
-      }
-    }
-    /* C = C/2, D = D/2 */
-    if ((res = mp_div_2 (&C, &C)) != MP_OKAY) {
-      goto __D;
-    }
-    if ((res = mp_div_2 (&D, &D)) != MP_OKAY) {
-      goto __D;
-    }
-  }
-
-  /* 6.  if u >= v then */
-  if (mp_cmp (&u, &v) != MP_LT) {
-    /* u = u - v, A = A - C, B = B - D */
-    if ((res = mp_sub (&u, &v, &u)) != MP_OKAY) {
-      goto __D;
-    }
-
-    if ((res = mp_sub (&A, &C, &A)) != MP_OKAY) {
-      goto __D;
-    }
-
-    if ((res = mp_sub (&B, &D, &B)) != MP_OKAY) {
-      goto __D;
-    }
-  } else {
-    /* v - v - u, C = C - A, D = D - B */
-    if ((res = mp_sub (&v, &u, &v)) != MP_OKAY) {
-      goto __D;
-    }
-
-    if ((res = mp_sub (&C, &A, &C)) != MP_OKAY) {
-      goto __D;
-    }
-
-    if ((res = mp_sub (&D, &B, &D)) != MP_OKAY) {
-      goto __D;
-    }
-  }
-
-  /* if not zero goto step 4 */
-  if (mp_iszero (&u) == 0)
-    goto top;
-
-  /* now a = C, b = D, gcd == g*v */
-
-  /* if v != 1 then there is no inverse */
-  if (mp_cmp_d (&v, 1) != MP_EQ) {
-    res = MP_VAL;
-    goto __D;
-  }
-
-  /* a is now the inverse */
-  mp_exch (&C, c);
-  res = MP_OKAY;
-
-__D:mp_clear (&D);
-__C:mp_clear (&C);
-__B:mp_clear (&B);
-__A:mp_clear (&A);
-__V:mp_clear (&v);
-__U:mp_clear (&u);
-__Y:mp_clear (&y);
-__X:mp_clear (&x);
-__ERR:
-  return res;
-}
-
-/* End: bn_mp_invmod.c */
-
-/* Start: bn_mp_jacobi.c */
-/* LibTomMath, multiple-precision integer library -- Tom St Denis
- *
- * LibTomMath is library that provides for multiple-precision
- * integer arithmetic as well as number theoretic functionality.
- *
- * The library is designed directly after the MPI library by
- * Michael Fromberger but has been written from scratch with
- * additional optimizations in place.
- *
- * The library is free for all purposes without any express
- * guarantee it works.
- *
- * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
- */
-#include <tommath.h>
-
-/* computes the jacobi c = (a | n) (or Legendre if b is prime)
- * HAC pp. 73 Algorithm 2.149
- */
-int
-mp_jacobi (mp_int * a, mp_int * n, int *c)
-{
-  mp_int  a1, n1, e;
-  int     s, r, res;
-  mp_digit residue;
-
-  /* step 1.  if a == 0, return 0 */
-  if (mp_iszero (a) == 1) {
-    *c = 0;
-    return MP_OKAY;
-  }
-
-  /* step 2.  if a == 1, return 1 */
-  if (mp_cmp_d (a, 1) == MP_EQ) {
-    *c = 1;
-    return MP_OKAY;
-  }
-
-  /* default */
-  s = 0;
-
-  /* step 3.  write a = a1 * 2^e  */
-  if ((res = mp_init_copy (&a1, a)) != MP_OKAY) {
-    return res;
-  }
-
-  if ((res = mp_init (&n1)) != MP_OKAY) {
-    goto __A1;
-  }
-
-  if ((res = mp_init (&e)) != MP_OKAY) {
-    goto __N1;
-  }
-
-  while (mp_iseven (&a1) == 1) {
-    if ((res = mp_add_d (&e, 1, &e)) != MP_OKAY) {
-      goto __E;
-    }
-
-    if ((res = mp_div_2 (&a1, &a1)) != MP_OKAY) {
-      goto __E;
-    }
-  }
-
-  /* step 4.  if e is even set s=1 */
-  if (mp_iseven (&e) == 1) {
-    s = 1;
-  } else {
-    /* else set s=1 if n = 1/7 (mod 8) or s=-1 if n = 3/5 (mod 8) */
-    if ((res = mp_mod_d (n, 8, &residue)) != MP_OKAY) {
-      goto __E;
-    }
-
-    if (residue == 1 || residue == 7) {
-      s = 1;
-    } else if (residue == 3 || residue == 5) {
-      s = -1;
-    }
-  }
-
-  /* step 5.  if n == 3 (mod 4) *and* a1 == 3 (mod 4) then s = -s */
-  if ((res = mp_mod_d (n, 4, &residue)) != MP_OKAY) {
-    goto __E;
-  }
-  if (residue == 3) {
-    if ((res = mp_mod_d (&a1, 4, &residue)) != MP_OKAY) {
-      goto __E;
-    }
-    if (residue == 3) {
-      s = -s;
-    }
-  }
-
-  /* if a1 == 1 we're done */
-  if (mp_cmp_d (&a1, 1) == MP_EQ) {
-    *c = s;
-  } else {
-    /* n1 = n mod a1 */
-    if ((res = mp_mod (n, &a1, &n1)) != MP_OKAY) {
-      goto __E;
-    }
-    if ((res = mp_jacobi (&n1, &a1, &r)) != MP_OKAY) {
-      goto __E;
-    }
-    *c = s * r;
-  }
-
-  /* done */
-  res = MP_OKAY;
-__E:mp_clear (&e);
-__N1:mp_clear (&n1);
-__A1:mp_clear (&a1);
-  return res;
-}
-
-/* End: bn_mp_jacobi.c */
-
-/* Start: bn_mp_karatsuba_mul.c */
-/* LibTomMath, multiple-precision integer library -- Tom St Denis
- *
- * LibTomMath is library that provides for multiple-precision
- * integer arithmetic as well as number theoretic functionality.
- *
- * The library is designed directly after the MPI library by
- * Michael Fromberger but has been written from scratch with
- * additional optimizations in place.
- *
- * The library is free for all purposes without any express
- * guarantee it works.
- *
- * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
- */
-#include <tommath.h>
-
-/* c = |a| * |b| using Karatsuba Multiplication using three half size multiplications
- *
- * Let B represent the radix [e.g. 2**DIGIT_BIT] and let n represent half of the number of digits in the min(a,b)
- *
- * a = a1 * B^n + a0
- * b = b1 * B^n + b0
- *
- * Then, a * b => a1b1 * B^2n + ((a1 - b1)(a0 - b0) + a0b0 + a1b1) * B + a0b0
- *
- * Note that a1b1 and a0b0 are used twice and only need to be computed once.  So in total
- * three half size (half # of digit) multiplications are performed, a0b0, a1b1 and (a1-b1)(a0-b0)
- *
- * Note that a multiplication of half the digits requires 1/4th the number of single precision 
- * multiplications so in total after one call 25% of the single precision multiplications are saved.
- * Note also that the call to mp_mul can end up back in this function if the a0, a1, b0, or b1 are above
- * the threshold.  This is known as divide-and-conquer and leads to the famous O(N^lg(3)) or O(N^1.584) work which
- * is asymptopically lower than the standard O(N^2) that the baseline/comba methods use.  Generally though the 
- * overhead of this method doesn't pay off until a certain size (N ~ 80) is reached.
- */
-int
-mp_karatsuba_mul (mp_int * a, mp_int * b, mp_int * c)
-{
-  mp_int  x0, x1, y0, y1, t1, t2, x0y0, x1y1;
-  int     B, err;
-
-  err = MP_MEM;
-
-  /* min # of digits */
-  B = MIN (a->used, b->used);
-
-  /* now divide in two */
-  B = B / 2;
-
-  /* init copy all the temps */
-  if (mp_init_size (&x0, B) != MP_OKAY)
-    goto ERR;
-  if (mp_init_size (&x1, a->used - B) != MP_OKAY)
-    goto X0;
-  if (mp_init_size (&y0, B) != MP_OKAY)
-    goto X1;
-  if (mp_init_size (&y1, b->used - B) != MP_OKAY)
-    goto Y0;
-
-  /* init temps */
-  if (mp_init_size (&t1, B * 2) != MP_OKAY)
-    goto Y1;
-  if (mp_init_size (&t2, B * 2) != MP_OKAY)
-    goto T1;
-  if (mp_init_size (&x0y0, B * 2) != MP_OKAY)
-    goto T2;
-  if (mp_init_size (&x1y1, B * 2) != MP_OKAY)
-    goto X0Y0;
-
-  /* now shift the digits */
-  x0.sign = x1.sign = a->sign;
-  y0.sign = y1.sign = b->sign;
-
-  x0.used = y0.used = B;
-  x1.used = a->used - B;
-  y1.used = b->used - B;
-
-  {
-    register int x;
-    register mp_digit *tmpa, *tmpb, *tmpx, *tmpy;
-
-    /* we copy the digits directly instead of using higher level functions
-     * since we also need to shift the digits
-     */
-    tmpa = a->dp;
-    tmpb = b->dp;
-
-    tmpx = x0.dp;
-    tmpy = y0.dp;
-    for (x = 0; x < B; x++) {
-      *tmpx++ = *tmpa++;
-      *tmpy++ = *tmpb++;
-    }
-
-    tmpx = x1.dp;
-    for (x = B; x < a->used; x++) {
-      *tmpx++ = *tmpa++;
-    }
-
-    tmpy = y1.dp;
-    for (x = B; x < b->used; x++) {
-      *tmpy++ = *tmpb++;
-    }
-  }
-
-  /* only need to clamp the lower words since by definition the upper words x1/y1 must
-   * have a known number of digits
-   */
-  mp_clamp (&x0);
-  mp_clamp (&y0);
-
-  /* now calc the products x0y0 and x1y1 */
-  if (mp_mul (&x0, &y0, &x0y0) != MP_OKAY)
-    goto X1Y1;			/* x0y0 = x0*y0 */
-  if (mp_mul (&x1, &y1, &x1y1) != MP_OKAY)
-    goto X1Y1;			/* x1y1 = x1*y1 */
-
-  /* now calc x1-x0 and y1-y0 */
-  if (mp_sub (&x1, &x0, &t1) != MP_OKAY)
-    goto X1Y1;			/* t1 = x1 - x0 */
-  if (mp_sub (&y1, &y0, &t2) != MP_OKAY)
-    goto X1Y1;			/* t2 = y1 - y0 */
-  if (mp_mul (&t1, &t2, &t1) != MP_OKAY)
-    goto X1Y1;			/* t1 = (x1 - x0) * (y1 - y0) */
-
-  /* add x0y0 */
-  if (mp_add (&x0y0, &x1y1, &t2) != MP_OKAY)
-    goto X1Y1;			/* t2 = x0y0 + x1y1 */
-  if (mp_sub (&t2, &t1, &t1) != MP_OKAY)
-    goto X1Y1;			/* t1 = x0y0 + x1y1 - (x1-x0)*(y1-y0) */
-
-  /* shift by B */
-  if (mp_lshd (&t1, B) != MP_OKAY)
-    goto X1Y1;			/* t1 = (x0y0 + x1y1 - (x1-x0)*(y1-y0))<<B */
-  if (mp_lshd (&x1y1, B * 2) != MP_OKAY)
-    goto X1Y1;			/* x1y1 = x1y1 << 2*B */
-
-  if (mp_add (&x0y0, &t1, &t1) != MP_OKAY)
-    goto X1Y1;			/* t1 = x0y0 + t1 */
-  if (mp_add (&t1, &x1y1, c) != MP_OKAY)
-    goto X1Y1;			/* t1 = x0y0 + t1 + x1y1 */
-
-  err = MP_OKAY;
-
-X1Y1:mp_clear (&x1y1);
-X0Y0:mp_clear (&x0y0);
-T2:mp_clear (&t2);
-T1:mp_clear (&t1);
-Y1:mp_clear (&y1);
-Y0:mp_clear (&y0);
-X1:mp_clear (&x1);
-X0:mp_clear (&x0);
-ERR:
-  return err;
-}
-
-/* End: bn_mp_karatsuba_mul.c */
-
-/* Start: bn_mp_karatsuba_sqr.c */
-/* LibTomMath, multiple-precision integer library -- Tom St Denis
- *
- * LibTomMath is library that provides for multiple-precision
- * integer arithmetic as well as number theoretic functionality.
- *
- * The library is designed directly after the MPI library by
- * Michael Fromberger but has been written from scratch with
- * additional optimizations in place.
- *
- * The library is free for all purposes without any express
- * guarantee it works.
- *
- * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
- */
-#include <tommath.h>
-
-/* Karatsuba squaring, computes b = a*a using three half size squarings
- *
- * See comments of mp_karatsuba_mul for details.  It is essentially the same algorithm
- * but merely tuned to perform recursive squarings.
- */
-int
-mp_karatsuba_sqr (mp_int * a, mp_int * b)
-{
-  mp_int  x0, x1, t1, t2, x0x0, x1x1;
-  int     B, err;
-
-  err = MP_MEM;
-
-  /* min # of digits */
-  B = a->used;
-
-  /* now divide in two */
-  B = B / 2;
-
-  /* init copy all the temps */
-  if (mp_init_size (&x0, B) != MP_OKAY)
-    goto ERR;
-  if (mp_init_size (&x1, a->used - B) != MP_OKAY)
-    goto X0;
-
-  /* init temps */
-  if (mp_init_size (&t1, a->used * 2) != MP_OKAY)
-    goto X1;
-  if (mp_init_size (&t2, a->used * 2) != MP_OKAY)
-    goto T1;
-  if (mp_init_size (&x0x0, B * 2) != MP_OKAY)
-    goto T2;
-  if (mp_init_size (&x1x1, (a->used - B) * 2) != MP_OKAY)
-    goto X0X0;
-
-  {
-    register int x;
-    register mp_digit *dst, *src;
-
-    src = a->dp;
-
-    /* now shift the digits */
-    dst = x0.dp;
-    for (x = 0; x < B; x++) {
-      *dst++ = *src++;
-    }
-
-    dst = x1.dp;
-    for (x = B; x < a->used; x++) {
-      *dst++ = *src++;
-    }
-  }
-
-  x0.used = B;
-  x1.used = a->used - B;
-
-  mp_clamp (&x0);
-
-  /* now calc the products x0*x0 and x1*x1 */
-  if (mp_sqr (&x0, &x0x0) != MP_OKAY)
-    goto X1X1;			/* x0x0 = x0*x0 */
-  if (mp_sqr (&x1, &x1x1) != MP_OKAY)
-    goto X1X1;			/* x1x1 = x1*x1 */
-
-  /* now calc x1-x0 and y1-y0 */
-  if (mp_sub (&x1, &x0, &t1) != MP_OKAY)
-    goto X1X1;			/* t1 = x1 - x0 */
-  if (mp_sqr (&t1, &t1) != MP_OKAY)
-    goto X1X1;			/* t1 = (x1 - x0) * (y1 - y0) */
-
-  /* add x0y0 */
-  if (s_mp_add (&x0x0, &x1x1, &t2) != MP_OKAY)
-    goto X1X1;			/* t2 = x0y0 + x1y1 */
-  if (mp_sub (&t2, &t1, &t1) != MP_OKAY)
-    goto X1X1;			/* t1 = x0y0 + x1y1 - (x1-x0)*(y1-y0) */
-
-  /* shift by B */
-  if (mp_lshd (&t1, B) != MP_OKAY)
-    goto X1X1;			/* t1 = (x0y0 + x1y1 - (x1-x0)*(y1-y0))<<B */
-  if (mp_lshd (&x1x1, B * 2) != MP_OKAY)
-    goto X1X1;			/* x1y1 = x1y1 << 2*B */
-
-  if (mp_add (&x0x0, &t1, &t1) != MP_OKAY)
-    goto X1X1;			/* t1 = x0y0 + t1 */
-  if (mp_add (&t1, &x1x1, b) != MP_OKAY)
-    goto X1X1;			/* t1 = x0y0 + t1 + x1y1 */
-
-  err = MP_OKAY;
-
-X1X1:mp_clear (&x1x1);
-X0X0:mp_clear (&x0x0);
-T2:mp_clear (&t2);
-T1:mp_clear (&t1);
-X1:mp_clear (&x1);
-X0:mp_clear (&x0);
-ERR:
-  return err;
-}
-
-/* End: bn_mp_karatsuba_sqr.c */
-
-/* Start: bn_mp_lcm.c */
-/* LibTomMath, multiple-precision integer library -- Tom St Denis
- *
- * LibTomMath is library that provides for multiple-precision
- * integer arithmetic as well as number theoretic functionality.
- *
- * The library is designed directly after the MPI library by
- * Michael Fromberger but has been written from scratch with
- * additional optimizations in place.
- *
- * The library is free for all purposes without any express
- * guarantee it works.
- *
- * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
- */
-#include <tommath.h>
-
-/* computes least common multiple as a*b/(a, b) */
-int
-mp_lcm (mp_int * a, mp_int * b, mp_int * c)
-{
-  int     res;
-  mp_int  t;
-
-
-  if ((res = mp_init (&t)) != MP_OKAY) {
-    return res;
-  }
-
-  if ((res = mp_mul (a, b, &t)) != MP_OKAY) {
-    mp_clear (&t);
-    return res;
-  }
-
-  if ((res = mp_gcd (a, b, c)) != MP_OKAY) {
-    mp_clear (&t);
-    return res;
-  }
-
-  res = mp_div (&t, c, c, NULL);
-  mp_clear (&t);
-  return res;
-}
-
-/* End: bn_mp_lcm.c */
-
-/* Start: bn_mp_lshd.c */
-/* LibTomMath, multiple-precision integer library -- Tom St Denis
- *
- * LibTomMath is library that provides for multiple-precision
- * integer arithmetic as well as number theoretic functionality.
- *
- * The library is designed directly after the MPI library by
- * Michael Fromberger but has been written from scratch with
- * additional optimizations in place.
- *
- * The library is free for all purposes without any express
- * guarantee it works.
- *
- * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
- */
-#include <tommath.h>
-
-/* shift left a certain amount of digits */
-int
-mp_lshd (mp_int * a, int b)
-{
-  int     x, res;
-
-
-  /* if its less than zero return */
-  if (b <= 0) {
-    return MP_OKAY;
-  }
-
-  /* grow to fit the new digits */
-  if ((res = mp_grow (a, a->used + b)) != MP_OKAY) {
-    return res;
-  }
-
-  {
-    register mp_digit *tmpa, *tmpaa;
-
-    /* increment the used by the shift amount than copy upwards */
-    a->used += b;
-
-    /* top */
-    tmpa = a->dp + a->used - 1;
-
-    /* base */
-    tmpaa = a->dp + a->used - 1 - b;
-
-    /* much like mp_rshd this is implemented using a sliding window
-     * except the window goes the otherway around.  Copying from
-     * the bottom to the top.  see bn_mp_rshd.c for more info.
-     */
-    for (x = a->used - 1; x >= b; x--) {
-      *tmpa-- = *tmpaa--;
-    }
-
-    /* zero the lower digits */
-    tmpa = a->dp;
-    for (x = 0; x < b; x++) {
-      *tmpa++ = 0;
-    }
-  }
-  return MP_OKAY;
-}
-
-/* End: bn_mp_lshd.c */
-
-/* Start: bn_mp_mod.c */
-/* LibTomMath, multiple-precision integer library -- Tom St Denis
- *
- * LibTomMath is library that provides for multiple-precision
- * integer arithmetic as well as number theoretic functionality.
- *
- * The library is designed directly after the MPI library by
- * Michael Fromberger but has been written from scratch with
- * additional optimizations in place.
- *
- * The library is free for all purposes without any express
- * guarantee it works.
- *
- * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
- */
-#include <tommath.h>
-
-/* c = a mod b, 0 <= c < b */
-int
-mp_mod (mp_int * a, mp_int * b, mp_int * c)
-{
-  mp_int  t;
-  int     res;
-
-
-  if ((res = mp_init (&t)) != MP_OKAY) {
-    return res;
-  }
-
-  if ((res = mp_div (a, b, NULL, &t)) != MP_OKAY) {
-    mp_clear (&t);
-    return res;
-  }
-
-  if (t.sign == MP_NEG) {
-    res = mp_add (b, &t, c);
-  } else {
-    res = MP_OKAY;
-    mp_exch (&t, c);
-  }
-
-  mp_clear (&t);
-  return res;
-}
-
-/* End: bn_mp_mod.c */
-
-/* Start: bn_mp_mod_2d.c */
-/* LibTomMath, multiple-precision integer library -- Tom St Denis
- *
- * LibTomMath is library that provides for multiple-precision
- * integer arithmetic as well as number theoretic functionality.
- *
- * The library is designed directly after the MPI library by
- * Michael Fromberger but has been written from scratch with
- * additional optimizations in place.
- *
- * The library is free for all purposes without any express
- * guarantee it works.
- *
- * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
- */
-#include <tommath.h>
-
-/* calc a value mod 2^b */
-int
-mp_mod_2d (mp_int * a, int b, mp_int * c)
-{
-  int     x, res;
-
-
-  /* if b is <= 0 then zero the int */
-  if (b <= 0) {
-    mp_zero (c);
-    return MP_OKAY;
-  }
-
-  /* if the modulus is larger than the value than return */
-  if (b > (int) (a->used * DIGIT_BIT)) {
-    res = mp_copy (a, c);
-    return res;
-  }
-
-  /* copy */
-  if ((res = mp_copy (a, c)) != MP_OKAY) {
-    return res;
-  }
-
-  /* zero digits above the last digit of the modulus */
-  for (x = (b / DIGIT_BIT) + ((b % DIGIT_BIT) == 0 ? 0 : 1); x < c->used; x++) {
-    c->dp[x] = 0;
-  }
-  /* clear the digit that is not completely outside/inside the modulus */
-  c->dp[b / DIGIT_BIT] &=
-    (mp_digit) ((((mp_digit) 1) << (((mp_digit) b) % DIGIT_BIT)) - ((mp_digit) 1));
-  mp_clamp (c);
-  return MP_OKAY;
-}
-
-/* End: bn_mp_mod_2d.c */
-
-/* Start: bn_mp_mod_d.c */
-/* LibTomMath, multiple-precision integer library -- Tom St Denis
- *
- * LibTomMath is library that provides for multiple-precision
- * integer arithmetic as well as number theoretic functionality.
- *
- * The library is designed directly after the MPI library by
- * Michael Fromberger but has been written from scratch with
- * additional optimizations in place.
- *
- * The library is free for all purposes without any express
- * guarantee it works.
- *
- * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
- */
-#include <tommath.h>
-
-int
-mp_mod_d (mp_int * a, mp_digit b, mp_digit * c)
-{
-  mp_int  t, t2;
-  int     res;
-
-
-  if ((res = mp_init (&t)) != MP_OKAY) {
-    return res;
-  }
-
-  if ((res = mp_init (&t2)) != MP_OKAY) {
-    mp_clear (&t);
-    return res;
-  }
-
-  mp_set (&t, b);
-  mp_div (a, &t, NULL, &t2);
-
-  if (t2.sign == MP_NEG) {
-    if ((res = mp_add_d (&t2, b, &t2)) != MP_OKAY) {
-      mp_clear (&t);
-      mp_clear (&t2);
-      return res;
-    }
-  }
-  *c = t2.dp[0];
-  mp_clear (&t);
-  mp_clear (&t2);
-  return MP_OKAY;
-}
-
-/* End: bn_mp_mod_d.c */
-
-/* Start: bn_mp_montgomery_calc_normalization.c */
-/* LibTomMath, multiple-precision integer library -- Tom St Denis
- *
- * LibTomMath is library that provides for multiple-precision
- * integer arithmetic as well as number theoretic functionality.
- *
- * The library is designed directly after the MPI library by
- * Michael Fromberger but has been written from scratch with
- * additional optimizations in place.
- *
- * The library is free for all purposes without any express
- * guarantee it works.
- *
- * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
- */
-#include <tommath.h>
-
-/* calculates a = B^n mod b for Montgomery reduction
- * Where B is the base [e.g. 2^DIGIT_BIT].  
- * B^n mod b is computed by first computing
- * A = B^(n-1) which doesn't require a reduction but a simple OR.
- * then C = A * B = B^n is computed by performing upto DIGIT_BIT 
- * shifts with subtractions when the result is greater than b.
- *
- * The method is slightly modified to shift B unconditionally upto just under
- * the leading bit of b.  This saves alot of multiple precision shifting.
- */
-int
-mp_montgomery_calc_normalization (mp_int * a, mp_int * b)
-{
-  int     x, bits, res;
-
-  /* how many bits of last digit does b use */
-  bits = mp_count_bits (b) % DIGIT_BIT;
-
-  /* compute A = B^(n-1) * 2^(bits-1) */
-  if ((res = mp_2expt (a, (b->used - 1) * DIGIT_BIT + bits - 1)) != MP_OKAY) {
-    return res;
-  }
-
-  /* now compute C = A * B mod b */
-  for (x = bits - 1; x < DIGIT_BIT; x++) {
-    if ((res = mp_mul_2 (a, a)) != MP_OKAY) {
-      return res;
-    }
-    if (mp_cmp_mag (a, b) != MP_LT) {
-      if ((res = s_mp_sub (a, b, a)) != MP_OKAY) {
-	return res;
-      }
-    }
-  }
-
-  return MP_OKAY;
-}
-
-/* End: bn_mp_montgomery_calc_normalization.c */
-
-/* Start: bn_mp_montgomery_reduce.c */
-/* LibTomMath, multiple-precision integer library -- Tom St Denis
- *
- * LibTomMath is library that provides for multiple-precision
- * integer arithmetic as well as number theoretic functionality.
- *
- * The library is designed directly after the MPI library by
- * Michael Fromberger but has been written from scratch with
- * additional optimizations in place.
- *
- * The library is free for all purposes without any express
- * guarantee it works.
- *
- * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
- */
-#include <tommath.h>
-
-/* computes xR^-1 == x (mod N) via Montgomery Reduction */
-int
-mp_montgomery_reduce (mp_int * a, mp_int * m, mp_digit mp)
-{
-  int     ix, res, digs;
-  mp_digit ui;
-
-  digs = m->used * 2 + 1;
-  if ((digs < 512)
-      && digs < (1 << ((CHAR_BIT * sizeof (mp_word)) - (2 * DIGIT_BIT)))) {
-    return fast_mp_montgomery_reduce (a, m, mp);
-  }
-
-  if (a->alloc < m->used * 2 + 1) {
-    if ((res = mp_grow (a, m->used * 2 + 1)) != MP_OKAY) {
-      return res;
-    }
-  }
-  a->used = m->used * 2 + 1;
-
-  for (ix = 0; ix < m->used; ix++) {
-    /* ui = ai * m' mod b */
-    ui = (a->dp[ix] * mp) & MP_MASK;
-
-    /* a = a + ui * m * b^i */
-    {
-      register int iy;
-      register mp_digit *tmpx, *tmpy, mu;
-      register mp_word r;
-
-      /* aliases */
-      tmpx = m->dp;
-      tmpy = a->dp + ix;
-
-      mu = 0;
-      for (iy = 0; iy < m->used; iy++) {
-	r = ((mp_word) ui) * ((mp_word) * tmpx++) + ((mp_word) mu) + ((mp_word) * tmpy);
-	mu = (r >> ((mp_word) DIGIT_BIT));
-	*tmpy++ = (r & ((mp_word) MP_MASK));
-      }
-      /* propagate carries */
-      while (mu) {
-	*tmpy += mu;
-	mu = (*tmpy >> DIGIT_BIT) & 1;
-	*tmpy++ &= MP_MASK;
-      }
-    }
-  }
-
-  /* A = A/b^n */
-  mp_rshd (a, m->used);
-
-  /* if A >= m then A = A - m */
-  if (mp_cmp_mag (a, m) != MP_LT) {
-    return s_mp_sub (a, m, a);
-  }
-
-  return MP_OKAY;
-}
-
-/* End: bn_mp_montgomery_reduce.c */
-
-/* Start: bn_mp_montgomery_setup.c */
-/* LibTomMath, multiple-precision integer library -- Tom St Denis
- *
- * LibTomMath is library that provides for multiple-precision
- * integer arithmetic as well as number theoretic functionality.
- *
- * The library is designed directly after the MPI library by
- * Michael Fromberger but has been written from scratch with
- * additional optimizations in place.
- *
- * The library is free for all purposes without any express
- * guarantee it works.
- *
- * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
- */
-#include <tommath.h>
-
-/* setups the montgomery reduction stuff */
-int
-mp_montgomery_setup (mp_int * a, mp_digit * mp)
-{
-  unsigned long x, b;
-
-/* fast inversion mod 2^32 
- *
- * Based on the fact that 
- *
- * XA = 1 (mod 2^n)  =>  (X(2-XA)) A = 1 (mod 2^2n)
- *                   =>  2*X*A - X*X*A*A = 1
- *                   =>  2*(1) - (1)     = 1
- */
-  b = a->dp[0];
-
-  if ((b & 1) == 0) {
-    return MP_VAL;
-  }
-
-  x = (((b + 2) & 4) << 1) + b;	/* here x*a==1 mod 2^4 */
-  x *= 2 - b * x;		/* here x*a==1 mod 2^8 */
-  x *= 2 - b * x;		/* here x*a==1 mod 2^16; each step doubles the nb of bits */
-  x *= 2 - b * x;		/* here x*a==1 mod 2^32 */
-
-  /* t = -1/m mod b */
-  *mp = ((mp_digit) 1 << ((mp_digit) DIGIT_BIT)) - (x & MP_MASK);
-
-  return MP_OKAY;
-}
-
-/* End: bn_mp_montgomery_setup.c */
-
-/* Start: bn_mp_mul.c */
-/* LibTomMath, multiple-precision integer library -- Tom St Denis
- *
- * LibTomMath is library that provides for multiple-precision
- * integer arithmetic as well as number theoretic functionality.
- *
- * The library is designed directly after the MPI library by
- * Michael Fromberger but has been written from scratch with
- * additional optimizations in place.
- *
- * The library is free for all purposes without any express
- * guarantee it works.
- *
- * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
- */
-#include <tommath.h>
-
-/* high level multiplication (handles sign) */
-int
-mp_mul (mp_int * a, mp_int * b, mp_int * c)
-{
-  int     res, neg;
-  neg = (a->sign == b->sign) ? MP_ZPOS : MP_NEG;
-  if (MIN (a->used, b->used) > KARATSUBA_MUL_CUTOFF) {
-    res = mp_karatsuba_mul (a, b, c);
-  } else {
-
-    /* can we use the fast multiplier? 
-     *
-     * The fast multiplier can be used if the output will have less than 
-     * 512 digits and the number of digits won't affect carry propagation
-     */
-    int     digs = a->used + b->used + 1;
-
-    if ((digs < 512)
-	&& digs < (1 << ((CHAR_BIT * sizeof (mp_word)) - (2 * DIGIT_BIT)))) {
-      res = fast_s_mp_mul_digs (a, b, c, digs);
-    } else {
-      res = s_mp_mul (a, b, c);
-    }
-
-  }
-  c->sign = neg;
-  return res;
-}
-
-/* End: bn_mp_mul.c */
-
-/* Start: bn_mp_mulmod.c */
-/* LibTomMath, multiple-precision integer library -- Tom St Denis
- *
- * LibTomMath is library that provides for multiple-precision
- * integer arithmetic as well as number theoretic functionality.
- *
- * The library is designed directly after the MPI library by
- * Michael Fromberger but has been written from scratch with
- * additional optimizations in place.
- *
- * The library is free for all purposes without any express
- * guarantee it works.
- *
- * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
- */
-#include <tommath.h>
-
-/* d = a * b (mod c) */
-int
-mp_mulmod (mp_int * a, mp_int * b, mp_int * c, mp_int * d)
-{
-  int     res;
-  mp_int  t;
-
-
-  if ((res = mp_init (&t)) != MP_OKAY) {
-    return res;
-  }
-
-  if ((res = mp_mul (a, b, &t)) != MP_OKAY) {
-    mp_clear (&t);
-    return res;
-  }
-  res = mp_mod (&t, c, d);
-  mp_clear (&t);
-  return res;
-}
-
-/* End: bn_mp_mulmod.c */
-
-/* Start: bn_mp_mul_2.c */
-/* LibTomMath, multiple-precision integer library -- Tom St Denis
- *
- * LibTomMath is library that provides for multiple-precision
- * integer arithmetic as well as number theoretic functionality.
- *
- * The library is designed directly after the MPI library by
- * Michael Fromberger but has been written from scratch with
- * additional optimizations in place.
- *
- * The library is free for all purposes without any express
- * guarantee it works.
- *
- * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
- */
-#include <tommath.h>
-
-/* b = a*2 */
-int
-mp_mul_2 (mp_int * a, mp_int * b)
-{
-  int     x, res, oldused;
-
-  /* Optimization: should copy and shift at the same time */
-
-  if (b->alloc < a->used) {
-    if ((res = mp_grow (b, a->used)) != MP_OKAY) {
-      return res;
-    }
-  }
-
-  oldused = b->used;
-  b->used = a->used;
-
-  /* shift any bit count < DIGIT_BIT */
-  {
-    register mp_digit r, rr, *tmpa, *tmpb;
-
-    /* alias for source */
-    tmpa = a->dp;
-    
-    /* alias for dest */
-    tmpb = b->dp;
-
-    /* carry */
-    r = 0;
-    for (x = 0; x < b->used; x++) {
-    
-      /* get what will be the *next* carry bit from the MSB of the current digit */
-      rr = *tmpa >> (DIGIT_BIT - 1);
-      
-      /* now shift up this digit, add in the carry [from the previous] */
-      *tmpb++ = ((*tmpa++ << 1) | r) & MP_MASK;
-      
-      /* copy the carry that would be from the source digit into the next iteration */
-      r = rr;
-    }
-
-    /* new leading digit? */
-    if (r != 0) {
-      /* do we have to grow to accomodate the new digit? */
-      if (b->alloc == b->used) {
-	if ((res = mp_grow (b, b->used + 1)) != MP_OKAY) {
-	  return res;
-	}
-
-	/* after the grow *tmpb is no longer valid so we have to reset it! 
-	 * (this bug took me about 17 minutes to find...!)
-	 */
-	tmpb = b->dp + b->used;
-      }
-      /* add a MSB which is always 1 at this point */
-      *tmpb = 1;
-      ++b->used;
-    }
-
-    /* now zero any excess digits on the destination that we didn't write to */
-    tmpb = b->dp + b->used;
-    for (x = b->used; x < oldused; x++) {
-      *tmpb++ = 0;
-    }
-  }
-  b->sign = a->sign;
-  return MP_OKAY;
-}
-
-/* End: bn_mp_mul_2.c */
-
-/* Start: bn_mp_mul_2d.c */
-/* LibTomMath, multiple-precision integer library -- Tom St Denis
- *
- * LibTomMath is library that provides for multiple-precision
- * integer arithmetic as well as number theoretic functionality.
- *
- * The library is designed directly after the MPI library by
- * Michael Fromberger but has been written from scratch with
- * additional optimizations in place.
- *
- * The library is free for all purposes without any express
- * guarantee it works.
- *
- * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
- */
-#include <tommath.h>
-
-/* shift left by a certain bit count */
-int
-mp_mul_2d (mp_int * a, int b, mp_int * c)
-{
-  mp_digit d, r, rr;
-  int     x, res;
-
-  /* copy */
-  if ((res = mp_copy (a, c)) != MP_OKAY) {
-    return res;
-  }
-
-  if ((res = mp_grow (c, c->used + b / DIGIT_BIT + 1)) != MP_OKAY) {
-    return res;
-  }
-
-  /* shift by as many digits in the bit count */
-  if (b >= DIGIT_BIT) {
-    if ((res = mp_lshd (c, b / DIGIT_BIT)) != MP_OKAY) {
-      return res;
-    }
-  }
-  c->used = c->alloc;
-
-  /* shift any bit count < DIGIT_BIT */
-  d = (mp_digit) (b % DIGIT_BIT);
-  if (d != 0) {
-    register mp_digit *tmpc, mask;
-    
-    /* bitmask for carries */
-    mask = (1U << d) - 1U;
-    
-    /* alias */
-    tmpc = c->dp;
-    
-    /* carry */
-    r    = 0;
-    for (x = 0; x < c->used; x++) {
-      /* get the higher bits of the current word */
-      rr = (*tmpc >> (DIGIT_BIT - d)) & mask;
-
-      /* shift the current word and OR in the carry */
-      *tmpc = ((*tmpc << d) | r) & MP_MASK;
-      ++tmpc;
-
-      /* set the carry to the carry bits of the current word */
-      r = rr;
-    }
-  }
-  mp_clamp (c);
-  return MP_OKAY;
-}
-
-/* End: bn_mp_mul_2d.c */
-
-/* Start: bn_mp_mul_d.c */
-/* LibTomMath, multiple-precision integer library -- Tom St Denis
- *
- * LibTomMath is library that provides for multiple-precision
- * integer arithmetic as well as number theoretic functionality.
- *
- * The library is designed directly after the MPI library by
- * Michael Fromberger but has been written from scratch with
- * additional optimizations in place.
- *
- * The library is free for all purposes without any express
- * guarantee it works.
- *
- * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
- */
-#include <tommath.h>
-
-/* multiply by a digit */
-int
-mp_mul_d (mp_int * a, mp_digit b, mp_int * c)
-{
-  int     res, pa, olduse;
-
-  pa = a->used;
-  if (c->alloc < pa + 1) {
-    if ((res = mp_grow (c, pa + 1)) != MP_OKAY) {
-      return res;
-    }
-  }
-
-  olduse = c->used;
-  c->used = pa + 1;
-
-  {
-    register mp_digit u, *tmpa, *tmpc;
-    register mp_word r;
-    register int ix;
-
-    tmpc = c->dp + c->used;
-    for (ix = c->used; ix < olduse; ix++) {
-      *tmpc++ = 0;
-    }
-
-    tmpa = a->dp;
-    tmpc = c->dp;
-
-    u = 0;
-    for (ix = 0; ix < pa; ix++) {
-      r = ((mp_word) u) + ((mp_word) * tmpa++) * ((mp_word) b);
-      *tmpc++ = (mp_digit) (r & ((mp_word) MP_MASK));
-      u = (mp_digit) (r >> ((mp_word) DIGIT_BIT));
-    }
-    *tmpc = u;
-  }
-
-  mp_clamp (c);
-  return MP_OKAY;
-}
-
-/* End: bn_mp_mul_d.c */
-
-/* Start: bn_mp_neg.c */
-/* LibTomMath, multiple-precision integer library -- Tom St Denis
- *
- * LibTomMath is library that provides for multiple-precision
- * integer arithmetic as well as number theoretic functionality.
- *
- * The library is designed directly after the MPI library by
- * Michael Fromberger but has been written from scratch with
- * additional optimizations in place.
- *
- * The library is free for all purposes without any express
- * guarantee it works.
- *
- * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
- */
-#include <tommath.h>
-
-/* b = -a */
-int
-mp_neg (mp_int * a, mp_int * b)
-{
-  int     res;
-  if ((res = mp_copy (a, b)) != MP_OKAY) {
-    return res;
-  }
-  b->sign = (a->sign == MP_ZPOS) ? MP_NEG : MP_ZPOS;
-  return MP_OKAY;
-}
-
-/* End: bn_mp_neg.c */
-
-/* Start: bn_mp_n_root.c */
-/* LibTomMath, multiple-precision integer library -- Tom St Denis
- *
- * LibTomMath is library that provides for multiple-precision
- * integer arithmetic as well as number theoretic functionality.
- *
- * The library is designed directly after the MPI library by
- * Michael Fromberger but has been written from scratch with
- * additional optimizations in place.
- *
- * The library is free for all purposes without any express
- * guarantee it works.
- *
- * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
- */
-#include <tommath.h>
-
-/* find the n'th root of an integer 
- *
- * Result found such that (c)^b <= a and (c+1)^b > a 
- *
- * This algorithm uses Newton's approximation x[i+1] = x[i] - f(x[i])/f'(x[i]) 
- * which will find the root in log(N) time where each step involves a fair bit.  This
- * is not meant to find huge roots [square and cube at most].
- */
-int
-mp_n_root (mp_int * a, mp_digit b, mp_int * c)
-{
-  mp_int  t1, t2, t3;
-  int     res, neg;
-
-  /* input must be positive if b is even */
-  if ((b & 1) == 0 && a->sign == MP_NEG) {
-    return MP_VAL;
-  }
-
-  if ((res = mp_init (&t1)) != MP_OKAY) {
-    return res;
-  }
-
-  if ((res = mp_init (&t2)) != MP_OKAY) {
-    goto __T1;
-  }
-
-  if ((res = mp_init (&t3)) != MP_OKAY) {
-    goto __T2;
-  }
-
-  /* if a is negative fudge the sign but keep track */
-  neg = a->sign;
-  a->sign = MP_ZPOS;
-
-  /* t2 = 2 */
-  mp_set (&t2, 2);
-
-  do {
-    /* t1 = t2 */
-    if ((res = mp_copy (&t2, &t1)) != MP_OKAY) {
-      goto __T3;
-    }
-
-    /* t2 = t1 - ((t1^b - a) / (b * t1^(b-1))) */
-    if ((res = mp_expt_d (&t1, b - 1, &t3)) != MP_OKAY) {	/* t3 = t1^(b-1) */
-      goto __T3;
-    }
-
-    /* numerator */
-    if ((res = mp_mul (&t3, &t1, &t2)) != MP_OKAY) {	/* t2 = t1^b */
-      goto __T3;
-    }
-
-    if ((res = mp_sub (&t2, a, &t2)) != MP_OKAY) {	/* t2 = t1^b - a */
-      goto __T3;
-    }
-
-    if ((res = mp_mul_d (&t3, b, &t3)) != MP_OKAY) {	/* t3 = t1^(b-1) * b  */
-      goto __T3;
-    }
-
-    if ((res = mp_div (&t2, &t3, &t3, NULL)) != MP_OKAY) {	/* t3 = (t1^b - a)/(b * t1^(b-1)) */
-      goto __T3;
-    }
-
-    if ((res = mp_sub (&t1, &t3, &t2)) != MP_OKAY) {
-      goto __T3;
-    }
-  }
-  while (mp_cmp (&t1, &t2) != MP_EQ);
-
-  /* result can be off by a few so check */
-  for (;;) {
-    if ((res = mp_expt_d (&t1, b, &t2)) != MP_OKAY) {
-      goto __T3;
-    }
-
-    if (mp_cmp (&t2, a) == MP_GT) {
-      if ((res = mp_sub_d (&t1, 1, &t1)) != MP_OKAY) {
-	goto __T3;
-      }
-    } else {
-      break;
-    }
-  }
-
-  /* reset the sign of a first */
-  a->sign = neg;
-
-  /* set the result */
-  mp_exch (&t1, c);
-
-  /* set the sign of the result */
-  c->sign = neg;
-
-  res = MP_OKAY;
-
-__T3:mp_clear (&t3);
-__T2:mp_clear (&t2);
-__T1:mp_clear (&t1);
-  return res;
-}
-
-/* End: bn_mp_n_root.c */
-
-/* Start: bn_mp_or.c */
-/* LibTomMath, multiple-precision integer library -- Tom St Denis
- *
- * LibTomMath is library that provides for multiple-precision
- * integer arithmetic as well as number theoretic functionality.
- *
- * The library is designed directly after the MPI library by
- * Michael Fromberger but has been written from scratch with
- * additional optimizations in place.
- *
- * The library is free for all purposes without any express
- * guarantee it works.
- *
- * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
- */
-#include <tommath.h>
-
-/* OR two ints together */
-int
-mp_or (mp_int * a, mp_int * b, mp_int * c)
-{
-  int     res, ix, px;
-  mp_int  t, *x;
-
-  if (a->used > b->used) {
-    if ((res = mp_init_copy (&t, a)) != MP_OKAY) {
-      return res;
-    }
-    px = b->used;
-    x = b;
-  } else {
-    if ((res = mp_init_copy (&t, b)) != MP_OKAY) {
-      return res;
-    }
-    px = a->used;
-    x = a;
-  }
-
-  for (ix = 0; ix < px; ix++) {
-    t.dp[ix] |= x->dp[ix];
-  }
-  mp_clamp (&t);
-  mp_exch (c, &t);
-  mp_clear (&t);
-  return MP_OKAY;
-}
-
-/* End: bn_mp_or.c */
-
-/* Start: bn_mp_prime_fermat.c */
-/* LibTomMath, multiple-precision integer library -- Tom St Denis
- *
- * LibTomMath is library that provides for multiple-precision
- * integer arithmetic as well as number theoretic functionality.
- *
- * The library is designed directly after the MPI library by
- * Michael Fromberger but has been written from scratch with
- * additional optimizations in place.
- *
- * The library is free for all purposes without any express
- * guarantee it works.
- *
- * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
- */
-#include <tommath.h>
-
-/* performs one Fermat test.
- * 
- * If "a" were prime then b^a == b (mod a) since the order of
- * the multiplicative sub-group would be phi(a) = a-1.  That means
- * it would be the same as b^(a mod (a-1)) == b^1 == b (mod a).
- *
- * Sets result to 1 if the congruence holds, or zero otherwise.
- */
-int
-mp_prime_fermat (mp_int * a, mp_int * b, int *result)
-{
-  mp_int  t;
-  int     err;
-
-  /* default to fail */
-  *result = 0;
-
-  /* init t */
-  if ((err = mp_init (&t)) != MP_OKAY) {
-    return err;
-  }
-
-  /* compute t = b^a mod a */
-  if ((err = mp_exptmod (b, a, a, &t)) != MP_OKAY) {
-    goto __T;
-  }
-
-  /* is it equal to b? */
-  if (mp_cmp (&t, b) == MP_EQ) {
-    *result = 1;
-  }
-
-  err = MP_OKAY;
-__T:mp_clear (&t);
-  return err;
-}
-
-/* End: bn_mp_prime_fermat.c */
-
-/* Start: bn_mp_prime_is_divisible.c */
-/* LibTomMath, multiple-precision integer library -- Tom St Denis
- *
- * LibTomMath is library that provides for multiple-precision
- * integer arithmetic as well as number theoretic functionality.
- *
- * The library is designed directly after the MPI library by
- * Michael Fromberger but has been written from scratch with
- * additional optimizations in place.
- *
- * The library is free for all purposes without any express
- * guarantee it works.
- *
- * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
- */
-#include <tommath.h>
-
-/* determines if an integers is divisible by one of the first 256 primes or not 
- *
- * sets result to 0 if not, 1 if yes
- */
-int
-mp_prime_is_divisible (mp_int * a, int *result)
-{
-  int     err, ix;
-  mp_digit res;
-
-  /* default to not */
-  *result = 0;
-
-  for (ix = 0; ix < 256; ix++) {
-    /* is it equal to the prime? */
-    if (mp_cmp_d (a, __prime_tab[ix]) == MP_EQ) {
-      *result = 1;
-      return MP_OKAY;
-    }
-
-    /* what is a mod __prime_tab[ix] */
-    if ((err = mp_mod_d (a, __prime_tab[ix], &res)) != MP_OKAY) {
-      return err;
-    }
-
-    /* is the residue zero? */
-    if (res == 0) {
-      *result = 1;
-      return MP_OKAY;
-    }
-  }
-
-  return MP_OKAY;
-}
-
-/* End: bn_mp_prime_is_divisible.c */
-
-/* Start: bn_mp_prime_is_prime.c */
-/* LibTomMath, multiple-precision integer library -- Tom St Denis
- *
- * LibTomMath is library that provides for multiple-precision
- * integer arithmetic as well as number theoretic functionality.
- *
- * The library is designed directly after the MPI library by
- * Michael Fromberger but has been written from scratch with
- * additional optimizations in place.
- *
- * The library is free for all purposes without any express
- * guarantee it works.
- *
- * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
- */
-#include <tommath.h>
-
-/* performs a variable number of rounds of Miller-Rabin
- *
- * Probability of error after t rounds is no more than
- * (1/4)^t when 1 <= t <= 256
- *
- * Sets result to 1 if probably prime, 0 otherwise
- */
-int
-mp_prime_is_prime (mp_int * a, int t, int *result)
-{
-  mp_int  b;
-  int     ix, err, res;
-
-  /* default to no */
-  *result = 0;
-
-  /* valid value of t? */
-  if (t < 1 || t > 256) {
-    return MP_VAL;
-  }
-
-  /* first perform trial division */
-  if ((err = mp_prime_is_divisible (a, &res)) != MP_OKAY) {
-    return err;
-  }
-  if (res == 1) {
-    return MP_OKAY;
-  }
-
-  /* now perform the miller-rabin rounds */
-  if ((err = mp_init (&b)) != MP_OKAY) {
-    return err;
-  }
-
-  for (ix = 0; ix < t; ix++) {
-    /* set the prime */
-    mp_set (&b, __prime_tab[ix]);
-
-    if ((err = mp_prime_miller_rabin (a, &b, &res)) != MP_OKAY) {
-      goto __B;
-    }
-
-    if (res == 0) {
-      goto __B;
-    }
-  }
-
-  /* passed the test */
-  *result = 1;
-__B:mp_clear (&b);
-  return err;
-}
-
-/* End: bn_mp_prime_is_prime.c */
-
-/* Start: bn_mp_prime_miller_rabin.c */
-/* LibTomMath, multiple-precision integer library -- Tom St Denis
- *
- * LibTomMath is library that provides for multiple-precision
- * integer arithmetic as well as number theoretic functionality.
- *
- * The library is designed directly after the MPI library by
- * Michael Fromberger but has been written from scratch with
- * additional optimizations in place.
- *
- * The library is free for all purposes without any express
- * guarantee it works.
- *
- * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
- */
-#include <tommath.h>
-
-/* Miller-Rabin test of "a" to the base of "b" as described in 
- * HAC pp. 139 Algorithm 4.24
- *
- * Sets result to 0 if definitely composite or 1 if probably prime.
- * Randomly the chance of error is no more than 1/4 and often 
- * very much lower.
- */
-int
-mp_prime_miller_rabin (mp_int * a, mp_int * b, int *result)
-{
-  mp_int  n1, y, r;
-  int     s, j, err;
-
-  /* default */
-  *result = 0;
-
-  /* get n1 = a - 1 */
-  if ((err = mp_init_copy (&n1, a)) != MP_OKAY) {
-    return err;
-  }
-  if ((err = mp_sub_d (&n1, 1, &n1)) != MP_OKAY) {
-    goto __N1;
-  }
-
-  /* set 2^s * r = n1 */
-  if ((err = mp_init_copy (&r, &n1)) != MP_OKAY) {
-    goto __N1;
-  }
-  s = 0;
-  while (mp_iseven (&r) == 1) {
-    ++s;
-    if ((err = mp_div_2 (&r, &r)) != MP_OKAY) {
-      goto __R;
-    }
-  }
-
-  /* compute y = b^r mod a */
-  if ((err = mp_init (&y)) != MP_OKAY) {
-    goto __R;
-  }
-  if ((err = mp_exptmod (b, &r, a, &y)) != MP_OKAY) {
-    goto __Y;
-  }
-
-  /* if y != 1 and y != n1 do */
-  if (mp_cmp_d (&y, 1) != MP_EQ && mp_cmp (&y, &n1) != MP_EQ) {
-    j = 1;
-    /* while j <= s-1 and y != n1 */
-    while ((j <= (s - 1)) && mp_cmp (&y, &n1) != MP_EQ) {
-      if ((err = mp_sqrmod (&y, a, &y)) != MP_OKAY) {
-	goto __Y;
-      }
-
-      /* if y == 1 then composite */
-      if (mp_cmp_d (&y, 1) == MP_EQ) {
-	goto __Y;
-      }
-
-      ++j;
-    }
-
-    /* if y != n1 then composite */
-    if (mp_cmp (&y, &n1) != MP_EQ) {
-      goto __Y;
-    }
-  }
-
-  /* probably prime now */
-  *result = 1;
-__Y:mp_clear (&y);
-__R:mp_clear (&r);
-__N1:mp_clear (&n1);
-  return err;
-}
-
-/* End: bn_mp_prime_miller_rabin.c */
-
-/* Start: bn_mp_prime_next_prime.c */
-/* LibTomMath, multiple-precision integer library -- Tom St Denis
- *
- * LibTomMath is library that provides for multiple-precision
- * integer arithmetic as well as number theoretic functionality.
- *
- * The library is designed directly after the MPI library by
- * Michael Fromberger but has been written from scratch with
- * additional optimizations in place.
- *
- * The library is free for all purposes without any express
- * guarantee it works.
- *
- * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
- */
-#include <tommath.h>
-
-/* finds the next prime after the number "a" using "t" trials
- * of Miller-Rabin.
- */
-int mp_prime_next_prime(mp_int *a, int t)
-{
-   int err, res;
-   
-   if (mp_iseven(a) == 1) {
-      /* force odd */
-      if ((err = mp_add_d(a, 1, a)) != MP_OKAY) {
-         return err;
-      }
-   } else {
-      /* force to next number */
-      if ((err = mp_add_d(a, 2, a)) != MP_OKAY) {
-         return err;
-      }
-   }     
-   
-   for (;;) {
-      /* is this prime? */
-      if ((err = mp_prime_is_prime(a, t, &res)) != MP_OKAY) {
-         return err;
-      }
-      
-      if (res == 1) {
-         break;
-      }
-      
-      /* add two, next candidate */
-      if ((err = mp_add_d(a, 2, a)) != MP_OKAY) {
-         return err;
-      }
-   }
-   
-   return MP_OKAY;
-}
-
-
-/* End: bn_mp_prime_next_prime.c */
-
-/* Start: bn_mp_rand.c */
-/* LibTomMath, multiple-precision integer library -- Tom St Denis
- *
- * LibTomMath is library that provides for multiple-precision
- * integer arithmetic as well as number theoretic functionality.
- *
- * The library is designed directly after the MPI library by
- * Michael Fromberger but has been written from scratch with
- * additional optimizations in place.
- *
- * The library is free for all purposes without any express
- * guarantee it works.
- *
- * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
- */
-#include <tommath.h>
-
-/* makes a pseudo-random int of a given size */
-int
-mp_rand (mp_int * a, int digits)
-{
-  int     res;
-  mp_digit d;
-
-  mp_zero (a);
-  if (digits <= 0) {
-    return MP_OKAY;
-  }
-
-  /* first place a random non-zero digit */
-  do {
-    d = ((mp_digit) abs (rand ()));
-  } while (d == 0);
-
-  if ((res = mp_add_d (a, d, a)) != MP_OKAY) {
-    return res;
-  }
-
-  while (digits-- > 0) {
-    if ((res = mp_lshd (a, 1)) != MP_OKAY) {
-      return res;
-    }
-
-    if ((res = mp_add_d (a, ((mp_digit) abs (rand ())), a)) != MP_OKAY) {
-      return res;
-    }
-  }
-
-  return MP_OKAY;
-}
-
-/* End: bn_mp_rand.c */
-
-/* Start: bn_mp_read_signed_bin.c */
-/* LibTomMath, multiple-precision integer library -- Tom St Denis
- *
- * LibTomMath is library that provides for multiple-precision
- * integer arithmetic as well as number theoretic functionality.
- *
- * The library is designed directly after the MPI library by
- * Michael Fromberger but has been written from scratch with
- * additional optimizations in place.
- *
- * The library is free for all purposes without any express
- * guarantee it works.
- *
- * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
- */
-#include <tommath.h>
-
-/* read signed bin, big endian, first byte is 0==positive or 1==negative */
-int
-mp_read_signed_bin (mp_int * a, unsigned char *b, int c)
-{
-  int     res;
-
-  if ((res = mp_read_unsigned_bin (a, b + 1, c - 1)) != MP_OKAY) {
-    return res;
-  }
-  a->sign = ((b[0] == (unsigned char) 0) ? MP_ZPOS : MP_NEG);
-  return MP_OKAY;
-}
-
-/* End: bn_mp_read_signed_bin.c */
-
-/* Start: bn_mp_read_unsigned_bin.c */
-/* LibTomMath, multiple-precision integer library -- Tom St Denis
- *
- * LibTomMath is library that provides for multiple-precision
- * integer arithmetic as well as number theoretic functionality.
- *
- * The library is designed directly after the MPI library by
- * Michael Fromberger but has been written from scratch with
- * additional optimizations in place.
- *
- * The library is free for all purposes without any express
- * guarantee it works.
- *
- * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
- */
-#include <tommath.h>
-
-/* reads a unsigned char array, assumes the msb is stored first [big endian] */
-int
-mp_read_unsigned_bin (mp_int * a, unsigned char *b, int c)
-{
-  int     res;
-  mp_zero (a);
-  while (c-- > 0) {
-    if ((res = mp_mul_2d (a, 8, a)) != MP_OKAY) {
-      return res;
-    }
-
-    if (DIGIT_BIT != 7) {
-      a->dp[0] |= *b++;
-      a->used += 1;
-    } else {
-      a->dp[0] = (*b & MP_MASK);
-      a->dp[1] |= ((*b++ >> 7U) & 1);
-      a->used += 2;
-    }
-  }
-  mp_clamp (a);
-  return MP_OKAY;
-}
-
-/* End: bn_mp_read_unsigned_bin.c */
-
-/* Start: bn_mp_reduce.c */
-/* LibTomMath, multiple-precision integer library -- Tom St Denis
- *
- * LibTomMath is library that provides for multiple-precision
- * integer arithmetic as well as number theoretic functionality.
- *
- * The library is designed directly after the MPI library by
- * Michael Fromberger but has been written from scratch with
- * additional optimizations in place.
- *
- * The library is free for all purposes without any express
- * guarantee it works.
- *
- * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
- */
-#include <tommath.h>
-
-/* pre-calculate the value required for Barrett reduction
- * For a given modulus "b" it calulates the value required in "a"
- */
-int
-mp_reduce_setup (mp_int * a, mp_int * b)
-{
-  int     res;
-
-
-  if ((res = mp_2expt (a, b->used * 2 * DIGIT_BIT)) != MP_OKAY) {
-    return res;
-  }
-  res = mp_div (a, b, a, NULL);
-  return res;
-}
-
-/* reduces x mod m, assumes 0 < x < m^2, mu is precomputed via mp_reduce_setup 
- * From HAC pp.604 Algorithm 14.42 
- */
-int
-mp_reduce (mp_int * x, mp_int * m, mp_int * mu)
-{
-  mp_int  q;
-  int     res, um = m->used;
-
-
-  if ((res = mp_init_copy (&q, x)) != MP_OKAY) {
-    return res;
-  }
-
-  mp_rshd (&q, um - 1);		/* q1 = x / b^(k-1)  */
-
-  /* according to HAC this is optimization is ok */
-  if (((unsigned long) m->used) > (1UL << (unsigned long) (DIGIT_BIT - 1UL))) {
-    if ((res = mp_mul (&q, mu, &q)) != MP_OKAY) {
-      goto CLEANUP;
-    }
-  } else {
-    if ((res = s_mp_mul_high_digs (&q, mu, &q, um - 1)) != MP_OKAY) {
-      goto CLEANUP;
-    }
-  }
-
-  mp_rshd (&q, um + 1);		/* q3 = q2 / b^(k+1) */
-
-  /* x = x mod b^(k+1), quick (no division) */
-  if ((res = mp_mod_2d (x, DIGIT_BIT * (um + 1), x)) != MP_OKAY) {
-    goto CLEANUP;
-  }
-
-  /* q = q * m mod b^(k+1), quick (no division) */
-  if ((res = s_mp_mul_digs (&q, m, &q, um + 1)) != MP_OKAY) {
-    goto CLEANUP;
-  }
-
-  /* x = x - q */
-  if ((res = mp_sub (x, &q, x)) != MP_OKAY)
-    goto CLEANUP;
-
-  /* If x < 0, add b^(k+1) to it */
-  if (mp_cmp_d (x, 0) == MP_LT) {
-    mp_set (&q, 1);
-    if ((res = mp_lshd (&q, um + 1)) != MP_OKAY)
-      goto CLEANUP;
-    if ((res = mp_add (x, &q, x)) != MP_OKAY)
-      goto CLEANUP;
-  }
-
-  /* Back off if it's too big */
-  while (mp_cmp (x, m) != MP_LT) {
-    if ((res = s_mp_sub (x, m, x)) != MP_OKAY)
-      break;
-  }
-
-CLEANUP:
-  mp_clear (&q);
-
-  return res;
-}
-
-/* End: bn_mp_reduce.c */
-
-/* Start: bn_mp_rshd.c */
-/* LibTomMath, multiple-precision integer library -- Tom St Denis
- *
- * LibTomMath is library that provides for multiple-precision
- * integer arithmetic as well as number theoretic functionality.
- *
- * The library is designed directly after the MPI library by
- * Michael Fromberger but has been written from scratch with
- * additional optimizations in place.
- *
- * The library is free for all purposes without any express
- * guarantee it works.
- *
- * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
- */
-#include <tommath.h>
-
-/* shift right a certain amount of digits */
-void
-mp_rshd (mp_int * a, int b)
-{
-  int     x;
-
-  /* if b <= 0 then ignore it */
-  if (b <= 0) {
-    return;
-  }
-
-  /* if b > used then simply zero it and return */
-  if (a->used < b) {
-    mp_zero (a);
-    return;
-  }
-
-  {
-    register mp_digit *tmpa, *tmpaa;
-
-    /* shift the digits down */
-
-    /* base */
-    tmpa = a->dp;
-
-    /* offset into digits */
-    tmpaa = a->dp + b;
-
-    /* this is implemented as a sliding window where the window is b-digits long
-     * and digits from the top of the window are copied to the bottom
-     *
-     * e.g.
-
-     b-2 | b-1 | b0 | b1 | b2 | ... | bb |   ---->
-                 /\                   |      ---->
-                  \-------------------/      ---->
-     */
-    for (x = 0; x < (a->used - b); x++) {
-      *tmpa++ = *tmpaa++;
-    }
-
-    /* zero the top digits */
-    for (; x < a->used; x++) {
-      *tmpa++ = 0;
-    }
-  }
-  mp_clamp (a);
-}
-
-/* End: bn_mp_rshd.c */
-
-/* Start: bn_mp_set.c */
-/* LibTomMath, multiple-precision integer library -- Tom St Denis
- *
- * LibTomMath is library that provides for multiple-precision
- * integer arithmetic as well as number theoretic functionality.
- *
- * The library is designed directly after the MPI library by
- * Michael Fromberger but has been written from scratch with
- * additional optimizations in place.
- *
- * The library is free for all purposes without any express
- * guarantee it works.
- *
- * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
- */
-#include <tommath.h>
-
-/* set to a digit */
-void
-mp_set (mp_int * a, mp_digit b)
-{
-  mp_zero (a);
-  a->dp[0] = b & MP_MASK;
-  a->used = (a->dp[0] != 0) ? 1 : 0;
-}
-
-/* End: bn_mp_set.c */
-
-/* Start: bn_mp_set_int.c */
-/* LibTomMath, multiple-precision integer library -- Tom St Denis
- *
- * LibTomMath is library that provides for multiple-precision
- * integer arithmetic as well as number theoretic functionality.
- *
- * The library is designed directly after the MPI library by
- * Michael Fromberger but has been written from scratch with
- * additional optimizations in place.
- *
- * The library is free for all purposes without any express
- * guarantee it works.
- *
- * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
- */
-#include <tommath.h>
-
-/* set a 32-bit const */
-int
-mp_set_int (mp_int * a, unsigned long b)
-{
-  int     x, res;
-
-  mp_zero (a);
-
-  /* set four bits at a time, simplest solution to the what if DIGIT_BIT==7 case */
-  for (x = 0; x < 8; x++) {
-
-    /* shift the number up four bits */
-    if ((res = mp_mul_2d (a, 4, a)) != MP_OKAY) {
-      return res;
-    }
-
-    /* OR in the top four bits of the source */
-    a->dp[0] |= (b >> 28) & 15;
-
-    /* shift the source up to the next four bits */
-    b <<= 4;
-
-    /* ensure that digits are not clamped off */
-    a->used += 32 / DIGIT_BIT + 1;
-  }
-
-  mp_clamp (a);
-  return MP_OKAY;
-}
-
-/* End: bn_mp_set_int.c */
-
-/* Start: bn_mp_shrink.c */
-/* LibTomMath, multiple-precision integer library -- Tom St Denis
- *
- * LibTomMath is library that provides for multiple-precision
- * integer arithmetic as well as number theoretic functionality.
- *
- * The library is designed directly after the MPI library by
- * Michael Fromberger but has been written from scratch with
- * additional optimizations in place.
- *
- * The library is free for all purposes without any express
- * guarantee it works.
- *
- * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
- */
-#include <tommath.h>
-
-/* shrink a bignum */
-int
-mp_shrink (mp_int * a)
-{
-  if (a->alloc != a->used) {
-    if ((a->dp = OPT_CAST realloc (a->dp, sizeof (mp_digit) * a->used)) == NULL) {
-      return MP_MEM;
-    }
-    a->alloc = a->used;
-  }
-  return MP_OKAY;
-}
-
-/* End: bn_mp_shrink.c */
-
-/* Start: bn_mp_signed_bin_size.c */
-/* LibTomMath, multiple-precision integer library -- Tom St Denis
- *
- * LibTomMath is library that provides for multiple-precision
- * integer arithmetic as well as number theoretic functionality.
- *
- * The library is designed directly after the MPI library by
- * Michael Fromberger but has been written from scratch with
- * additional optimizations in place.
- *
- * The library is free for all purposes without any express
- * guarantee it works.
- *
- * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
- */
-#include <tommath.h>
-
-/* get the size for an signed equivalent */
-int
-mp_signed_bin_size (mp_int * a)
-{
-  return 1 + mp_unsigned_bin_size (a);
-}
-
-/* End: bn_mp_signed_bin_size.c */
-
-/* Start: bn_mp_sqr.c */
-/* LibTomMath, multiple-precision integer library -- Tom St Denis
- *
- * LibTomMath is library that provides for multiple-precision
- * integer arithmetic as well as number theoretic functionality.
- *
- * The library is designed directly after the MPI library by
- * Michael Fromberger but has been written from scratch with
- * additional optimizations in place.
- *
- * The library is free for all purposes without any express
- * guarantee it works.
- *
- * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
- */
-#include <tommath.h>
-
-/* computes b = a*a */
-int
-mp_sqr (mp_int * a, mp_int * b)
-{
-  int     res;
-  if (a->used > KARATSUBA_SQR_CUTOFF) {
-    res = mp_karatsuba_sqr (a, b);
-  } else {
-
-    /* can we use the fast multiplier? */
-    if (((a->used * 2 + 1) < 512)
-	&& a->used < (1 << ((CHAR_BIT * sizeof (mp_word)) - (2 * DIGIT_BIT) - 1))) {
-      res = fast_s_mp_sqr (a, b);
-    } else {
-      res = s_mp_sqr (a, b);
-    }
-  }
-  b->sign = MP_ZPOS;
-  return res;
-}
-
-/* End: bn_mp_sqr.c */
-
-/* Start: bn_mp_sqrmod.c */
-/* LibTomMath, multiple-precision integer library -- Tom St Denis
- *
- * LibTomMath is library that provides for multiple-precision
- * integer arithmetic as well as number theoretic functionality.
- *
- * The library is designed directly after the MPI library by
- * Michael Fromberger but has been written from scratch with
- * additional optimizations in place.
- *
- * The library is free for all purposes without any express
- * guarantee it works.
- *
- * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
- */
-#include <tommath.h>
-
-/* c = a * a (mod b) */
-int
-mp_sqrmod (mp_int * a, mp_int * b, mp_int * c)
-{
-  int     res;
-  mp_int  t;
-
-
-  if ((res = mp_init (&t)) != MP_OKAY) {
-    return res;
-  }
-
-  if ((res = mp_sqr (a, &t)) != MP_OKAY) {
-    mp_clear (&t);
-    return res;
-  }
-  res = mp_mod (&t, b, c);
-  mp_clear (&t);
-  return res;
-}
-
-/* End: bn_mp_sqrmod.c */
-
-/* Start: bn_mp_sub.c */
-/* LibTomMath, multiple-precision integer library -- Tom St Denis
- *
- * LibTomMath is library that provides for multiple-precision
- * integer arithmetic as well as number theoretic functionality.
- *
- * The library is designed directly after the MPI library by
- * Michael Fromberger but has been written from scratch with
- * additional optimizations in place.
- *
- * The library is free for all purposes without any express
- * guarantee it works.
- *
- * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
- */
-#include <tommath.h>
-
-/* high level subtraction (handles signs) */
-int
-mp_sub (mp_int * a, mp_int * b, mp_int * c)
-{
-  int     sa, sb, res;
-
-
-  sa = a->sign;
-  sb = b->sign;
-
-  /* handle four cases */
-  if (sa == MP_ZPOS && sb == MP_ZPOS) {
-    /* both positive, a - b, but if b>a then we do -(b - a) */
-    if (mp_cmp_mag (a, b) == MP_LT) {
-      /* b>a */
-      res = s_mp_sub (b, a, c);
-      c->sign = MP_NEG;
-    } else {
-      res = s_mp_sub (a, b, c);
-      c->sign = MP_ZPOS;
-    }
-  } else if (sa == MP_ZPOS && sb == MP_NEG) {
-    /* a - -b == a + b  */
-    res = s_mp_add (a, b, c);
-    c->sign = MP_ZPOS;
-  } else if (sa == MP_NEG && sb == MP_ZPOS) {
-    /* -a - b == -(a + b) */
-    res = s_mp_add (a, b, c);
-    c->sign = MP_NEG;
-  } else {
-    /* -a - -b == b - a, but if a>b == -(a - b) */
-    if (mp_cmp_mag (a, b) == MP_GT) {
-      res = s_mp_sub (a, b, c);
-      c->sign = MP_NEG;
-    } else {
-      res = s_mp_sub (b, a, c);
-      c->sign = MP_ZPOS;
-    }
-  }
-
-  return res;
-}
-
-/* End: bn_mp_sub.c */
-
-/* Start: bn_mp_submod.c */
-/* LibTomMath, multiple-precision integer library -- Tom St Denis
- *
- * LibTomMath is library that provides for multiple-precision
- * integer arithmetic as well as number theoretic functionality.
- *
- * The library is designed directly after the MPI library by
- * Michael Fromberger but has been written from scratch with
- * additional optimizations in place.
- *
- * The library is free for all purposes without any express
- * guarantee it works.
- *
- * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
- */
-#include <tommath.h>
-
-/* d = a - b (mod c) */
-int
-mp_submod (mp_int * a, mp_int * b, mp_int * c, mp_int * d)
-{
-  int     res;
-  mp_int  t;
-
-
-  if ((res = mp_init (&t)) != MP_OKAY) {
-    return res;
-  }
-
-  if ((res = mp_sub (a, b, &t)) != MP_OKAY) {
-    mp_clear (&t);
-    return res;
-  }
-  res = mp_mod (&t, c, d);
-  mp_clear (&t);
-  return res;
-}
-
-/* End: bn_mp_submod.c */
-
-/* Start: bn_mp_sub_d.c */
-/* LibTomMath, multiple-precision integer library -- Tom St Denis
- *
- * LibTomMath is library that provides for multiple-precision
- * integer arithmetic as well as number theoretic functionality.
- *
- * The library is designed directly after the MPI library by
- * Michael Fromberger but has been written from scratch with
- * additional optimizations in place.
- *
- * The library is free for all purposes without any express
- * guarantee it works.
- *
- * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
- */
-#include <tommath.h>
-
-/* single digit subtraction */
-int
-mp_sub_d (mp_int * a, mp_digit b, mp_int * c)
-{
-  mp_int  t;
-  int     res;
-
-
-  if ((res = mp_init (&t)) != MP_OKAY) {
-    return res;
-  }
-  mp_set (&t, b);
-  res = mp_sub (a, &t, c);
-
-  mp_clear (&t);
-  return res;
-}
-
-/* End: bn_mp_sub_d.c */
-
-/* Start: bn_mp_to_signed_bin.c */
-/* LibTomMath, multiple-precision integer library -- Tom St Denis
- *
- * LibTomMath is library that provides for multiple-precision
- * integer arithmetic as well as number theoretic functionality.
- *
- * The library is designed directly after the MPI library by
- * Michael Fromberger but has been written from scratch with
- * additional optimizations in place.
- *
- * The library is free for all purposes without any express
- * guarantee it works.
- *
- * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
- */
-#include <tommath.h>
-
-/* store in signed [big endian] format */
-int
-mp_to_signed_bin (mp_int * a, unsigned char *b)
-{
-  int     res;
-
-  if ((res = mp_to_unsigned_bin (a, b + 1)) != MP_OKAY) {
-    return res;
-  }
-  b[0] = (unsigned char) ((a->sign == MP_ZPOS) ? 0 : 1);
-  return MP_OKAY;
-}
-
-/* End: bn_mp_to_signed_bin.c */
-
-/* Start: bn_mp_to_unsigned_bin.c */
-/* LibTomMath, multiple-precision integer library -- Tom St Denis
- *
- * LibTomMath is library that provides for multiple-precision
- * integer arithmetic as well as number theoretic functionality.
- *
- * The library is designed directly after the MPI library by
- * Michael Fromberger but has been written from scratch with
- * additional optimizations in place.
- *
- * The library is free for all purposes without any express
- * guarantee it works.
- *
- * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
- */
-#include <tommath.h>
-
-/* store in unsigned [big endian] format */
-int
-mp_to_unsigned_bin (mp_int * a, unsigned char *b)
-{
-  int     x, res;
-  mp_int  t;
-
-  if ((res = mp_init_copy (&t, a)) != MP_OKAY) {
-    return res;
-  }
-
-  x = 0;
-  while (mp_iszero (&t) == 0) {
-    if (DIGIT_BIT != 7) {
-      b[x++] = (unsigned char) (t.dp[0] & 255);
-    } else {
-      b[x++] = (unsigned char) (t.dp[0] | ((t.dp[1] & 0x01) << 7));
-    }
-    if ((res = mp_div_2d (&t, 8, &t, NULL)) != MP_OKAY) {
-      mp_clear (&t);
-      return res;
-    }
-  }
-  bn_reverse (b, x);
-  mp_clear (&t);
-  return MP_OKAY;
-}
-
-/* End: bn_mp_to_unsigned_bin.c */
-
-/* Start: bn_mp_unsigned_bin_size.c */
-/* LibTomMath, multiple-precision integer library -- Tom St Denis
- *
- * LibTomMath is library that provides for multiple-precision
- * integer arithmetic as well as number theoretic functionality.
- *
- * The library is designed directly after the MPI library by
- * Michael Fromberger but has been written from scratch with
- * additional optimizations in place.
- *
- * The library is free for all purposes without any express
- * guarantee it works.
- *
- * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
- */
-#include <tommath.h>
-
-/* get the size for an unsigned equivalent */
-int
-mp_unsigned_bin_size (mp_int * a)
-{
-  int     size = mp_count_bits (a);
-  return (size / 8 + ((size & 7) != 0 ? 1 : 0));
-}
-
-/* End: bn_mp_unsigned_bin_size.c */
-
-/* Start: bn_mp_xor.c */
-/* LibTomMath, multiple-precision integer library -- Tom St Denis
- *
- * LibTomMath is library that provides for multiple-precision
- * integer arithmetic as well as number theoretic functionality.
- *
- * The library is designed directly after the MPI library by
- * Michael Fromberger but has been written from scratch with
- * additional optimizations in place.
- *
- * The library is free for all purposes without any express
- * guarantee it works.
- *
- * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
- */
-#include <tommath.h>
-
-/* XOR two ints together */
-int
-mp_xor (mp_int * a, mp_int * b, mp_int * c)
-{
-  int     res, ix, px;
-  mp_int  t, *x;
-
-  if (a->used > b->used) {
-    if ((res = mp_init_copy (&t, a)) != MP_OKAY) {
-      return res;
-    }
-    px = b->used;
-    x = b;
-  } else {
-    if ((res = mp_init_copy (&t, b)) != MP_OKAY) {
-      return res;
-    }
-    px = a->used;
-    x = a;
-  }
-
-  for (ix = 0; ix < px; ix++) {
-    t.dp[ix] ^= x->dp[ix];
-  }
-  mp_clamp (&t);
-  mp_exch (c, &t);
-  mp_clear (&t);
-  return MP_OKAY;
-}
-
-/* End: bn_mp_xor.c */
-
-/* Start: bn_mp_zero.c */
-/* LibTomMath, multiple-precision integer library -- Tom St Denis
- *
- * LibTomMath is library that provides for multiple-precision
- * integer arithmetic as well as number theoretic functionality.
- *
- * The library is designed directly after the MPI library by
- * Michael Fromberger but has been written from scratch with
- * additional optimizations in place.
- *
- * The library is free for all purposes without any express
- * guarantee it works.
- *
- * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
- */
-#include <tommath.h>
-
-/* set to zero */
-void
-mp_zero (mp_int * a)
-{
-  a->sign = MP_ZPOS;
-  a->used = 0;
-  memset (a->dp, 0, sizeof (mp_digit) * a->alloc);
-}
-
-/* End: bn_mp_zero.c */
-
-/* Start: bn_prime_tab.c */
-/* LibTomMath, multiple-precision integer library -- Tom St Denis
- *
- * LibTomMath is library that provides for multiple-precision
- * integer arithmetic as well as number theoretic functionality.
- *
- * The library is designed directly after the MPI library by
- * Michael Fromberger but has been written from scratch with
- * additional optimizations in place.
- *
- * The library is free for all purposes without any express
- * guarantee it works.
- *
- * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
- */
-#include <tommath.h>
-const mp_digit __prime_tab[] = {
-  0x0002, 0x0003, 0x0005, 0x0007, 0x000B, 0x000D, 0x0011, 0x0013,
-  0x0017, 0x001D, 0x001F, 0x0025, 0x0029, 0x002B, 0x002F, 0x0035,
-  0x003B, 0x003D, 0x0043, 0x0047, 0x0049, 0x004F, 0x0053, 0x0059,
-  0x0061, 0x0065, 0x0067, 0x006B, 0x006D, 0x0071, 0x007F, 0x0083,
-  0x0089, 0x008B, 0x0095, 0x0097, 0x009D, 0x00A3, 0x00A7, 0x00AD,
-  0x00B3, 0x00B5, 0x00BF, 0x00C1, 0x00C5, 0x00C7, 0x00D3, 0x00DF,
-  0x00E3, 0x00E5, 0x00E9, 0x00EF, 0x00F1, 0x00FB, 0x0101, 0x0107,
-  0x010D, 0x010F, 0x0115, 0x0119, 0x011B, 0x0125, 0x0133, 0x0137,
-
-  0x0139, 0x013D, 0x014B, 0x0151, 0x015B, 0x015D, 0x0161, 0x0167,
-  0x016F, 0x0175, 0x017B, 0x017F, 0x0185, 0x018D, 0x0191, 0x0199,
-  0x01A3, 0x01A5, 0x01AF, 0x01B1, 0x01B7, 0x01BB, 0x01C1, 0x01C9,
-  0x01CD, 0x01CF, 0x01D3, 0x01DF, 0x01E7, 0x01EB, 0x01F3, 0x01F7,
-  0x01FD, 0x0209, 0x020B, 0x021D, 0x0223, 0x022D, 0x0233, 0x0239,
-  0x023B, 0x0241, 0x024B, 0x0251, 0x0257, 0x0259, 0x025F, 0x0265,
-  0x0269, 0x026B, 0x0277, 0x0281, 0x0283, 0x0287, 0x028D, 0x0293,
-  0x0295, 0x02A1, 0x02A5, 0x02AB, 0x02B3, 0x02BD, 0x02C5, 0x02CF,
-
-  0x02D7, 0x02DD, 0x02E3, 0x02E7, 0x02EF, 0x02F5, 0x02F9, 0x0301,
-  0x0305, 0x0313, 0x031D, 0x0329, 0x032B, 0x0335, 0x0337, 0x033B,
-  0x033D, 0x0347, 0x0355, 0x0359, 0x035B, 0x035F, 0x036D, 0x0371,
-  0x0373, 0x0377, 0x038B, 0x038F, 0x0397, 0x03A1, 0x03A9, 0x03AD,
-  0x03B3, 0x03B9, 0x03C7, 0x03CB, 0x03D1, 0x03D7, 0x03DF, 0x03E5,
-  0x03F1, 0x03F5, 0x03FB, 0x03FD, 0x0407, 0x0409, 0x040F, 0x0419,
-  0x041B, 0x0425, 0x0427, 0x042D, 0x043F, 0x0443, 0x0445, 0x0449,
-  0x044F, 0x0455, 0x045D, 0x0463, 0x0469, 0x047F, 0x0481, 0x048B,
-
-  0x0493, 0x049D, 0x04A3, 0x04A9, 0x04B1, 0x04BD, 0x04C1, 0x04C7,
-  0x04CD, 0x04CF, 0x04D5, 0x04E1, 0x04EB, 0x04FD, 0x04FF, 0x0503,
-  0x0509, 0x050B, 0x0511, 0x0515, 0x0517, 0x051B, 0x0527, 0x0529,
-  0x052F, 0x0551, 0x0557, 0x055D, 0x0565, 0x0577, 0x0581, 0x058F,
-  0x0593, 0x0595, 0x0599, 0x059F, 0x05A7, 0x05AB, 0x05AD, 0x05B3,
-  0x05BF, 0x05C9, 0x05CB, 0x05CF, 0x05D1, 0x05D5, 0x05DB, 0x05E7,
-  0x05F3, 0x05FB, 0x0607, 0x060D, 0x0611, 0x0617, 0x061F, 0x0623,
-  0x062B, 0x062F, 0x063D, 0x0641, 0x0647, 0x0649, 0x064D, 0x0653
-};
-
-/* End: bn_prime_tab.c */
-
-/* Start: bn_radix.c */
-/* LibTomMath, multiple-precision integer library -- Tom St Denis
- *
- * LibTomMath is library that provides for multiple-precision
- * integer arithmetic as well as number theoretic functionality.
- *
- * The library is designed directly after the MPI library by
- * Michael Fromberger but has been written from scratch with
- * additional optimizations in place.
- *
- * The library is free for all purposes without any express
- * guarantee it works.
- *
- * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
- */
-#include <tommath.h>
-
-/* chars used in radix conversions */
-static const char *s_rmap = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz+/";
-
-/* read a string [ASCII] in a given radix */
-int
-mp_read_radix (mp_int * a, char *str, int radix)
-{
-  int     y, res, neg;
-  char    ch;
-
-  if (radix < 2 || radix > 64) {
-    return MP_VAL;
-  }
-
-  if (*str == '-') {
-    ++str;
-    neg = MP_NEG;
-  } else {
-    neg = MP_ZPOS;
-  }
-
-  mp_zero (a);
-  while (*str) {
-    ch = (char) ((radix < 36) ? toupper (*str) : *str);
-    for (y = 0; y < 64; y++) {
-      if (ch == s_rmap[y]) {
-	break;
-      }
-    }
-
-    if (y < radix) {
-      if ((res = mp_mul_d (a, (mp_digit) radix, a)) != MP_OKAY) {
-	return res;
-      }
-      if ((res = mp_add_d (a, (mp_digit) y, a)) != MP_OKAY) {
-	return res;
-      }
-    } else {
-      break;
-    }
-    ++str;
-  }
-  a->sign = neg;
-  return MP_OKAY;
-}
-
-/* stores a bignum as a ASCII string in a given radix (2..64) */
-int
-mp_toradix (mp_int * a, char *str, int radix)
-{
-  int     res, digs;
-  mp_int  t;
-  mp_digit d;
-  char   *_s = str;
-
-  if (radix < 2 || radix > 64) {
-    return MP_VAL;
-  }
-
-  if ((res = mp_init_copy (&t, a)) != MP_OKAY) {
-    return res;
-  }
-
-  if (t.sign == MP_NEG) {
-    ++_s;
-    *str++ = '-';
-    t.sign = MP_ZPOS;
-  }
-
-  digs = 0;
-  while (mp_iszero (&t) == 0) {
-    if ((res = mp_div_d (&t, (mp_digit) radix, &t, &d)) != MP_OKAY) {
-      mp_clear (&t);
-      return res;
-    }
-    *str++ = s_rmap[d];
-    ++digs;
-  }
-  bn_reverse ((unsigned char *)_s, digs);
-  *str++ = '\0';
-  mp_clear (&t);
-  return MP_OKAY;
-}
-
-/* returns size of ASCII reprensentation */
-int
-mp_radix_size (mp_int * a, int radix)
-{
-  int     res, digs;
-  mp_int  t;
-  mp_digit d;
-
-  /* special case for binary */
-  if (radix == 2) {
-    return mp_count_bits (a) + (a->sign == MP_NEG ? 1 : 0) + 1;
-  }
-
-  if (radix < 2 || radix > 64) {
-    return 0;
-  }
-
-  if ((res = mp_init_copy (&t, a)) != MP_OKAY) {
-    return 0;
-  }
-
-  digs = 0;
-  if (t.sign == MP_NEG) {
-    ++digs;
-    t.sign = MP_ZPOS;
-  }
-
-  while (mp_iszero (&t) == 0) {
-    if ((res = mp_div_d (&t, (mp_digit) radix, &t, &d)) != MP_OKAY) {
-      mp_clear (&t);
-      return 0;
-    }
-    ++digs;
-  }
-  mp_clear (&t);
-  return digs + 1;
-}
-
-/* End: bn_radix.c */
-
-/* Start: bn_reverse.c */
-/* LibTomMath, multiple-precision integer library -- Tom St Denis
- *
- * LibTomMath is library that provides for multiple-precision
- * integer arithmetic as well as number theoretic functionality.
- *
- * The library is designed directly after the MPI library by
- * Michael Fromberger but has been written from scratch with
- * additional optimizations in place.
- *
- * The library is free for all purposes without any express
- * guarantee it works.
- *
- * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
- */
-#include <tommath.h>
-
-/* reverse an array, used for radix code */
-void
-bn_reverse (unsigned char *s, int len)
-{
-  int     ix, iy;
-  unsigned char t;
-
-  ix = 0;
-  iy = len - 1;
-  while (ix < iy) {
-    t = s[ix];
-    s[ix] = s[iy];
-    s[iy] = t;
-    ++ix;
-    --iy;
-  }
-}
-
-/* End: bn_reverse.c */
-
-/* Start: bn_s_mp_add.c */
-/* LibTomMath, multiple-precision integer library -- Tom St Denis
- *
- * LibTomMath is library that provides for multiple-precision
- * integer arithmetic as well as number theoretic functionality.
- *
- * The library is designed directly after the MPI library by
- * Michael Fromberger but has been written from scratch with
- * additional optimizations in place.
- *
- * The library is free for all purposes without any express
- * guarantee it works.
- *
- * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
- */
-#include <tommath.h>
-
-/* low level addition, based on HAC pp.594, Algorithm 14.7 */
-int
-s_mp_add (mp_int * a, mp_int * b, mp_int * c)
-{
-  mp_int *x;
-  int     olduse, res, min, max;
-
-  /* find sizes, we let |a| <= |b| which means we have to sort
-   * them.  "x" will point to the input with the most digits
-   */
-  if (a->used > b->used) {
-    min = b->used;
-    max = a->used;
-    x = a;
-  } else if (a->used < b->used) {
-    min = a->used;
-    max = b->used;
-    x = b;
-  } else {
-    min = max = a->used;
-    x = NULL;
-  }
-
-  /* init result */
-  if (c->alloc < max + 1) {
-    if ((res = mp_grow (c, max + 1)) != MP_OKAY) {
-      return res;
-    }
-  }
-
-  olduse = c->used;
-  c->used = max + 1;
-
-  /* add digits from lower part */
-
-  /* set the carry to zero */
-  {
-    register mp_digit u, *tmpa, *tmpb, *tmpc;
-    register int i;
-
-    /* alias for digit pointers */
-
-    /* first input */
-    tmpa = a->dp;
-
-    /* second input */
-    tmpb = b->dp;
-
-    /* destination */
-    tmpc = c->dp;
-
-    u = 0;
-    for (i = 0; i < min; i++) {
-      /* Compute the sum at one digit, T[i] = A[i] + B[i] + U */
-      *tmpc = *tmpa++ + *tmpb++ + u;
-
-      /* U = carry bit of T[i] */
-      u = *tmpc >> DIGIT_BIT;
-
-      /* take away carry bit from T[i] */
-      *tmpc++ &= MP_MASK;
-    }
-
-    /* now copy higher words if any, that is in A+B if A or B has more digits add those in */
-    if (min != max) {
-      for (; i < max; i++) {
-	/* T[i] = X[i] + U */
-	*tmpc = x->dp[i] + u;
-
-	/* U = carry bit of T[i] */
-	u = *tmpc >> DIGIT_BIT;
-
-	/* take away carry bit from T[i] */
-	*tmpc++ &= MP_MASK;
-      }
-    }
-
-    /* add carry */
-    *tmpc++ = u;
-
-    /* clear digits above used (since we may not have grown result above) */
-    for (i = c->used; i < olduse; i++) {
-      *tmpc++ = 0;
-    }
-  }
-
-  mp_clamp (c);
-  return MP_OKAY;
-}
-
-/* End: bn_s_mp_add.c */
-
-/* Start: bn_s_mp_mul_digs.c */
-/* LibTomMath, multiple-precision integer library -- Tom St Denis
- *
- * LibTomMath is library that provides for multiple-precision
- * integer arithmetic as well as number theoretic functionality.
- *
- * The library is designed directly after the MPI library by
- * Michael Fromberger but has been written from scratch with
- * additional optimizations in place.
- *
- * The library is free for all purposes without any express
- * guarantee it works.
- *
- * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
- */
-#include <tommath.h>
-
-/* multiplies |a| * |b| and only computes upto digs digits of result
- * HAC pp. 595, Algorithm 14.12  Modified so you can control how many digits of 
- * output are created.  
- */
-int
-s_mp_mul_digs (mp_int * a, mp_int * b, mp_int * c, int digs)
-{
-  mp_int  t;
-  int     res, pa, pb, ix, iy;
-  mp_digit u;
-  mp_word r;
-  mp_digit tmpx, *tmpt, *tmpy;
-
-  if ((res = mp_init_size (&t, digs)) != MP_OKAY) {
-    return res;
-  }
-  t.used = digs;
-
-  /* compute the digits of the product directly */
-  pa = a->used;
-  for (ix = 0; ix < pa; ix++) {
-    /* set the carry to zero */
-    u = 0;
-
-    /* limit ourselves to making digs digits of output */
-    pb = MIN (b->used, digs - ix);
-
-    /* setup some aliases */
-    tmpx = a->dp[ix];
-    tmpt = &(t.dp[ix]);
-    tmpy = b->dp;
-
-    /* compute the columns of the output and propagate the carry */
-    for (iy = 0; iy < pb; iy++) {
-      /* compute the column as a mp_word */
-      r = ((mp_word) * tmpt) + ((mp_word) tmpx) * ((mp_word) * tmpy++) + ((mp_word) u);
-
-      /* the new column is the lower part of the result */
-      *tmpt++ = (mp_digit) (r & ((mp_word) MP_MASK));
-
-      /* get the carry word from the result */
-      u = (mp_digit) (r >> ((mp_word) DIGIT_BIT));
-    }
-    if (ix + iy < digs)
-      *tmpt = u;
-  }
-
-  mp_clamp (&t);
-  mp_exch (&t, c);
-
-  mp_clear (&t);
-  return MP_OKAY;
-}
-
-/* End: bn_s_mp_mul_digs.c */
-
-/* Start: bn_s_mp_mul_high_digs.c */
-/* LibTomMath, multiple-precision integer library -- Tom St Denis
- *
- * LibTomMath is library that provides for multiple-precision
- * integer arithmetic as well as number theoretic functionality.
- *
- * The library is designed directly after the MPI library by
- * Michael Fromberger but has been written from scratch with
- * additional optimizations in place.
- *
- * The library is free for all purposes without any express
- * guarantee it works.
- *
- * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
- */
-#include <tommath.h>
-
-/* multiplies |a| * |b| and does not compute the lower digs digits 
- * [meant to get the higher part of the product]
- */
-int
-s_mp_mul_high_digs (mp_int * a, mp_int * b, mp_int * c, int digs)
-{
-  mp_int  t;
-  int     res, pa, pb, ix, iy;
-  mp_digit u;
-  mp_word r;
-  mp_digit tmpx, *tmpt, *tmpy;
-
-
-  /* can we use the fast multiplier? */
-  if (((a->used + b->used + 1) < 512)
-      && MAX (a->used, b->used) < (1 << ((CHAR_BIT * sizeof (mp_word)) - (2 * DIGIT_BIT)))) {
-    return fast_s_mp_mul_high_digs (a, b, c, digs);
-  }
-
-  if ((res = mp_init_size (&t, a->used + b->used + 1)) != MP_OKAY) {
-    return res;
-  }
-  t.used = a->used + b->used + 1;
-
-  pa = a->used;
-  pb = b->used;
-  for (ix = 0; ix < pa; ix++) {
-    /* clear the carry */
-    u = 0;
-
-    /* left hand side of A[ix] * B[iy] */
-    tmpx = a->dp[ix];
-
-    /* alias to the address of where the digits will be stored */
-    tmpt = &(t.dp[digs]);
-
-    /* alias for where to read the right hand side from */
-    tmpy = b->dp + (digs - ix);
-
-    for (iy = digs - ix; iy < pb; iy++) {
-      /* calculate the double precision result */
-      r = ((mp_word) * tmpt) + ((mp_word) tmpx) * ((mp_word) * tmpy++) + ((mp_word) u);
-
-      /* get the lower part */
-      *tmpt++ = (mp_digit) (r & ((mp_word) MP_MASK));
-
-      /* carry the carry */
-      u = (mp_digit) (r >> ((mp_word) DIGIT_BIT));
-    }
-    *tmpt = u;
-  }
-  mp_clamp (&t);
-  mp_exch (&t, c);
-  mp_clear (&t);
-  return MP_OKAY;
-}
-
-/* End: bn_s_mp_mul_high_digs.c */
-
-/* Start: bn_s_mp_sqr.c */
-/* LibTomMath, multiple-precision integer library -- Tom St Denis
- *
- * LibTomMath is library that provides for multiple-precision
- * integer arithmetic as well as number theoretic functionality.
- *
- * The library is designed directly after the MPI library by
- * Michael Fromberger but has been written from scratch with
- * additional optimizations in place.
- *
- * The library is free for all purposes without any express
- * guarantee it works.
- *
- * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
- */
-#include <tommath.h>
-
-/* low level squaring, b = a*a, HAC pp.596-597, Algorithm 14.16 */
-int
-s_mp_sqr (mp_int * a, mp_int * b)
-{
-  mp_int  t;
-  int     res, ix, iy, pa;
-  mp_word r, u;
-  mp_digit tmpx, *tmpt;
-
-  pa = a->used;
-  if ((res = mp_init_size (&t, pa + pa + 1)) != MP_OKAY) {
-    return res;
-  }
-  t.used = pa + pa + 1;
-
-  for (ix = 0; ix < pa; ix++) {
-    /* first calculate the digit at 2*ix */
-    /* calculate double precision result */
-    r = ((mp_word) t.dp[ix + ix]) + ((mp_word) a->dp[ix]) * ((mp_word) a->dp[ix]);
-
-    /* store lower part in result */
-    t.dp[ix + ix] = (mp_digit) (r & ((mp_word) MP_MASK));
-
-    /* get the carry */
-    u = (r >> ((mp_word) DIGIT_BIT));
-
-    /* left hand side of A[ix] * A[iy] */
-    tmpx = a->dp[ix];
-
-    /* alias for where to store the results */
-    tmpt = &(t.dp[ix + ix + 1]);
-    for (iy = ix + 1; iy < pa; iy++) {
-      /* first calculate the product */
-      r = ((mp_word) tmpx) * ((mp_word) a->dp[iy]);
-
-      /* now calculate the double precision result, note we use
-       * addition instead of *2 since its easier to optimize
-       */
-      r = ((mp_word) * tmpt) + r + r + ((mp_word) u);
-
-      /* store lower part */
-      *tmpt++ = (mp_digit) (r & ((mp_word) MP_MASK));
-
-      /* get carry */
-      u = (r >> ((mp_word) DIGIT_BIT));
-    }
-    r = ((mp_word) * tmpt) + u;
-    *tmpt = (mp_digit) (r & ((mp_word) MP_MASK));
-    u = (r >> ((mp_word) DIGIT_BIT));
-    /* propagate upwards */
-    ++tmpt;
-    while (u != ((mp_word) 0)) {
-      r = ((mp_word) * tmpt) + ((mp_word) 1);
-      *tmpt++ = (mp_digit) (r & ((mp_word) MP_MASK));
-      u = (r >> ((mp_word) DIGIT_BIT));
-    }
-  }
-
-  mp_clamp (&t);
-  mp_exch (&t, b);
-  mp_clear (&t);
-  return MP_OKAY;
-}
-
-/* End: bn_s_mp_sqr.c */
-
-/* Start: bn_s_mp_sub.c */
-/* LibTomMath, multiple-precision integer library -- Tom St Denis
- *
- * LibTomMath is library that provides for multiple-precision
- * integer arithmetic as well as number theoretic functionality.
- *
- * The library is designed directly after the MPI library by
- * Michael Fromberger but has been written from scratch with
- * additional optimizations in place.
- *
- * The library is free for all purposes without any express
- * guarantee it works.
- *
- * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
- */
-#include <tommath.h>
-
-/* low level subtraction (assumes a > b), HAC pp.595 Algorithm 14.9 */
-int
-s_mp_sub (mp_int * a, mp_int * b, mp_int * c)
-{
-  int     olduse, res, min, max;
-
-  /* find sizes */
-  min = b->used;
-  max = a->used;
-
-  /* init result */
-  if (c->alloc < max) {
-    if ((res = mp_grow (c, max)) != MP_OKAY) {
-      return res;
-    }
-  }
-  olduse = c->used;
-  c->used = max;
-
-  /* sub digits from lower part */
-
-  {
-    register mp_digit u, *tmpa, *tmpb, *tmpc;
-    register int i;
-
-    /* alias for digit pointers */
-    tmpa = a->dp;
-    tmpb = b->dp;
-    tmpc = c->dp;
-
-    /* set carry to zero */
-    u = 0;
-    for (i = 0; i < min; i++) {
-      /* T[i] = A[i] - B[i] - U */
-      *tmpc = *tmpa++ - *tmpb++ - u;
-
-      /* U = carry bit of T[i] 
-       * Note this saves performing an AND operation since 
-       * if a carry does occur it will propagate all the way to the
-       * MSB.  As a result a single shift is required to get the carry
-       */
-      u = *tmpc >> (CHAR_BIT * sizeof (mp_digit) - 1);
-
-      /* Clear carry from T[i] */
-      *tmpc++ &= MP_MASK;
-    }
-
-    /* now copy higher words if any, e.g. if A has more digits than B  */
-    for (; i < max; i++) {
-      /* T[i] = A[i] - U */
-      *tmpc = *tmpa++ - u;
-
-      /* U = carry bit of T[i] */
-      u = *tmpc >> (CHAR_BIT * sizeof (mp_digit) - 1);
-
-      /* Clear carry from T[i] */
-      *tmpc++ &= MP_MASK;
-    }
-
-    /* clear digits above used (since we may not have grown result above) */
-    for (i = c->used; i < olduse; i++) {
-      *tmpc++ = 0;
-    }
-  }
-
-  mp_clamp (c);
-  return MP_OKAY;
-}
-
-/* End: bn_s_mp_sub.c */
-
-
-/* EOF */
+/* Start: bn_fast_mp_invmod.c */
+#line 0 "bn_fast_mp_invmod.c"
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is library that provides for multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library is designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+#include <tommath.h>
+
+/* Computes the modular inverse c = 1/a mod b with the binary extended
+ * Euclidean algorithm (a: value to invert, b: modulus, c: receives result).
+ * Returns MP_OKAY, MP_VAL when no inverse is found, or a helper's error code.
+ * Based on mp_invmod except this is optimized for the case where b is
+ * odd as per HAC Note 14.64 on pp. 610
+ */
+int
+fast_mp_invmod (mp_int * a, mp_int * b, mp_int * c)
+{
+  mp_int  x, y, u, v, B, D;
+  int     res, neg;
+
+  /* init all our temps; on failure return directly, bypassing __ERR */
+  if ((res = mp_init_multi(&x, &y, &u, &v, &B, &D, NULL)) != MP_OKAY) {
+     return res;
+  }
+
+  /* x == modulus, y == value to invert */
+  if ((res = mp_copy (b, &x)) != MP_OKAY) {
+    goto __ERR;
+  }
+
+  /* we need y = |a| */
+  if ((res = mp_abs (a, &y)) != MP_OKAY) {
+    goto __ERR;
+  }
+
+  /* 2. [modified] if x,y are both even then return an error!
+   *
+   * That is if gcd(x,y) = 2 * k then obviously there is no inverse.
+   */
+  if (mp_iseven (&x) == 1 && mp_iseven (&y) == 1) {
+    res = MP_VAL;
+    goto __ERR;
+  }
+
+  /* 3. u=x, v=y, D=1; B stays as initialised (B=0 per HAC); no A or C here */
+  if ((res = mp_copy (&x, &u)) != MP_OKAY) {
+    goto __ERR;
+  }
+  if ((res = mp_copy (&y, &v)) != MP_OKAY) {
+    goto __ERR;
+  }
+  mp_set (&D, 1);
+
+top:
+  /* 4.  while u is even do */
+  while (mp_iseven (&u) == 1) {
+    /* 4.1 u = u/2 */
+    if ((res = mp_div_2 (&u, &u)) != MP_OKAY) {
+      goto __ERR;
+    }
+    /* 4.2 if B is odd then B = B - x (which is even when x is odd) */
+    if (mp_iseven (&B) == 0) {
+      if ((res = mp_sub (&B, &x, &B)) != MP_OKAY) {
+        goto __ERR;
+      }
+    }
+    /* B = B/2 */
+    if ((res = mp_div_2 (&B, &B)) != MP_OKAY) {
+      goto __ERR;
+    }
+  }
+
+  /* 5.  while v is even do */
+  while (mp_iseven (&v) == 1) {
+    /* 5.1 v = v/2 */
+    if ((res = mp_div_2 (&v, &v)) != MP_OKAY) {
+      goto __ERR;
+    }
+    /* 5.2 if D is odd then */
+    if (mp_iseven (&D) == 0) {
+      /* D = D - x; the division by two follows below */
+      if ((res = mp_sub (&D, &x, &D)) != MP_OKAY) {
+        goto __ERR;
+      }
+    }
+    /* D = D/2 */
+    if ((res = mp_div_2 (&D, &D)) != MP_OKAY) {
+      goto __ERR;
+    }
+  }
+
+  /* 6.  if u >= v then */
+  if (mp_cmp (&u, &v) != MP_LT) {
+    /* u = u - v, B = B - D */
+    if ((res = mp_sub (&u, &v, &u)) != MP_OKAY) {
+      goto __ERR;
+    }
+
+    if ((res = mp_sub (&B, &D, &B)) != MP_OKAY) {
+      goto __ERR;
+    }
+  } else {
+    /* v = v - u, D = D - B */
+    if ((res = mp_sub (&v, &u, &v)) != MP_OKAY) {
+      goto __ERR;
+    }
+
+    if ((res = mp_sub (&D, &B, &D)) != MP_OKAY) {
+      goto __ERR;
+    }
+  }
+
+  /* if not zero goto step 4 */
+  if (mp_iszero (&u) == 0) {
+    goto top;
+  }
+
+  /* loop finished: v holds the gcd candidate and D the candidate inverse */
+
+  /* if v != 1 then there is no inverse */
+  if (mp_cmp_d (&v, 1) != MP_EQ) {
+    res = MP_VAL;
+    goto __ERR;
+  }
+
+  /* D is now the inverse: add b until it is >= 0; c then takes a's sign */
+  neg = a->sign;
+  while (D.sign == MP_NEG) {
+    if ((res = mp_add (&D, b, &D)) != MP_OKAY) {
+      goto __ERR;
+    }
+  }
+  mp_exch (&D, c);
+  c->sign = neg;
+  res = MP_OKAY;
+
+__ERR:mp_clear_multi (&x, &y, &u, &v, &B, &D, NULL);
+  return res;
+}
+
+/* End: bn_fast_mp_invmod.c */
+
+/* Start: bn_fast_mp_montgomery_reduce.c */
+#line 0 "bn_fast_mp_montgomery_reduce.c"
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is library that provides for multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library is designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+#include <tommath.h>
+
+/* computes xR^-1 == x (mod N) via Montgomery Reduction
+ *
+ * Optimized (comba) variant of mp_montgomery_reduce: the columns are summed
+ * in the mp_word array W first and the carries are propagated afterwards.
+ * a is reduced in place modulo m; mp is m' from "ui = ai * m' mod b" below.
+ * Returns MP_OKAY or the error code of mp_grow / s_mp_sub.
+ * Based on Algorithm 14.32 on pp.601 of HAC.
+*/
+int
+fast_mp_montgomery_reduce (mp_int * a, mp_int * m, mp_digit mp)
+{
+  int     ix, res, olduse;
+  mp_word W[MP_WARRAY];
+
+  /* get old used count */
+  olduse = a->used;
+
+  /* grow a as required */
+  if (a->alloc < m->used + 1) {
+    if ((res = mp_grow (a, m->used + 1)) != MP_OKAY) {
+      return res;
+    }
+  }
+
+  {
+    register mp_word *_W;
+    register mp_digit *tmpa;
+
+    _W = W;
+    tmpa = a->dp;
+
+    /* copy the digits of a into W[0..a->used-1] */
+    for (ix = 0; ix < a->used; ix++) {
+      *_W++ = *tmpa++;
+    }
+
+    /* zero the high words of W[a->used..m->used*2] */
+    for (; ix < m->used * 2 + 1; ix++) {
+      *_W++ = 0;
+    }
+  }
+
+  for (ix = 0; ix < m->used; ix++) {
+    /* ui = ai * m' mod b
+     *
+     * We avoid a double precision multiplication (which isn't required)
+     * by casting the value down to a mp_digit.  Note this requires that W[ix-1] have
+     * the carry cleared (see after the inner loop)
+     */
+    register mp_digit ui;
+    ui = (((mp_digit) (W[ix] & MP_MASK)) * mp) & MP_MASK;
+
+    /* a = a + ui * m * b^i
+     *
+     * This is computed in place and on the fly.  The multiplication
+     * by b^i is handled by offsetting which columns the results
+     * are added to.
+     *
+     * Note the comba method normally doesn't handle carries in the inner loop
+     * In this case we fix the carry from the previous column since the Montgomery
+     * reduction requires digits of the result (so far) [see above] to work.  This is
+     * handled by fixing up one carry after the inner loop.  The carry fixups are done
+     * in order so after these loops the first m->used words of W[] have the carries
+     * fixed
+     */
+    {
+      register int iy;
+      register mp_digit *tmpx;
+      register mp_word *_W;
+
+      /* alias for the digits of the modulus */
+      tmpx = m->dp;
+
+      /* Alias for the columns set by an offset of ix */
+      _W = W + ix;
+
+      /* inner loop */
+      for (iy = 0; iy < m->used; iy++) {
+    *_W++ += ((mp_word) ui) * ((mp_word) * tmpx++);
+      }
+    }
+
+    /* now fix carry for next digit, W[ix+1] */
+    W[ix + 1] += W[ix] >> ((mp_word) DIGIT_BIT);
+  }
+
+
+  {
+    register mp_digit *tmpa;
+    register mp_word *_W, *_W1;
+
+    /* now fix rest of carries; stop at W[m->used*2], the last zeroed word */
+    _W1 = W + ix;
+    _W = W + ++ix;
+
+    for (; ix < m->used * 2 + 1; ix++) {
+      *_W++ += *_W1++ >> ((mp_word) DIGIT_BIT);
+    }
+
+    /* copy out, A = A/b^n
+     *
+     * The result is A/b^n but instead of converting from an array of mp_word
+     * to mp_digit then calling mp_rshd we just copy them in the right
+     * order
+     */
+    tmpa = a->dp;
+    _W = W + m->used;
+
+    for (ix = 0; ix < m->used + 1; ix++) {
+      *tmpa++ = *_W++ & ((mp_word) MP_MASK);
+    }
+
+    /* zero oldused digits, if the input a was larger than
+     * m->used+1 we'll have to clear the digits */
+    for (; ix < olduse; ix++) {
+      *tmpa++ = 0;
+    }
+  }
+
+  /* set the max used and clamp */
+  a->used = m->used + 1;
+  mp_clamp (a);
+
+  /* if A >= m then A = A - m */
+  if (mp_cmp_mag (a, m) != MP_LT) {
+    return s_mp_sub (a, m, a);
+  }
+  return MP_OKAY;
+}
+
+/* End: bn_fast_mp_montgomery_reduce.c */
+
+/* Start: bn_fast_s_mp_mul_digs.c */
+#line 0 "bn_fast_s_mp_mul_digs.c"
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is library that provides for multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library is designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+#include <tommath.h>
+
+/* Fast (comba) multiplier
+ *
+ * This is the fast column-array [comba] multiplier.  It is
+ * designed to compute the columns of the product first
+ * then handle the carries afterwards.  This has the effect
+ * of making the nested loops that compute the columns very
+ * simple and schedulable on super-scalar processors.
+ *
+ * This has been modified to produce a variable number of
+ * digits of output so if say only a half-product is required
+ * you don't have to compute the upper half (a feature
+ * required for fast Barrett reduction).
+ * c receives the low digs digits of a*b; returns MP_OKAY or mp_grow's error.
+ * Based on Algorithm 14.12 on pp.595 of HAC.
+ * NOTE(review): digs == 0 would make the carry pass read W[-1] -- confirm.
+ */
+int
+fast_s_mp_mul_digs (mp_int * a, mp_int * b, mp_int * c, int digs)
+{
+  int     olduse, res, pa, ix;
+  mp_word W[MP_WARRAY];
+
+  /* grow the destination as required */
+  if (c->alloc < digs) {
+    if ((res = mp_grow (c, digs)) != MP_OKAY) {
+      return res;
+    }
+  }
+
+  /* clear temp buf (the columns) */
+  memset (W, 0, sizeof (mp_word) * digs);
+
+  /* calculate the columns */
+  pa = a->used;
+  for (ix = 0; ix < pa; ix++) {
+    /* this multiplier has been modified to allow you to
+     * control how many digits of output are produced.
+     * So at most we want to make up to "digs" digits of output.
+     *
+     * this adds products to distinct columns (at ix+iy) of W
+     * note that each step through the loop is not dependent on
+     * the previous which means the compiler can easily unroll
+     * the loop without scheduling problems
+     */
+    {
+      register mp_digit tmpx, *tmpy;
+      register mp_word *_W;
+      register int iy, pb;
+
+      /* copy of the word on the left, i.e. A[ix] in A[ix] * B[iy] */
+      tmpx = a->dp[ix];
+
+      /* alias for the right side */
+      tmpy = b->dp;
+
+      /* alias for the columns, each step through the loop adds a new
+         term to each column
+       */
+      _W = W + ix;
+
+      /* the number of digits is limited by their placement.  E.g.
+         we avoid multiplying digits that will end up above the # of
+         digits of precision requested
+       */
+      pb = MIN (b->used, digs - ix);
+
+      for (iy = 0; iy < pb; iy++) {
+        *_W++ += ((mp_word) tmpx) * ((mp_word) * tmpy++);
+      }
+    }
+
+  }
+
+  /* setup dest */
+  olduse = c->used;
+  c->used = digs;
+
+  {
+    register mp_digit *tmpc;
+
+    /* At this point W[] contains the sums of each column.  To get the
+     * correct result we must take the extra bits from each column and
+     * carry them down
+     *
+     * Note that while this adds extra code to the multiplier it
+     * saves time since the carry propagation is removed from the
+     * above nested loop.  This has the effect of reducing the work
+     * from N*(N+N*c)==N**2 + c*N**2 to N**2 + N*c where c is the
+     * cost of the shifting.  On very small numbers this is slower
+     * but on most cryptographic size numbers it is faster.
+     */
+    tmpc = c->dp;
+    for (ix = 1; ix < digs; ix++) {
+      W[ix] += (W[ix - 1] >> ((mp_word) DIGIT_BIT));
+      *tmpc++ = (mp_digit) (W[ix - 1] & ((mp_word) MP_MASK));
+    }
+    *tmpc++ = (mp_digit) (W[digs - 1] & ((mp_word) MP_MASK));
+
+    /* clear digits of the old value of c that lie above the new used count */
+    for (; ix < olduse; ix++) {
+      *tmpc++ = 0;
+    }
+  }
+
+  mp_clamp (c);
+  return MP_OKAY;
+}
+
+/* End: bn_fast_s_mp_mul_digs.c */
+
+/* Start: bn_fast_s_mp_mul_high_digs.c */
+#line 0 "bn_fast_s_mp_mul_high_digs.c"
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is library that provides for multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library is designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+#include <tommath.h>
+
+/* this is a modified version of fast_s_mp_mul_digs that only produces
+ * output digits at or above position digs.  See the comments for
+ * fast_s_mp_mul_digs to see how it works.
+ *
+ * This is used in the Barrett reduction since for one of the multiplications
+ * only the higher digits were needed.  This essentially halves the work.
+ * Returns MP_OKAY, or the mp_grow error if c cannot be enlarged.
+ * Based on Algorithm 14.12 on pp.595 of HAC.
+ */
+int
+fast_s_mp_mul_high_digs (mp_int * a, mp_int * b, mp_int * c, int digs)
+{
+  int     oldused, newused, res, pa, pb, ix;
+  mp_word W[MP_WARRAY];
+
+  /* calculate size of product and allocate more space if required */
+  newused = a->used + b->used + 1;
+  if (c->alloc < newused) {
+    if ((res = mp_grow (c, newused)) != MP_OKAY) {
+      return res;
+    }
+  }
+
+  /* like the other comba method we compute the columns first */
+  pa = a->used;
+  pb = b->used;
+  memset (W + digs, 0, (pa + pb + 1 - digs) * sizeof (mp_word));
+  for (ix = 0; ix < pa; ix++) {
+    {
+      register mp_digit tmpx, *tmpy;
+      register int iy;
+      register mp_word *_W;
+
+      /* work to do, that is we only calculate digits that are at "digs" or above  */
+      iy = digs - ix;
+
+      /* copy of word on the left of A[ix] * B[iy] */
+      tmpx = a->dp[ix];
+
+      /* right side; NOTE(review): before b->dp while iy < 0, fixed up below */
+      tmpy = b->dp + iy;
+     
+      /* alias for the columns of output.  Offset to be equal to or above the
+       * smallest digit place requested
+       */
+      _W = W + digs;     
+      
+      /* ix > digs: restart at B[0]; its product with A[ix] lands in column ix */
+      if (iy < 0) {
+         iy    = abs(iy);
+         tmpy += iy;
+         _W   += iy;
+         iy    = 0;
+      }
+
+      /* compute column products for digits above the minimum */
+      for (; iy < pb; iy++) {
+    *_W++ += ((mp_word) tmpx) * ((mp_word) * tmpy++);
+      }
+    }
+  }
+
+  /* setup dest */
+  oldused = c->used;
+  c->used = newused;
+
+  /* now carry W[digs..] into digits of c; c->dp[0..digs-1] are not written */
+  for (ix = digs + 1; ix < newused; ix++) {
+    W[ix] += (W[ix - 1] >> ((mp_word) DIGIT_BIT));
+    c->dp[ix - 1] = (mp_digit) (W[ix - 1] & ((mp_word) MP_MASK));
+  }
+  c->dp[(pa + pb + 1) - 1] = (mp_digit) (W[(pa + pb + 1) - 1] & ((mp_word) MP_MASK));
+
+  for (; ix < oldused; ix++) {
+    c->dp[ix] = 0;
+  }
+  mp_clamp (c);
+  return MP_OKAY;
+}
+
+/* End: bn_fast_s_mp_mul_high_digs.c */
+
+/* Start: bn_fast_s_mp_sqr.c */
+#line 0 "bn_fast_s_mp_sqr.c"
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is library that provides for multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library is designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+#include <tommath.h>
+
+/* fast squaring, b = a*a
+ *
+ * This is the comba method where the columns of the product are computed first
+ * then the carries are computed.  This has the effect of making a very simple
+ * inner loop that is executed the most
+ *
+ * W2 represents the outer products and W the inner.
+ *
+ * A further optimization is made because the inner products are of the form
+ * "A * B * 2".  The *2 part does not need to be computed until the end which is
+ * good because 64-bit shifts are slow!
+ *
+ * Based on Algorithm 14.16 on pp.597 of HAC.
+ * Returns MP_OKAY, or the mp_grow error if b cannot be enlarged.
+ */
+int
+fast_s_mp_sqr (mp_int * a, mp_int * b)
+{
+  int     olduse, newused, res, ix, pa;
+  mp_word W2[MP_WARRAY], W[MP_WARRAY];
+
+  /* calculate size of product and allocate as required */
+  pa = a->used;
+  newused = pa + pa + 1;
+  if (b->alloc < newused) {
+    if ((res = mp_grow (b, newused)) != MP_OKAY) {
+      return res;
+    }
+  }
+
+  /* zero temp buffer (columns)
+   * Note that there are two buffers.  Since squaring requires
+   * an outer and inner product and the inner product requires
+   * computing a product and doubling it (a relatively expensive
+   * op to perform n^2 times if you don't have to) the inner and
+   * outer products are computed in different buffers.  This way
+   * the inner product can be doubled using n doublings instead of
+   * n^2
+   */
+  memset (W, 0, newused * sizeof (mp_word));
+  memset (W2, 0, newused * sizeof (mp_word));
+
+/* note optimization
+ * values in W2 are only written in even locations which means
+ * we can collapse the array to 256 words [and fixup the memset above]
+ * provided we also fix up the summations below.  Ideally
+ * the fixup loop should be unrolled twice to handle the even/odd
+ * cases, and then a final step to handle odd cases [e.g. newused == odd]
+ *
+ * This will not only save ~8*256 = 2KB of stack but lower the number of
+ * operations required to finally fix up the columns
+ */
+
+  /* This computes the inner product.  To simplify the inner N^2 loop
+   * the multiplication by two is done afterwards in the N loop.
+   */
+  for (ix = 0; ix < pa; ix++) {
+    /* compute the outer product
+     *
+     * Note that every outer product is computed
+     * for a particular column only once which means that
+     * there is no need to do a double precision addition
+     */
+    W2[ix + ix] = ((mp_word) a->dp[ix]) * ((mp_word) a->dp[ix]);
+
+    {
+      register mp_digit tmpx, *tmpy;
+      register mp_word *_W;
+      register int iy;
+
+      /* copy of left side */
+      tmpx = a->dp[ix];
+
+      /* alias for right side */
+      tmpy = a->dp + (ix + 1);
+
+      /* the column to store the result in */
+      _W = W + (ix + ix + 1);
+
+      /* inner products */
+      for (iy = ix + 1; iy < pa; iy++) {
+          *_W++ += ((mp_word) tmpx) * ((mp_word) * tmpy++);
+      }
+    }
+  }
+
+  /* setup dest */
+  olduse = b->used;
+  b->used = newused;
+
+  /* double/add column 0; no inner product lands there, so W[0] = W2[0] */
+  W[0] += W[0] + W2[0];
+
+  /* now compute digits */
+  {
+    register mp_digit *tmpb;
+
+    tmpb = b->dp;
+
+    for (ix = 1; ix < newused; ix++) {
+      /* double the inner products of this column and add the outer product */
+      W[ix] += W[ix] + W2[ix];
+
+      W[ix] = W[ix] + (W[ix - 1] >> ((mp_word) DIGIT_BIT));
+      *tmpb++ = (mp_digit) (W[ix - 1] & ((mp_word) MP_MASK));
+    }
+    *tmpb++ = (mp_digit) (W[(newused) - 1] & ((mp_word) MP_MASK));
+
+    /* clear digits of the old value of b above the new used count */
+    for (; ix < olduse; ix++) {
+      *tmpb++ = 0;
+    }
+  }
+
+  mp_clamp (b);
+  return MP_OKAY;
+}
+
+/* End: bn_fast_s_mp_sqr.c */
+
+/* Start: bn_mp_2expt.c */
+#line 0 "bn_mp_2expt.c"
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is library that provides for multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library is designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+#include <tommath.h>
+
+/* computes a = 2^b
+ *
+ * Simple algorithm which zeroes the int, grows it then just sets one bit
+ * as required.
+ *
+ * a  destination, overwritten with the power of two
+ * b  exponent; must be non-negative
+ *
+ * Returns MP_OKAY on success, MP_VAL for a negative exponent, or the error
+ * code from mp_grow when the destination cannot be enlarged.
+ */
+int
+mp_2expt (mp_int * a, int b)
+{
+  int     res;
+
+  /* a negative exponent would produce a negative digit index and a
+   * negative shift count below, so reject it up front
+   */
+  if (b < 0) {
+    return MP_VAL;
+  }
+
+  mp_zero (a);
+  if ((res = mp_grow (a, b / DIGIT_BIT + 1)) != MP_OKAY) {
+    return res;
+  }
+  a->used = b / DIGIT_BIT + 1;
+
+  /* shift in the digit type: a plain int "1" overflows once the bit
+   * position reaches the width of int (e.g. with 60-bit digits)
+   */
+  a->dp[b / DIGIT_BIT] = ((mp_digit) 1) << (b % DIGIT_BIT);
+
+  return MP_OKAY;
+}
+
+/* End: bn_mp_2expt.c */
+
+/* Start: bn_mp_abs.c */
+#line 0 "bn_mp_abs.c"
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is library that provides for multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library is designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+#include <tommath.h>
+
+/* b = |a|
+ *
+ * Copies a into b and then forces the sign of b to positive.
+ * Returns the error from mp_copy if the copy fails (b's sign is then
+ * left untouched), MP_OKAY otherwise.
+ */
+int
+mp_abs (mp_int * a, mp_int * b)
+{
+  int     status = mp_copy (a, b);
+
+  if (status == MP_OKAY) {
+    b->sign = MP_ZPOS;
+  }
+  return status;
+}
+
+/* End: bn_mp_abs.c */
+
+/* Start: bn_mp_add.c */
+#line 0 "bn_mp_add.c"
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is library that provides for multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library is designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+#include <tommath.h>
+
+/* high level addition (handles signs): c = a + b
+ *
+ * Returns whatever the low level magnitude routine (s_mp_add or s_mp_sub)
+ * returns.
+ */
+int
+mp_add (mp_int * a, mp_int * b, mp_int * c)
+{
+  /* read both signs first: c may alias a or b */
+  const int sign_a = a->sign;
+  const int sign_b = b->sign;
+
+  /* same sign: add the magnitudes and keep the common sign */
+  if (sign_a == sign_b) {
+    c->sign = sign_a;
+    return s_mp_add (a, b, c);
+  }
+
+  /* opposite signs: subtract the smaller magnitude from the larger one;
+   * the result takes the sign of the operand with the larger magnitude
+   */
+  if (mp_cmp_mag (a, b) == MP_LT) {
+    c->sign = sign_b;
+    return s_mp_sub (b, a, c);
+  }
+
+  c->sign = sign_a;
+  return s_mp_sub (a, b, c);
+}
+
+
+/* End: bn_mp_add.c */
+
+/* Start: bn_mp_add_d.c */
+#line 0 "bn_mp_add_d.c"
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is library that provides for multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library is designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+#include <tommath.h>
+
+/* single digit addition: c = a + b
+ *
+ * The digit is loaded into a temporary mp_int and the general mp_add is
+ * used.  Returns the error from mp_init_size or mp_add, MP_OKAY otherwise.
+ */
+int
+mp_add_d (mp_int * a, mp_digit b, mp_int * c)
+{
+  mp_int  addend;
+  int     status;
+
+  status = mp_init_size (&addend, 1);
+  if (status == MP_OKAY) {
+    mp_set (&addend, b);
+    status = mp_add (a, &addend, c);
+
+    /* the temporary is released whether or not the addition worked */
+    mp_clear (&addend);
+  }
+  return status;
+}
+
+/* End: bn_mp_add_d.c */
+
+/* Start: bn_mp_addmod.c */
+#line 0 "bn_mp_addmod.c"
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is library that provides for multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library is designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+#include <tommath.h>
+
+/* d = a + b (mod c)
+ *
+ * The sum is formed in a temporary and then reduced with mp_mod.
+ * Returns the first error reported by mp_init, mp_add or mp_mod.
+ */
+int
+mp_addmod (mp_int * a, mp_int * b, mp_int * c, mp_int * d)
+{
+  mp_int  sum;
+  int     status;
+
+  status = mp_init (&sum);
+  if (status != MP_OKAY) {
+    return status;
+  }
+
+  status = mp_add (a, b, &sum);
+  if (status == MP_OKAY) {
+    status = mp_mod (&sum, c, d);
+  }
+
+  mp_clear (&sum);
+  return status;
+}
+
+/* End: bn_mp_addmod.c */
+
+/* Start: bn_mp_and.c */
+#line 0 "bn_mp_and.c"
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is library that provides for multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library is designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+#include <tommath.h>
+
+/* AND two ints together: c = a & b (digit-wise)
+ *
+ * A copy of the operand with more digits is ANDed with the other operand;
+ * the digits the shorter operand does not have are cleared.
+ * Returns the error from mp_init_copy, MP_OKAY otherwise.
+ */
+int
+mp_and (mp_int * a, mp_int * b, mp_int * c)
+{
+  mp_int  result, *longer, *shorter;
+  int     status, pos, common;
+
+  /* pick the operand to copy (more digits) and the one to mask with */
+  if (a->used > b->used) {
+    longer = a;
+    shorter = b;
+  } else {
+    longer = b;
+    shorter = a;
+  }
+
+  status = mp_init_copy (&result, longer);
+  if (status != MP_OKAY) {
+    return status;
+  }
+  common = shorter->used;
+
+  /* digits both operands have */
+  for (pos = 0; pos < common; pos++) {
+    result.dp[pos] &= shorter->dp[pos];
+  }
+
+  /* anything above the shorter operand ANDs with zero */
+  while (pos < result.used) {
+    result.dp[pos++] = 0;
+  }
+
+  mp_clamp (&result);
+
+  /* hand the result to c; the old contents of c are freed with the temp */
+  mp_exch (c, &result);
+  mp_clear (&result);
+  return MP_OKAY;
+}
+
+/* End: bn_mp_and.c */
+
+/* Start: bn_mp_clamp.c */
+#line 0 "bn_mp_clamp.c"
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is library that provides for multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library is designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+#include <tommath.h>
+
+/* trim unused digits
+ *
+ * Lowers a->used until the top used digit is non-zero (or no digits are
+ * left).  When no digits remain the sign is set to positive.
+ */
+void
+mp_clamp (mp_int * a)
+{
+  int     top = a->used;
+
+  /* walk down past every leading zero digit */
+  while (top > 0 && a->dp[top - 1] == 0) {
+    --top;
+  }
+  a->used = top;
+
+  if (top == 0) {
+    a->sign = MP_ZPOS;
+  }
+}
+
+/* End: bn_mp_clamp.c */
+
+/* Start: bn_mp_clear.c */
+#line 0 "bn_mp_clear.c"
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is library that provides for multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library is designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with 
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+#include <tommath.h>
+
+/* clear one (frees)
+ *
+ * Zeroes the used digits, releases the digit array and resets the
+ * bookkeeping fields.  Does nothing when the digit pointer is already NULL.
+ */
+void
+mp_clear (mp_int * a)
+{
+  if (a->dp == NULL) {
+    return;
+  }
+
+  /* wipe the digits that were in use before giving the memory back */
+  memset (a->dp, 0, sizeof (mp_digit) * a->used);
+  free (a->dp);
+
+  /* reset members to make debugging easier */
+  a->dp = NULL;
+  a->used = 0;
+  a->alloc = 0;
+}
+
+/* End: bn_mp_clear.c */
+
+/* Start: bn_mp_cmp.c */
+#line 0 "bn_mp_cmp.c"
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is library that provides for multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library is designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+#include <tommath.h>
+
+/* compare two ints (signed)
+ *
+ * Returns MP_LT, MP_EQ or MP_GT for a < b, a == b, a > b.
+ */
+int
+mp_cmp (mp_int * a, mp_int * b)
+{
+  if (a->sign == MP_NEG) {
+    /* negative vs. positive is decided by the signs alone */
+    if (b->sign == MP_ZPOS) {
+      return MP_LT;
+    }
+    /* otherwise compare magnitudes in the opposite direction */
+    return mp_cmp_mag (b, a);
+  }
+
+  /* positive vs. negative is decided by the signs alone */
+  if (a->sign == MP_ZPOS && b->sign == MP_NEG) {
+    return MP_GT;
+  }
+
+  return mp_cmp_mag (a, b);
+}
+
+/* End: bn_mp_cmp.c */
+
+/* Start: bn_mp_cmp_d.c */
+#line 0 "bn_mp_cmp_d.c"
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is library that provides for multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library is designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+#include <tommath.h>
+
+/* compare a digit
+ *
+ * Returns MP_LT, MP_EQ or MP_GT for a < b, a == b, a > b where b is a
+ * single (non-negative) digit.
+ */
+int
+mp_cmp_d (mp_int * a, mp_digit b)
+{
+  mp_digit low;
+
+  /* any negative value is below a digit */
+  if (a->sign == MP_NEG) {
+    return MP_LT;
+  }
+
+  /* more than one digit in use is above any single digit */
+  if (a->used > 1) {
+    return MP_GT;
+  }
+
+  /* otherwise the lowest digit decides */
+  low = a->dp[0];
+  if (low == b) {
+    return MP_EQ;
+  }
+  return (low > b) ? MP_GT : MP_LT;
+}
+
+/* End: bn_mp_cmp_d.c */
+
+/* Start: bn_mp_cmp_mag.c */
+#line 0 "bn_mp_cmp_mag.c"
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is library that provides for multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library is designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+#include <tommath.h>
+
+/* compare magnitude of two ints (unsigned)
+ *
+ * Returns MP_LT, MP_EQ or MP_GT for |a| < |b|, |a| == |b|, |a| > |b|.
+ */
+int
+mp_cmp_mag (mp_int * a, mp_int * b)
+{
+  int     idx;
+
+  /* a different number of used digits settles it */
+  if (a->used != b->used) {
+    return (a->used > b->used) ? MP_GT : MP_LT;
+  }
+
+  /* same length: the most significant differing digit decides */
+  idx = a->used;
+  while (idx-- > 0) {
+    const mp_digit da = a->dp[idx];
+    const mp_digit db = b->dp[idx];
+
+    if (da != db) {
+      return (da > db) ? MP_GT : MP_LT;
+    }
+  }
+  return MP_EQ;
+}
+
+/* End: bn_mp_cmp_mag.c */
+
+/* Start: bn_mp_copy.c */
+#line 0 "bn_mp_copy.c"
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is library that provides for multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library is designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+#include <tommath.h>
+
+/* copy, b = a
+ *
+ * Copies the digits, digit count and sign of a into b.  Digits b was
+ * using beyond the length of a are zeroed.  Returns the error from
+ * mp_grow, MP_OKAY otherwise.
+ */
+int
+mp_copy (mp_int * a, mp_int * b)
+{
+  int       status, idx;
+  mp_digit *src, *dst;
+
+  /* same object (or same digit storage): nothing to do */
+  if (a == b || a->dp == b->dp) {
+    return MP_OKAY;
+  }
+
+  /* make room in the destination */
+  status = mp_grow (b, a->used);
+  if (status != MP_OKAY) {
+    return status;
+  }
+
+  src = a->dp;
+  dst = b->dp;
+
+  /* copy all the digits of the source */
+  for (idx = 0; idx < a->used; idx++) {
+    dst[idx] = src[idx];
+  }
+
+  /* clear whatever the destination had above that */
+  for (; idx < b->used; idx++) {
+    dst[idx] = 0;
+  }
+
+  b->used = a->used;
+  b->sign = a->sign;
+  return MP_OKAY;
+}
+
+/* End: bn_mp_copy.c */
+
+/* Start: bn_mp_count_bits.c */
+#line 0 "bn_mp_count_bits.c"
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is library that provides for multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library is designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+#include <tommath.h>
+
+/* returns the number of bits in an int
+ *
+ * Zero digits in use gives 0.  Otherwise every digit below the top one
+ * counts as DIGIT_BIT bits and the top digit contributes the position of
+ * its highest set bit.
+ */
+int
+mp_count_bits (mp_int * a)
+{
+  mp_digit top;
+  int     bits;
+
+  if (a->used == 0) {
+    return 0;
+  }
+
+  /* full digits below the most significant one */
+  bits = (a->used - 1) * DIGIT_BIT;
+
+  /* shift the top digit out one bit at a time */
+  for (top = a->dp[a->used - 1]; top > ((mp_digit) 0); top >>= ((mp_digit) 1)) {
+    ++bits;
+  }
+  return bits;
+}
+
+/* End: bn_mp_count_bits.c */
+
+/* Start: bn_mp_div.c */
+#line 0 "bn_mp_div.c"
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is library that provides for multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library is designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+#include <tommath.h>
+
+/* integer signed division. c*b + d == a [e.g. a/b, c=quotient, d=remainder]
+ * HAC pp.598 Algorithm 14.20
+ *
+ * Note that the description in HAC is horribly incomplete.  For example,
+ * it doesn't consider the case where digits are removed from 'x' in the inner
+ * loop.  It also doesn't consider the case that y has fewer than three digits, etc..
+ *
+ * The overall algorithm is as described as 14.20 from HAC but fixed to treat these cases.
+ *
+ * a  dividend
+ * b  divisor (MP_VAL is returned when it is zero)
+ * c  receives the quotient, may be NULL
+ * d  receives the remainder, may be NULL; it carries the sign of a unless
+ *    it is zero, in which case it is positive
+ *
+ * Returns MP_OKAY on success, MP_VAL for a zero divisor, or the first error
+ * reported by a helper.  On a helper error neither c nor d has been written.
+ */
+int
+mp_div (mp_int * a, mp_int * b, mp_int * c, mp_int * d)
+{
+  mp_int  q, x, y, t1, t2;
+  int     res, n, t, i, norm, neg;
+
+
+  /* is divisor zero ? */
+  if (mp_iszero (b) == 1) {
+    return MP_VAL;
+  }
+
+  /* if |a| < |b| then q=0, r = a */
+  if (mp_cmp_mag (a, b) == MP_LT) {
+    if (d != NULL) {
+      res = mp_copy (a, d);
+    } else {
+      res = MP_OKAY;
+    }
+    if (c != NULL) {
+      mp_zero (c);
+    }
+    return res;
+  }
+
+  if ((res = mp_init_size (&q, a->used + 2)) != MP_OKAY) {
+    return res;
+  }
+  q.used = a->used + 2;
+
+  if ((res = mp_init (&t1)) != MP_OKAY) {
+    goto __Q;
+  }
+
+  if ((res = mp_init (&t2)) != MP_OKAY) {
+    goto __T1;
+  }
+
+  if ((res = mp_init_copy (&x, a)) != MP_OKAY) {
+    goto __T2;
+  }
+
+  if ((res = mp_init_copy (&y, b)) != MP_OKAY) {
+    goto __X;
+  }
+
+  /* fix the sign: the work is done on magnitudes, the quotient sign is
+   * remembered for the end
+   */
+  neg = (a->sign == b->sign) ? MP_ZPOS : MP_NEG;
+  x.sign = y.sign = MP_ZPOS;
+
+  /* normalize both x and y, ensure that y >= b/2, [b == 2^DIGIT_BIT] */
+  norm = mp_count_bits(&y) % DIGIT_BIT;
+  if (norm < (int)(DIGIT_BIT-1)) {
+     norm = (DIGIT_BIT-1) - norm;
+     if ((res = mp_mul_2d (&x, norm, &x)) != MP_OKAY) {
+       goto __Y;
+     }
+     if ((res = mp_mul_2d (&y, norm, &y)) != MP_OKAY) {
+       goto __Y;
+     }
+  } else {
+     norm = 0;
+  }
+
+  /* note hac does 0 based, so if used==5 then its 0,1,2,3,4, e.g. use 4 */
+  n = x.used - 1;
+  t = y.used - 1;
+
+  /* step 2. while (x >= y*b^n-t) do { q[n-t] += 1; x -= y*b^{n-t} } */
+  if ((res = mp_lshd (&y, n - t)) != MP_OKAY) { /* y = y*b^{n-t} */
+    goto __Y;
+  }
+
+  while (mp_cmp (&x, &y) != MP_LT) {
+    ++(q.dp[n - t]);
+    if ((res = mp_sub (&x, &y, &x)) != MP_OKAY) {
+      goto __Y;
+    }
+  }
+
+  /* reset y by shifting it back down */
+  mp_rshd (&y, n - t);
+
+  /* step 3. for i from n down to (t + 1) */
+  for (i = n; i >= (t + 1); i--) {
+    /* x has already lost this digit position, nothing to divide here */
+    if (i > x.used) {
+      continue;
+    }
+
+    /* step 3.1 if xi == yt then set q{i-t-1} to b-1, otherwise set q{i-t-1} to (xi*b + x{i-1})/yt */
+    if (x.dp[i] == y.dp[t]) {
+      q.dp[i - t - 1] = ((((mp_digit)1) << DIGIT_BIT) - 1);
+    } else {
+      mp_word tmp;
+      tmp = ((mp_word) x.dp[i]) << ((mp_word) DIGIT_BIT);
+      tmp |= ((mp_word) x.dp[i - 1]);
+      tmp /= ((mp_word) y.dp[t]);
+      if (tmp > (mp_word) MP_MASK)
+        tmp = MP_MASK;
+      q.dp[i - t - 1] = (mp_digit) (tmp & (mp_word) (MP_MASK));
+    }
+
+    /* step 3.2 while (q{i-t-1} * (yt * b + y{t-1})) > xi * b^2 + xi-1 * b + xi-2 do q{i-t-1} -= 1;
+     *
+     * the estimate is bumped by one first so the do/while can decrement
+     * before every test, including the first
+     */
+    q.dp[i - t - 1] = (q.dp[i - t - 1] + 1) & MP_MASK;
+    do {
+      q.dp[i - t - 1] = (q.dp[i - t - 1] - 1) & MP_MASK;
+
+      /* find left hand */
+      mp_zero (&t1);
+      t1.dp[0] = (t - 1 < 0) ? 0 : y.dp[t - 1];
+      t1.dp[1] = y.dp[t];
+      t1.used = 2;
+      if ((res = mp_mul_d (&t1, q.dp[i - t - 1], &t1)) != MP_OKAY) {
+        goto __Y;
+      }
+
+      /* find right hand */
+      t2.dp[0] = (i - 2 < 0) ? 0 : x.dp[i - 2];
+      t2.dp[1] = (i - 1 < 0) ? 0 : x.dp[i - 1];
+      t2.dp[2] = x.dp[i];
+      t2.used = 3;
+    } while (mp_cmp_mag(&t1, &t2) == MP_GT);
+
+    /* step 3.3 x = x - q{i-t-1} * y * b^{i-t-1} */
+    if ((res = mp_mul_d (&y, q.dp[i - t - 1], &t1)) != MP_OKAY) {
+      goto __Y;
+    }
+
+    if ((res = mp_lshd (&t1, i - t - 1)) != MP_OKAY) {
+      goto __Y;
+    }
+
+    if ((res = mp_sub (&x, &t1, &x)) != MP_OKAY) {
+      goto __Y;
+    }
+
+    /* step 3.4 if x < 0 then { x = x + y*b^{i-t-1}; q{i-t-1} -= 1; } */
+    if (x.sign == MP_NEG) {
+      if ((res = mp_copy (&y, &t1)) != MP_OKAY) {
+        goto __Y;
+      }
+      if ((res = mp_lshd (&t1, i - t - 1)) != MP_OKAY) {
+        goto __Y;
+      }
+      if ((res = mp_add (&x, &t1, &x)) != MP_OKAY) {
+        goto __Y;
+      }
+
+      q.dp[i - t - 1] = (q.dp[i - t - 1] - 1UL) & MP_MASK;
+    }
+  }
+
+  /* now q is the quotient and x is the remainder [which we have to normalize] */
+  /* get sign before writing to c (c may alias a); a zero remainder must
+   * stay positive, otherwise a "negative zero" would be handed back
+   */
+  x.sign = (x.used == 0) ? MP_ZPOS : a->sign;
+
+  /* undo the normalization shift on the local remainder before any output
+   * is touched, so a failure here leaves c and d unchanged
+   */
+  if (d != NULL) {
+    if ((res = mp_div_2d (&x, norm, &x, NULL)) != MP_OKAY) {
+      goto __Y;
+    }
+  }
+
+  if (c != NULL) {
+    mp_clamp (&q);
+    mp_exch (&q, c);
+    c->sign = neg;
+  }
+
+  if (d != NULL) {
+    mp_exch (&x, d);
+  }
+
+  res = MP_OKAY;
+
+  /* cleanup ladder: each label frees the temps initialised before it */
+__Y:mp_clear (&y);
+__X:mp_clear (&x);
+__T2:mp_clear (&t2);
+__T1:mp_clear (&t1);
+__Q:mp_clear (&q);
+  return res;
+}
+
+/* End: bn_mp_div.c */
+
+/* Start: bn_mp_div_2.c */
+#line 0 "bn_mp_div_2.c"
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is library that provides for multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library is designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+#include <tommath.h>
+
+/* b = a/2
+ *
+ * Shifts every digit of a right by one bit, carrying the dropped low bit
+ * into the top of the next lower digit.  b takes the sign of a.
+ * Returns the error from mp_grow, MP_OKAY otherwise.
+ */
+int
+mp_div_2 (mp_int * a, mp_int * b)
+{
+  int       idx, status, prev_used;
+  mp_digit  carry, low_bit, *src, *dst;
+
+  /* make sure the destination can hold every digit of the source */
+  if (b->alloc < a->used) {
+    status = mp_grow (b, a->used);
+    if (status != MP_OKAY) {
+      return status;
+    }
+  }
+
+  prev_used = b->used;
+  b->used = a->used;
+
+  src = a->dp;
+  dst = b->dp;
+
+  /* walk from the most significant digit down */
+  carry = 0;
+  for (idx = b->used - 1; idx >= 0; idx--) {
+    /* the bit shifted out here becomes the top bit of the digit below */
+    low_bit = src[idx] & 1;
+
+    dst[idx] = (src[idx] >> 1) | (carry << (DIGIT_BIT - 1));
+
+    carry = low_bit;
+  }
+
+  /* zero digits the destination used beyond the new length */
+  for (idx = b->used; idx < prev_used; idx++) {
+    dst[idx] = 0;
+  }
+
+  b->sign = a->sign;
+  mp_clamp (b);
+  return MP_OKAY;
+}
+
+/* End: bn_mp_div_2.c */
+
+/* Start: bn_mp_div_2d.c */
+#line 0 "bn_mp_div_2d.c"
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is library that provides for multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library is designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+#include <tommath.h>
+
+/* shift right by a certain bit count (store quotient in c, remainder in d)
+ *
+ * c receives a shifted right by b bits.  When d is not NULL it receives the
+ * result of mp_mod_2d (a, b).  A shift count <= 0 copies a to c and zeroes d.
+ * Returns the first error from mp_copy, mp_init or mp_mod_2d, MP_OKAY
+ * otherwise.
+ */
+int
+mp_div_2d (mp_int * a, int b, mp_int * c, mp_int * d)
+{
+  mp_int   rem;
+  mp_digit bits, carry, saved;
+  int      idx, status;
+
+  /* nothing to shift: the quotient is a itself and the remainder is zero */
+  if (b <= 0) {
+    status = mp_copy (a, c);
+    if (d != NULL) {
+      mp_zero (d);
+    }
+    return status;
+  }
+
+  status = mp_init (&rem);
+  if (status != MP_OKAY) {
+    return status;
+  }
+
+  /* the remainder is taken from a before c is written */
+  if (d != NULL) {
+    status = mp_mod_2d (a, b, &rem);
+    if (status != MP_OKAY) {
+      goto done;
+    }
+  }
+
+  status = mp_copy (a, c);
+  if (status != MP_OKAY) {
+    goto done;
+  }
+
+  /* whole digits first */
+  if (b >= (int)DIGIT_BIT) {
+    mp_rshd (c, b / DIGIT_BIT);
+  }
+
+  /* then the remaining bit count, which is below DIGIT_BIT */
+  bits = (mp_digit) (b % DIGIT_BIT);
+  if (bits != 0) {
+    /* selects the bits that fall out of the bottom of each digit */
+    const mp_digit low_mask = (((mp_digit)1) << bits) - 1;
+
+    carry = 0;
+    for (idx = c->used - 1; idx >= 0; idx--) {
+      /* remember the bits leaving this digit */
+      saved = c->dp[idx] & low_mask;
+
+      /* shift the digit and bring in the bits that left the digit above */
+      c->dp[idx] = (c->dp[idx] >> bits) | (carry << (DIGIT_BIT - bits));
+
+      carry = saved;
+    }
+  }
+  mp_clamp (c);
+
+  /* publish the remainder; the old contents of d are freed with the temp */
+  if (d != NULL) {
+    mp_exch (&rem, d);
+  }
+
+  /* status is still MP_OKAY from the successful copy above */
+done:
+  mp_clear (&rem);
+  return status;
+}
+
+/* End: bn_mp_div_2d.c */
+
+/* Start: bn_mp_div_d.c */
+#line 0 "bn_mp_div_d.c"
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is library that provides for multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library is designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+#include <tommath.h>
+
+/* single digit division: c = a / b, *d = remainder
+ *
+ * The digit is loaded into a temporary mp_int and the general mp_div is
+ * used.  When d is not NULL it receives the low digit of the remainder
+ * temporary (also when mp_div reported an error).  Returns the error from
+ * mp_init or mp_div, MP_OKAY otherwise.
+ */
+int
+mp_div_d (mp_int * a, mp_digit b, mp_int * c, mp_digit * d)
+{
+  mp_int  divisor, rem;
+  int     status;
+
+  status = mp_init (&divisor);
+  if (status == MP_OKAY) {
+    status = mp_init (&rem);
+    if (status == MP_OKAY) {
+      mp_set (&divisor, b);
+      status = mp_div (a, &divisor, c, &rem);
+
+      /* set remainder if not null */
+      if (d != NULL) {
+        *d = rem.dp[0];
+      }
+      mp_clear (&rem);
+    }
+    mp_clear (&divisor);
+  }
+  return status;
+}
+
+/* End: bn_mp_div_d.c */
+
+/* Start: bn_mp_dr_is_modulus.c */
+#line 0 "bn_mp_dr_is_modulus.c"
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is library that provides for multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library is designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+#include <tommath.h>
+
+/* determines if a number is a valid DR modulus
+ *
+ * Returns 1 when a has at least two digits and every digit above the
+ * lowest one equals MP_MASK, 0 otherwise.
+ */
+int mp_dr_is_modulus(mp_int *a)
+{
+   int pos;
+
+   /* must be at least two digits */
+   if (a->used < 2) {
+      return 0;
+   }
+
+   /* every digit except digit 0 has to be all ones */
+   pos = a->used;
+   while (--pos >= 1) {
+       if (a->dp[pos] != MP_MASK) {
+          return 0;
+       }
+   }
+   return 1;
+}
+
+
+/* End: bn_mp_dr_is_modulus.c */
+
+/* Start: bn_mp_dr_reduce.c */
+#line 0 "bn_mp_dr_reduce.c"
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is library that provides for multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library is designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+#include <tommath.h>
+
+/* reduce "a" in place modulo "b" using the Diminished Radix algorithm.
+ *
+ * Based on algorithm from the paper
+ *
+ * "Generating Efficient Primes for Discrete Log Cryptosystems"
+ *                 Chae Hoon Lim, Pil Loong Lee,
+ *          POSTECH Information Research Laboratories
+ *
+ * The modulus must be of a special format [see manual]
+ *
+ * a   value to reduce, overwritten with the residue
+ * b   the modulus
+ * mp  digit each high digit of a is multiplied by before being folded
+ *     into the low half (presumably the value produced by mp_dr_setup
+ *     for this modulus -- confirm against callers)
+ *
+ * Returns MP_OKAY, or the error code from mp_grow / s_mp_sub.
+ *
+ * NOTE(review): digits of a between a->used and 2k-1 are read below without
+ * being cleared here -- this assumes unused digits are kept at zero; confirm.
+ */
+int
+mp_dr_reduce (mp_int * a, mp_int * b, mp_digit mp)
+{
+  int     err, i, j, k;
+  mp_word r;
+  mp_digit mu, *tmpj, *tmpi;
+
+  /* k = digits in modulus */
+  k = b->used;
+
+  /* ensure that "a" has at least 2k digits */
+  if (a->alloc < k + k) {
+    if ((err = mp_grow (a, k + k)) != MP_OKAY) {
+      return err;
+    }
+  }
+
+  /* alias for a->dp[i], starting at the top digit 2k-1 */
+  tmpi = a->dp + k + k - 1;
+
+  /* for (i = 2k - 1; i >= k; i = i - 1)
+   *
+   * This is the main loop of the reduction.  Note that at the end
+   * the words above position k are not zeroed as expected.  The end
+   * result is that the digits from 0 to k-1 are the residue.  So
+   * we have to clear those afterwards.
+   */
+  for (i = k + k - 1; i >= k; i = i - 1) {
+    /* x[i - 1 : i - k] += x[i]*mp */
+
+    /* x[i] * mp (tmpi then steps down to the next lower digit) */
+    r = ((mp_word) *tmpi--) * ((mp_word) mp);
+
+    /* now add r to x[i-1:i-k]
+     *
+     * First add it to the first digit x[i-k] then form the carry
+     * then enter the main loop
+     */
+    j = i - k;
+
+    /* alias for a->dp[j] */
+    tmpj = a->dp + j;
+
+    /* add digit (low DIGIT_BIT bits of the product) */
+    *tmpj += (mp_digit)(r & MP_MASK);
+
+    /* this is the carry: high part of the product plus the overflow of
+     * the addition above, which sits above bit DIGIT_BIT of the digit
+     */
+    mu = (r >> ((mp_word) DIGIT_BIT)) + (*tmpj >> DIGIT_BIT);
+
+    /* clear carry from a->dp[j]  */
+    *tmpj++ &= MP_MASK;
+
+    /* now add rest of the digits
+     *
+     * Note this is basically a simple single digit addition to
+     * a larger multiple digit number.  This is optimized somewhat
+     * because the propagation of carries is not likely to move
+     * more than a few digits.
+     *
+     */
+    for (++j; mu != 0 && j <= (i - 1); ++j) {
+      *tmpj   += mu;
+      mu       = *tmpj >> DIGIT_BIT;
+      *tmpj++ &= MP_MASK;
+    }
+
+    /* if final carry: the sum ran past digit i-1 */
+    if (mu != 0) {
+      /* add mp to this to correct, starting again from digit i-k */
+      j = i - k;
+      tmpj = a->dp + j;
+
+      *tmpj += mp;
+      mu = *tmpj >> DIGIT_BIT;
+      *tmpj++ &= MP_MASK;
+
+      /* now handle carries */
+      for (++j; mu != 0 && j <= (i - 1); j++) {
+          *tmpj   += mu;
+          mu       = *tmpj >> DIGIT_BIT;
+          *tmpj++ &= MP_MASK;
+      }
+    }
+  }
+
+  /* zero words above k */
+  tmpi = a->dp + k;
+  for (i = k; i < a->used; i++) {
+      *tmpi++ = 0;
+  }
+
+  /* clamp, sub and return */
+  mp_clamp (a);
+
+  /* if a >= b [b == modulus] then subtract the modulus to fix up */
+  if (mp_cmp_mag (a, b) != MP_LT) {
+    return s_mp_sub (a, b, a);
+  }
+  return MP_OKAY;
+}
+
+
+
+
+/* End: bn_mp_dr_reduce.c */
+
+/* Start: bn_mp_dr_setup.c */
+#line 0 "bn_mp_dr_setup.c"
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is library that provides for multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library is designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+#include <tommath.h>
+
+/* Computes the setup digit used by the diminished radix (DR) reduction:
+ * *d = 2^DIGIT_BIT - a->dp[0], truncated to a single mp_digit.
+ */
+void mp_dr_setup(mp_int *a, mp_digit *d)
+{
+   /* The arithmetic is carried out in mp_word: when DIGIT_BIT is one less
+    * than the width of an mp_digit [e.g. DIGIT_BIT==31] the intermediate
+    * values need the wider type, hence the casts.
+    */
+   mp_word radix, low_digit;
+
+   radix     = ((mp_word)1) << ((mp_word)DIGIT_BIT);
+   low_digit = (mp_word)a->dp[0];
+
+   *d = (mp_digit)(radix - low_digit);
+}
+
+
+/* End: bn_mp_dr_setup.c */
+
+/* Start: bn_mp_exch.c */
+#line 0 "bn_mp_exch.c"
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is library that provides for multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library is designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+#include <tommath.h>
+
+/* Exchange the contents of two integers by swapping the mp_int structures
+ * themselves.  Intended for callers that cannot simply swap the mp_int
+ * pointers around.
+ */
+void
+mp_exch (mp_int * a, mp_int * b)
+{
+  mp_int  saved_a = *a;
+
+  *a = *b;
+  *b = saved_a;
+}
+
+/* End: bn_mp_exch.c */
+
+/* Start: bn_mp_expt_d.c */
+#line 0 "bn_mp_expt_d.c"
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is library that provides for multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library is designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+#include <tommath.h>
+
+/* Computes c = a^b with a left-to-right square-and-multiply pass over all
+ * DIGIT_BIT bits of the single digit exponent b.
+ *
+ * Returns MP_OKAY on success, otherwise the error code of the step that
+ * failed.  The private copy of the base is released on every path.
+ */
+int
+mp_expt_d (mp_int * a, mp_digit b, mp_int * c)
+{
+  mp_int  base;
+  int     err, bit;
+
+  /* work on a copy of the base so that c may be overwritten freely */
+  if ((err = mp_init_copy (&base, a)) != MP_OKAY) {
+    return err;
+  }
+
+  /* the accumulator starts at one */
+  mp_set (c, 1);
+
+  err = MP_OKAY;
+  for (bit = 0; bit < (int) DIGIT_BIT; bit++) {
+    /* always square ... */
+    if ((err = mp_sqr (c, c)) != MP_OKAY) {
+      break;
+    }
+
+    /* ... and multiply when the current top exponent bit is set */
+    if ((b & (mp_digit) (((mp_digit)1) << (DIGIT_BIT - 1))) != 0) {
+      if ((err = mp_mul (c, &base, c)) != MP_OKAY) {
+        break;
+      }
+    }
+
+    /* move the next exponent bit into the top position */
+    b <<= 1;
+  }
+
+  /* common exit: err is MP_OKAY here unless a step above failed */
+  mp_clear (&base);
+  return err;
+}
+
+/* End: bn_mp_expt_d.c */
+
+/* Start: bn_mp_exptmod.c */
+#line 0 "bn_mp_exptmod.c"
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is library that provides for multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library is designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+#include <tommath.h>
+
+static int f_mp_exptmod (mp_int * G, mp_int * X, mp_int * P, mp_int * Y);
+
+/* Front end for modular exponentiation, Y = G^X mod P.
+ *
+ * This is only a dispatcher that hands the work to either the generic
+ * routine (f_mp_exptmod) or the fast routine (mp_exptmod_fast).  The call
+ * to the fast code used to be embedded in the generic function, but that
+ * wasted a lot of stack space for nothing (since 99% of the time the fast
+ * code would be the one called).
+ *
+ * Returns MP_VAL when P is negative; otherwise the result of the routine
+ * that was dispatched to (or of a failing setup step).
+ */
+int
+mp_exptmod (mp_int * G, mp_int * X, mp_int * P, mp_int * Y)
+{
+  int use_dr;
+
+  /* a negative modulus is rejected */
+  if (P->sign == MP_NEG) {
+     return MP_VAL;
+  }
+
+  /* negative exponent: G^X == (1/G)^|X| (mod P), so invert and recurse */
+  if (X->sign == MP_NEG) {
+     mp_int invG, absX;
+     int status;
+
+     /* invG = 1/G mod P */
+     if ((status = mp_init(&invG)) != MP_OKAY) {
+        return status;
+     }
+     if ((status = mp_invmod(G, P, &invG)) != MP_OKAY) {
+        mp_clear(&invG);
+        return status;
+     }
+
+     /* absX = |X| */
+     if ((status = mp_init(&absX)) != MP_OKAY) {
+        mp_clear(&invG);
+        return status;
+     }
+     if ((status = mp_abs(X, &absX)) == MP_OKAY) {
+        /* the recursive call sees a non-negative exponent */
+        status = mp_exptmod(&invG, &absX, P, Y);
+     }
+
+     /* both temporaries are live here whether or not the steps succeeded */
+     mp_clear_multi(&invG, &absX, NULL);
+     return status;
+  }
+
+  /* the fast routine is used for moduli of more than four digits that are
+   * either odd or of the diminished radix form; everything else goes to
+   * the generic routine
+   */
+  use_dr = mp_dr_is_modulus(P);
+  if (P->used > 4 && (mp_isodd (P) == 1 || use_dr == 1)) {
+    return mp_exptmod_fast (G, X, P, Y, use_dr);
+  }
+  return f_mp_exptmod (G, X, P, Y);
+}
+
+/* Generic modular exponentiation, Y = G^X mod P.
+ *
+ * Uses a left-to-right sliding window whose width is chosen from the bit
+ * length of X, with Barrett reduction (mu is prepared by mp_reduce_setup).
+ *
+ * Returns MP_OKAY on success or the error code of the first step that
+ * failed.  All temporaries (the M table, mu and res) are released on every
+ * exit path.
+ */
+static int
+f_mp_exptmod (mp_int * G, mp_int * X, mp_int * P, mp_int * Y)
+{
+  mp_int  M[256], res, mu;
+  mp_digit buf;
+  int     err, bitbuf, bitcpy, bitcnt, mode, digidx, x, y, winsize;
+
+  /* find window size */
+  x = mp_count_bits (X);
+  if (x <= 7) {
+    winsize = 2;
+  } else if (x <= 36) {
+    winsize = 3;
+  } else if (x <= 140) {
+    winsize = 4;
+  } else if (x <= 450) {
+    winsize = 5;
+  } else if (x <= 1303) {
+    winsize = 6;
+  } else if (x <= 3529) {
+    winsize = 7;
+  } else {
+    winsize = 8;
+  }
+
+#ifdef MP_LOW_MEM
+    if (winsize > 5) {
+       winsize = 5;
+    }
+#endif
+
+  /* init M array; on failure release the entries created so far */
+  for (x = 0; x < (1 << winsize); x++) {
+    if ((err = mp_init_size (&M[x], 1)) != MP_OKAY) {
+      for (y = 0; y < x; y++) {
+        mp_clear (&M[y]);
+      }
+      return err;
+    }
+  }
+
+  /* create mu, used for Barrett reduction */
+  if ((err = mp_init (&mu)) != MP_OKAY) {
+    goto __M;
+  }
+  if ((err = mp_reduce_setup (&mu, P)) != MP_OKAY) {
+    goto __MU;
+  }
+
+  /* create M table
+   *
+   * The M table contains powers of the input base, e.g. M[x] = G^x mod P
+   *
+   * The first half of the table is not computed though, except for M[0] and M[1]
+   */
+  if ((err = mp_mod (G, P, &M[1])) != MP_OKAY) {
+    goto __MU;
+  }
+
+  /* compute the value at M[1<<(winsize-1)] by squaring M[1] (winsize-1) times */
+  if ((err = mp_copy (&M[1], &M[1 << (winsize - 1)])) != MP_OKAY) {
+    goto __MU;
+  }
+
+  for (x = 0; x < (winsize - 1); x++) {
+    if ((err = mp_sqr (&M[1 << (winsize - 1)], &M[1 << (winsize - 1)])) != MP_OKAY) {
+      goto __MU;
+    }
+    if ((err = mp_reduce (&M[1 << (winsize - 1)], P, &mu)) != MP_OKAY) {
+      goto __MU;
+    }
+  }
+
+  /* create upper table */
+  for (x = (1 << (winsize - 1)) + 1; x < (1 << winsize); x++) {
+    if ((err = mp_mul (&M[x - 1], &M[1], &M[x])) != MP_OKAY) {
+      goto __MU;
+    }
+    if ((err = mp_reduce (&M[x], P, &mu)) != MP_OKAY) {
+      goto __MU;
+    }
+  }
+
+  /* setup result; from here on every failure must leave through __RES */
+  if ((err = mp_init (&res)) != MP_OKAY) {
+    goto __MU;
+  }
+  mp_set (&res, 1);
+
+  /* set initial mode and bit cnt */
+  mode   = 0;
+  bitcnt = 1;
+  buf    = 0;
+  digidx = X->used - 1;
+  bitcpy = bitbuf = 0;
+
+  for (;;) {
+    /* grab next digit as required */
+    if (--bitcnt == 0) {
+      if (digidx == -1) {
+        break;
+      }
+      buf = X->dp[digidx--];
+      bitcnt = (int) DIGIT_BIT;
+    }
+
+    /* grab the next msb from the exponent */
+    y = (buf >> (mp_digit)(DIGIT_BIT - 1)) & 1;
+    buf <<= (mp_digit)1;
+
+    /* if the bit is zero and mode == 0 then we ignore it
+     * These represent the leading zero bits before the first 1 bit
+     * in the exponent.  Technically this opt is not required but it
+     * does lower the # of trivial squaring/reductions used
+     */
+    if (mode == 0 && y == 0)
+      continue;
+
+    /* if the bit is zero and mode == 1 then we square */
+    if (mode == 1 && y == 0) {
+      if ((err = mp_sqr (&res, &res)) != MP_OKAY) {
+        goto __RES;
+      }
+      if ((err = mp_reduce (&res, P, &mu)) != MP_OKAY) {
+        goto __RES;
+      }
+      continue;
+    }
+
+    /* else we add it to the window */
+    bitbuf |= (y << (winsize - ++bitcpy));
+    mode = 2;
+
+    if (bitcpy == winsize) {
+      /* ok window is filled so square as required and multiply  */
+      /* square first */
+      for (x = 0; x < winsize; x++) {
+        if ((err = mp_sqr (&res, &res)) != MP_OKAY) {
+          goto __RES;
+        }
+        if ((err = mp_reduce (&res, P, &mu)) != MP_OKAY) {
+          goto __RES;
+        }
+      }
+
+      /* then multiply; res is live here, so errors must go through __RES
+       * (jumping to __MU would skip mp_clear (&res) and leak it)
+       */
+      if ((err = mp_mul (&res, &M[bitbuf], &res)) != MP_OKAY) {
+        goto __RES;
+      }
+      if ((err = mp_reduce (&res, P, &mu)) != MP_OKAY) {
+        goto __RES;
+      }
+
+      /* empty window and reset */
+      bitcpy = bitbuf = 0;
+      mode = 1;
+    }
+  }
+
+  /* if bits remain then square/multiply */
+  if (mode == 2 && bitcpy > 0) {
+    /* square then multiply if the bit is set */
+    for (x = 0; x < bitcpy; x++) {
+      if ((err = mp_sqr (&res, &res)) != MP_OKAY) {
+        goto __RES;
+      }
+      if ((err = mp_reduce (&res, P, &mu)) != MP_OKAY) {
+        goto __RES;
+      }
+
+      bitbuf <<= 1;
+      if ((bitbuf & (1 << winsize)) != 0) {
+        /* then multiply */
+        if ((err = mp_mul (&res, &M[1], &res)) != MP_OKAY) {
+          goto __RES;
+        }
+        if ((err = mp_reduce (&res, P, &mu)) != MP_OKAY) {
+          goto __RES;
+        }
+      }
+    }
+  }
+
+  /* hand the result to the caller; res then holds Y's old contents,
+   * which the cleanup below releases
+   */
+  mp_exch (&res, Y);
+  err = MP_OKAY;
+__RES:mp_clear (&res);
+__MU:mp_clear (&mu);
+__M:
+  for (x = 0; x < (1 << winsize); x++) {
+    mp_clear (&M[x]);
+  }
+  return err;
+}
+
+/* End: bn_mp_exptmod.c */
+
+/* Start: bn_mp_exptmod_fast.c */
+#line 0 "bn_mp_exptmod_fast.c"
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is library that provides for multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library is designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+#include <tommath.h>
+
+/* computes Y == G^X mod P, HAC pp.616, Algorithm 14.85
+ *
+ * Uses a left-to-right k-ary sliding window to compute the modular exponentiation.
+ * The value of k changes based on the size of the exponent.
+ *
+ * Uses Montgomery or Diminished Radix reduction [whichever appropriate]:
+ * redmode == 0 selects Montgomery reduction, any other value selects the
+ * diminished radix reduction.
+ *
+ * Returns MP_OKAY on success or the error code of the first step that
+ * failed.  The M table and res are released on every exit path.
+ */
+int
+mp_exptmod_fast (mp_int * G, mp_int * X, mp_int * P, mp_int * Y, int redmode)
+{
+  mp_int  M[256], res;
+  mp_digit buf, mp;
+  int     err, bitbuf, bitcpy, bitcnt, mode, digidx, x, y, winsize;
+  int     (*redux)(mp_int*,mp_int*,mp_digit);
+
+  /* find window size */
+  x = mp_count_bits (X);
+  if (x <= 7) {
+    winsize = 2;
+  } else if (x <= 36) {
+    winsize = 3;
+  } else if (x <= 140) {
+    winsize = 4;
+  } else if (x <= 450) {
+    winsize = 5;
+  } else if (x <= 1303) {
+    winsize = 6;
+  } else if (x <= 3529) {
+    winsize = 7;
+  } else {
+    winsize = 8;
+  }
+
+#ifdef MP_LOW_MEM
+  if (winsize > 5) {
+     winsize = 5;
+  }
+#endif
+
+
+  /* init M array; on failure release the entries created so far */
+  for (x = 0; x < (1 << winsize); x++) {
+    if ((err = mp_init (&M[x])) != MP_OKAY) {
+      for (y = 0; y < x; y++) {
+        mp_clear (&M[y]);
+      }
+      return err;
+    }
+  }
+
+  if (redmode == 0) {
+     /* now setup montgomery  */
+     if ((err = mp_montgomery_setup (P, &mp)) != MP_OKAY) {
+        goto __M;
+     }
+     
+     /* automatically pick the comba one if available (saves quite a few calls/ifs) */
+     if ( ((P->used * 2 + 1) < MP_WARRAY) &&
+          P->used < (1 << ((CHAR_BIT * sizeof (mp_word)) - (2 * DIGIT_BIT)))) {
+        redux = fast_mp_montgomery_reduce;
+     } else {
+        /* use slower baseline method */
+        redux = mp_montgomery_reduce;
+     }
+  } else {
+     /* setup DR reduction */
+     mp_dr_setup(P, &mp);
+     redux = mp_dr_reduce;
+  }
+
+  /* setup result; res does not exist yet if this fails, so only the
+   * M table may be released (jumping to __RES would clear an
+   * uninitialized mp_int)
+   */
+  if ((err = mp_init (&res)) != MP_OKAY) {
+    goto __M;
+  }
+
+  /* create M table
+   *
+   * The M table contains powers of the input base, e.g. M[x] = G^x mod P
+   *
+   * The first half of the table is not computed though, except for M[0] and M[1]
+   */
+
+  if (redmode == 0) {
+     /* now we need R mod m */
+     if ((err = mp_montgomery_calc_normalization (&res, P)) != MP_OKAY) {
+       goto __RES;
+     }
+
+     /* now set M[1] to G * R mod m */
+     if ((err = mp_mulmod (G, &res, P, &M[1])) != MP_OKAY) {
+       goto __RES;
+     }
+  } else {
+     mp_set(&res, 1);
+     if ((err = mp_mod(G, P, &M[1])) != MP_OKAY) {
+        goto __RES;
+     }
+  }
+
+  /* compute the value at M[1<<(winsize-1)] by squaring M[1] (winsize-1) times */
+  if ((err = mp_copy (&M[1], &M[1 << (winsize - 1)])) != MP_OKAY) {
+    goto __RES;
+  }
+
+  for (x = 0; x < (winsize - 1); x++) {
+    if ((err = mp_sqr (&M[1 << (winsize - 1)], &M[1 << (winsize - 1)])) != MP_OKAY) {
+      goto __RES;
+    }
+    if ((err = redux (&M[1 << (winsize - 1)], P, mp)) != MP_OKAY) {
+      goto __RES;
+    }
+  }
+
+  /* create upper table */
+  for (x = (1 << (winsize - 1)) + 1; x < (1 << winsize); x++) {
+    if ((err = mp_mul (&M[x - 1], &M[1], &M[x])) != MP_OKAY) {
+      goto __RES;
+    }
+    if ((err = redux (&M[x], P, mp)) != MP_OKAY) {
+      goto __RES;
+    }
+  }
+
+  /* set initial mode and bit cnt */
+  mode   = 0;
+  bitcnt = 1;
+  buf    = 0;
+  digidx = X->used - 1;
+  bitcpy = bitbuf = 0;
+
+  for (;;) {
+    /* grab next digit as required */
+    if (--bitcnt == 0) {
+      if (digidx == -1) {
+        break;
+      }
+      buf = X->dp[digidx--];
+      bitcnt = (int) DIGIT_BIT;
+    }
+
+    /* grab the next msb from the exponent */
+    y = (mp_digit)(buf >> (DIGIT_BIT - 1)) & 1;
+    buf <<= (mp_digit)1;
+
+    /* if the bit is zero and mode == 0 then we ignore it
+     * These represent the leading zero bits before the first 1 bit
+     * in the exponent.  Technically this opt is not required but it
+     * does lower the # of trivial squaring/reductions used
+     */
+    if (mode == 0 && y == 0) {
+      continue;
+    }
+
+    /* if the bit is zero and mode == 1 then we square */
+    if (mode == 1 && y == 0) {
+      if ((err = mp_sqr (&res, &res)) != MP_OKAY) {
+        goto __RES;
+      }
+      if ((err = redux (&res, P, mp)) != MP_OKAY) {
+        goto __RES;
+      }
+      continue;
+    }
+
+    /* else we add it to the window */
+    bitbuf |= (y << (winsize - ++bitcpy));
+    mode = 2;
+
+    if (bitcpy == winsize) {
+      /* ok window is filled so square as required and multiply  */
+      /* square first */
+      for (x = 0; x < winsize; x++) {
+        if ((err = mp_sqr (&res, &res)) != MP_OKAY) {
+          goto __RES;
+        }
+        if ((err = redux (&res, P, mp)) != MP_OKAY) {
+          goto __RES;
+        }
+      }
+
+      /* then multiply */
+      if ((err = mp_mul (&res, &M[bitbuf], &res)) != MP_OKAY) {
+        goto __RES;
+      }
+      if ((err = redux (&res, P, mp)) != MP_OKAY) {
+        goto __RES;
+      }
+
+      /* empty window and reset */
+      bitcpy = bitbuf = 0;
+      mode = 1;
+    }
+  }
+
+  /* if bits remain then square/multiply */
+  if (mode == 2 && bitcpy > 0) {
+    /* square then multiply if the bit is set */
+    for (x = 0; x < bitcpy; x++) {
+      if ((err = mp_sqr (&res, &res)) != MP_OKAY) {
+        goto __RES;
+      }
+      if ((err = redux (&res, P, mp)) != MP_OKAY) {
+        goto __RES;
+      }
+
+      bitbuf <<= 1;
+      if ((bitbuf & (1 << winsize)) != 0) {
+        /* then multiply */
+        if ((err = mp_mul (&res, &M[1], &res)) != MP_OKAY) {
+          goto __RES;
+        }
+        if ((err = redux (&res, P, mp)) != MP_OKAY) {
+          goto __RES;
+        }
+      }
+    }
+  }
+
+  if (redmode == 0) {
+     /* fixup result: one more Montgomery reduction on the accumulated value */
+     if ((err = mp_montgomery_reduce (&res, P, mp)) != MP_OKAY) {
+       goto __RES;
+     }
+  }
+
+  /* hand the result to the caller; res then holds Y's old contents,
+   * which the cleanup below releases
+   */
+  mp_exch (&res, Y);
+  err = MP_OKAY;
+__RES:mp_clear (&res);
+__M:
+  for (x = 0; x < (1 << winsize); x++) {
+    mp_clear (&M[x]);
+  }
+  return err;
+}
+
+/* End: bn_mp_exptmod_fast.c */
+
+/* Start: bn_mp_gcd.c */
+#line 0 "bn_mp_gcd.c"
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is library that provides for multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library is designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+#include <tommath.h>
+
+/* Greatest Common Divisor using the binary method [Algorithm B, page 338, vol2 of TAOCP]
+ *
+ * Computes c = gcd(a, b).  If exactly one input is zero the other one is
+ * copied to c; if both are zero c is set to 1.  Otherwise the magnitude of
+ * the gcd is computed and c takes the common sign of a and b when their
+ * signs agree, or MP_ZPOS when they differ.
+ *
+ * Returns MP_OKAY on success or the error code of the step that failed.
+ * The temporaries u, v and t are released on every exit path.
+ */
+int
+mp_gcd (mp_int * a, mp_int * b, mp_int * c)
+{
+  mp_int  u, v, t;
+  int     k, res, neg;
+
+  /* if either is zero then the gcd is the other one */
+  if (mp_iszero (a) == 1 && mp_iszero (b) == 0) {
+    return mp_copy (b, c);
+  }
+  if (mp_iszero (a) == 0 && mp_iszero (b) == 1) {
+    return mp_copy (a, c);
+  }
+  if (mp_iszero (a) == 1 && mp_iszero (b) == 1) {
+    mp_set (c, 1);
+    return MP_OKAY;
+  }
+
+  /* if both are negative they share (-1) as a common divisor */
+  neg = (a->sign == b->sign) ? a->sign : MP_ZPOS;
+
+  /* mp_init_copy does not release its target when the copy step fails,
+   * so a failed init_copy is still routed through mp_clear for that
+   * variable.
+   * NOTE(review): this relies on mp_clear tolerating an mp_int whose
+   * allocation failed (dp == NULL), as the original code already did
+   * for v -- confirm against mp_clear.
+   */
+  if ((res = mp_init_copy (&u, a)) != MP_OKAY) {
+    goto __U;
+  }
+
+  if ((res = mp_init_copy (&v, b)) != MP_OKAY) {
+    /* u is live at this point and must be released as well */
+    goto __V;
+  }
+
+  /* must be positive for the remainder of the algorithm */
+  u.sign = v.sign = MP_ZPOS;
+
+  if ((res = mp_init (&t)) != MP_OKAY) {
+    goto __V;
+  }
+
+  /* B1.  Find power of two */
+  k = 0;
+  while (mp_iseven(&u) == 1 && mp_iseven(&v) == 1) {
+    ++k;
+    if ((res = mp_div_2 (&u, &u)) != MP_OKAY) {
+      goto __T;
+    }
+    if ((res = mp_div_2 (&v, &v)) != MP_OKAY) {
+      goto __T;
+    }
+  }
+
+  /* B2.  Initialize */
+  if (mp_isodd(&u) == 1) {
+    /* t = -v */
+    if ((res = mp_copy (&v, &t)) != MP_OKAY) {
+      goto __T;
+    }
+    t.sign = MP_NEG;
+  } else {
+    /* t = u */
+    if ((res = mp_copy (&u, &t)) != MP_OKAY) {
+      goto __T;
+    }
+  }
+
+  do {
+    /* B3 (and B4).  Halve t, if even */
+    while (t.used != 0 && mp_iseven(&t) == 1) {
+      if ((res = mp_div_2 (&t, &t)) != MP_OKAY) {
+        goto __T;
+      }
+    }
+
+    /* B5.  if t>0 then u=t otherwise v=-t */
+    if (t.used != 0 && t.sign != MP_NEG) {
+      if ((res = mp_copy (&t, &u)) != MP_OKAY) {
+        goto __T;
+      }
+    } else {
+      if ((res = mp_copy (&t, &v)) != MP_OKAY) {
+        goto __T;
+      }
+      v.sign = (v.sign == MP_ZPOS) ? MP_NEG : MP_ZPOS;
+    }
+
+    /* B6.  t = u - v, if t != 0 loop otherwise terminate */
+    if ((res = mp_sub (&u, &v, &t)) != MP_OKAY) {
+      goto __T;
+    }
+  } while (mp_iszero(&t) == 0);
+
+  /* multiply by 2^k which we divided out at the beginning */ 
+  if ((res = mp_mul_2d (&u, k, &u)) != MP_OKAY) {
+    goto __T;
+  }
+
+  /* hand the result to the caller; u then holds c's old contents, which
+   * the cleanup below releases
+   */
+  mp_exch (&u, c);
+  c->sign = neg;
+  res = MP_OKAY;
+
+  /* each label releases the variable it is named after and falls through
+   * to the ones created before it
+   */
+__T:mp_clear (&t);
+__V:mp_clear (&v);
+__U:mp_clear (&u);
+  return res;
+}
+
+/* End: bn_mp_gcd.c */
+
+/* Start: bn_mp_grow.c */
+#line 0 "bn_mp_grow.c"
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is library that provides for multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library is designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+#include <tommath.h>
+
+/* Grow the digit buffer of a so that it can hold at least size digits.
+ *
+ * Nothing happens when a->alloc is already >= size.  Otherwise the buffer
+ * is reallocated to a padded size and the newly added digits are zeroed.
+ *
+ * Returns MP_OKAY on success, or MP_MEM if the reallocation fails; in that
+ * case a is left untouched (same buffer, same alloc) and remains usable.
+ */
+int
+mp_grow (mp_int * a, int size)
+{
+  int       i;
+  mp_digit *tmp;
+
+  /* if the alloc size is smaller alloc more ram */
+  if (a->alloc < size) {
+    /* ensure there are always at least MP_PREC digits extra on top */
+    size += (MP_PREC * 2) - (size & (MP_PREC - 1));
+
+    /* realloc into a temporary: on failure realloc returns NULL but
+     * leaves the old block allocated, so overwriting a->dp directly
+     * would leak it and leave a pointing at nothing
+     */
+    tmp = OPT_CAST realloc (a->dp, sizeof (mp_digit) * size);
+    if (tmp == NULL) {
+      return MP_MEM;
+    }
+    a->dp = tmp;
+
+    /* zero excess digits */
+    i        = a->alloc;
+    a->alloc = size;
+    for (; i < a->alloc; i++) {
+      a->dp[i] = 0;
+    }
+  }
+  return MP_OKAY;
+}
+
+/* End: bn_mp_grow.c */
+
+/* Start: bn_mp_init.c */
+#line 0 "bn_mp_init.c"
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is library that provides for multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library is designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with 
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+#include <tommath.h>
+
+/* Initialise a new bigint: allocates MP_PREC zeroed digits and sets the
+ * value to a positive zero (used == 0, sign == MP_ZPOS).
+ *
+ * Returns MP_MEM if the allocation fails (a->dp is then NULL and the other
+ * fields are not written), MP_OKAY otherwise.
+ */
+int
+mp_init (mp_int * a)
+{
+  /* calloc hands back cleared storage, so no explicit zeroing is needed */
+  a->dp = OPT_CAST calloc (sizeof (mp_digit), MP_PREC);
+
+  if (a->dp != NULL) {
+    /* no digits in use, default precision allocated, positive sign */
+    a->used  = 0;
+    a->alloc = MP_PREC;
+    a->sign  = MP_ZPOS;
+    return MP_OKAY;
+  }
+
+  return MP_MEM;
+}
+
+/* End: bn_mp_init.c */
+
+/* Start: bn_mp_init_copy.c */
+#line 0 "bn_mp_init_copy.c"
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is library that provides for multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library is designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+#include <tommath.h>
+
+/* Initialises "a" with mp_init and then copies b into it.
+ *
+ * Returns the error of whichever step failed, MP_OKAY otherwise.  Note that
+ * when mp_init succeeds but mp_copy fails, "a" is NOT cleared here: it stays
+ * initialised and releasing it is left to the caller.
+ */
+int
+mp_init_copy (mp_int * a, mp_int * b)
+{
+  int     err = mp_init (a);
+
+  if (err == MP_OKAY) {
+    err = mp_copy (b, a);
+  }
+  return err;
+}
+
+/* End: bn_mp_init_copy.c */
+
+/* Start: bn_mp_init_size.c */
+#line 0 "bn_mp_init_size.c"
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is library that provides for multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library is designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+#include <tommath.h>
+
+/* Initialise an mp_int with room for at least size digits.
+ *
+ * The requested size is padded so there are always extra digits on top.
+ * The value is set to a positive zero (used == 0, sign == MP_ZPOS).
+ *
+ * Returns MP_MEM if the allocation fails (a->dp is then NULL and the other
+ * fields are not written), MP_OKAY otherwise.
+ */
+int
+mp_init_size (mp_int * a, int size)
+{
+  /* padded digit count actually requested from the allocator */
+  int padded = size + ((MP_PREC * 2) - (size & (MP_PREC - 1)));
+
+  /* calloc hands back zeroed digits */
+  a->dp = OPT_CAST calloc (sizeof (mp_digit), padded);
+
+  if (a->dp != NULL) {
+    a->used  = 0;
+    a->alloc = padded;
+    a->sign  = MP_ZPOS;
+    return MP_OKAY;
+  }
+
+  return MP_MEM;
+}
+
+/* End: bn_mp_init_size.c */
+
+/* Start: bn_mp_invmod.c */
+#line 0 "bn_mp_invmod.c"
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is library that provides for multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library is designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+#include <tommath.h>
+
+/* Computes c = 1/|a| (mod b) with the binary extended Euclidean algorithm
+ * (the step numbers below follow the algorithm the comments were written
+ * against).
+ *
+ * An odd modulus is delegated to fast_mp_invmod.  For an even modulus the
+ * result is reduced into the range [0, b) before it is stored in c.
+ *
+ * Returns MP_VAL if b is negative or zero, if |a| and b are both even, or
+ * if no inverse exists (gcd != 1); otherwise MP_OKAY or the error code of
+ * the step that failed.  All temporaries are released on every exit path.
+ */
+int
+mp_invmod (mp_int * a, mp_int * b, mp_int * c)
+{
+  mp_int  x, y, u, v, A, B, C, D;
+  int     res;
+
+  /* b cannot be negative.  A zero modulus has no inverse either, and if
+   * zero tests as even the halving loop in step 5 could never terminate,
+   * so it is rejected up front as well.
+   */
+  if (b->sign == MP_NEG || mp_iszero (b) == 1) {
+    return MP_VAL;
+  }
+
+  /* if the modulus is odd we can use a faster routine instead */
+  if (mp_iseven (b) == 0) {
+    return fast_mp_invmod (a, b, c);
+  }
+  
+  /* init temps */
+  if ((res = mp_init_multi(&x, &y, &u, &v, &A, &B, &C, &D, NULL)) != MP_OKAY) {
+     return res;
+  }
+
+  /* x = |a|, y = b */
+  if ((res = mp_abs (a, &x)) != MP_OKAY) {
+    goto __ERR;
+  }
+  if ((res = mp_copy (b, &y)) != MP_OKAY) {
+    goto __ERR;
+  }
+
+  /* 2. [modified] if x,y are both even then return an error! */
+  if (mp_iseven (&x) == 1 && mp_iseven (&y) == 1) {
+    res = MP_VAL;
+    goto __ERR;
+  }
+
+  /* 3. u=x, v=y, A=1, B=0, C=0,D=1 */
+  if ((res = mp_copy (&x, &u)) != MP_OKAY) {
+    goto __ERR;
+  }
+  if ((res = mp_copy (&y, &v)) != MP_OKAY) {
+    goto __ERR;
+  }
+  mp_set (&A, 1);
+  mp_set (&D, 1);
+
+  /* steps 4-6 are repeated until u becomes zero */
+  do {
+    /* 4.  while u is even do */
+    while (mp_iseven (&u) == 1) {
+      /* 4.1 u = u/2 */
+      if ((res = mp_div_2 (&u, &u)) != MP_OKAY) {
+        goto __ERR;
+      }
+      /* 4.2 if A or B is odd then */
+      if (mp_iseven (&A) == 0 || mp_iseven (&B) == 0) {
+        /* A = (A+y)/2, B = (B-x)/2 */
+        if ((res = mp_add (&A, &y, &A)) != MP_OKAY) {
+          goto __ERR;
+        }
+        if ((res = mp_sub (&B, &x, &B)) != MP_OKAY) {
+          goto __ERR;
+        }
+      }
+      /* A = A/2, B = B/2 */
+      if ((res = mp_div_2 (&A, &A)) != MP_OKAY) {
+        goto __ERR;
+      }
+      if ((res = mp_div_2 (&B, &B)) != MP_OKAY) {
+        goto __ERR;
+      }
+    }
+
+    /* 5.  while v is even do */
+    while (mp_iseven (&v) == 1) {
+      /* 5.1 v = v/2 */
+      if ((res = mp_div_2 (&v, &v)) != MP_OKAY) {
+        goto __ERR;
+      }
+      /* 5.2 if C or D is odd then */
+      if (mp_iseven (&C) == 0 || mp_iseven (&D) == 0) {
+        /* C = (C+y)/2, D = (D-x)/2 */
+        if ((res = mp_add (&C, &y, &C)) != MP_OKAY) {
+          goto __ERR;
+        }
+        if ((res = mp_sub (&D, &x, &D)) != MP_OKAY) {
+          goto __ERR;
+        }
+      }
+      /* C = C/2, D = D/2 */
+      if ((res = mp_div_2 (&C, &C)) != MP_OKAY) {
+        goto __ERR;
+      }
+      if ((res = mp_div_2 (&D, &D)) != MP_OKAY) {
+        goto __ERR;
+      }
+    }
+
+    /* 6.  if u >= v then */
+    if (mp_cmp (&u, &v) != MP_LT) {
+      /* u = u - v, A = A - C, B = B - D */
+      if ((res = mp_sub (&u, &v, &u)) != MP_OKAY) {
+        goto __ERR;
+      }
+
+      if ((res = mp_sub (&A, &C, &A)) != MP_OKAY) {
+        goto __ERR;
+      }
+
+      if ((res = mp_sub (&B, &D, &B)) != MP_OKAY) {
+        goto __ERR;
+      }
+    } else {
+      /* v = v - u, C = C - A, D = D - B */
+      if ((res = mp_sub (&v, &u, &v)) != MP_OKAY) {
+        goto __ERR;
+      }
+
+      if ((res = mp_sub (&C, &A, &C)) != MP_OKAY) {
+        goto __ERR;
+      }
+
+      if ((res = mp_sub (&D, &B, &D)) != MP_OKAY) {
+        goto __ERR;
+      }
+    }
+
+    /* if u is not zero go back to step 4 */
+  } while (mp_iszero (&u) == 0);
+
+  /* now C*x + D*y == v, and v is the gcd of x and y */
+
+  /* if v != 1 then there is no inverse */
+  if (mp_cmp_d (&v, 1) != MP_EQ) {
+    res = MP_VAL;
+    goto __ERR;
+  }
+
+  /* C is only congruent to the inverse; the coefficient produced by the
+   * algorithm may be negative or not smaller than the modulus, so bring
+   * it into the range [0, b) before handing it out
+   */
+  while (C.sign == MP_NEG) {
+    if ((res = mp_add (&C, b, &C)) != MP_OKAY) {
+      goto __ERR;
+    }
+  }
+  while (mp_cmp_mag (&C, b) != MP_LT) {
+    if ((res = mp_sub (&C, b, &C)) != MP_OKAY) {
+      goto __ERR;
+    }
+  }
+
+  /* C is now the inverse */
+  mp_exch (&C, c);
+  res = MP_OKAY;
+
+__ERR:mp_clear_multi (&x, &y, &u, &v, &A, &B, &C, &D, NULL);
+  return res;
+}
+
+/* End: bn_mp_invmod.c */
+
+/* Start: bn_mp_jacobi.c */
+#line 0 "bn_mp_jacobi.c"
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is library that provides for multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library is designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+#include <tommath.h>
+
+/* Computes the Jacobi symbol c = (a | n) (the Legendre symbol if n is prime)
+ * HAC pp. 73 Algorithm 2.149
+ *
+ * The symbol is written to *c.  Returns MP_OKAY on success or the error
+ * code of the step that failed; the temporaries are released on every path
+ * taken after they have been created.
+ */
+int
+mp_jacobi (mp_int * a, mp_int * n, int *c)
+{
+  mp_int  odd_part, n_red, twos;
+  int     sym, sub_sym, err;
+  mp_digit rem;
+
+  /* step 1.  (0 | n) is 0 */
+  if (mp_iszero (a) == 1) {
+    *c = 0;
+    return MP_OKAY;
+  }
+
+  /* step 2.  (1 | n) is 1 */
+  if (mp_cmp_d (a, 1) == MP_EQ) {
+    *c = 1;
+    return MP_OKAY;
+  }
+
+  /* the symbol stays 0 unless one of the cases below sets it */
+  sym = 0;
+
+  /* step 3.  split a = odd_part * 2^twos */
+  if ((err = mp_init_copy (&odd_part, a)) != MP_OKAY) {
+    return err;
+  }
+
+  if ((err = mp_init (&n_red)) != MP_OKAY) {
+    goto __A1;
+  }
+
+  if ((err = mp_init (&twos)) != MP_OKAY) {
+    goto __N1;
+  }
+
+  /* strip factors of two one at a time, counting them in twos */
+  while (mp_iseven (&odd_part) == 1) {
+    if ((err = mp_add_d (&twos, 1, &twos)) != MP_OKAY) {
+      goto __E;
+    }
+
+    if ((err = mp_div_2 (&odd_part, &odd_part)) != MP_OKAY) {
+      goto __E;
+    }
+  }
+
+  /* step 4.  an even power of two contributes +1 ... */
+  if (mp_iseven (&twos) == 1) {
+    sym = 1;
+  } else {
+    /* ... an odd power contributes +1 if n = 1/7 (mod 8)
+     * and -1 if n = 3/5 (mod 8)
+     */
+    if ((err = mp_mod_d (n, 8, &rem)) != MP_OKAY) {
+      goto __E;
+    }
+
+    switch (rem) {
+    case 1:
+    case 7:
+      sym = 1;
+      break;
+    case 3:
+    case 5:
+      sym = -1;
+      break;
+    default:
+      /* any other residue leaves the symbol at its default */
+      break;
+    }
+  }
+
+  /* step 5.  flip the sign if n == 3 (mod 4) *and* odd_part == 3 (mod 4) */
+  if ((err = mp_mod_d (n, 4, &rem)) != MP_OKAY) {
+    goto __E;
+  }
+  if (rem == 3) {
+    if ((err = mp_mod_d (&odd_part, 4, &rem)) != MP_OKAY) {
+      goto __E;
+    }
+    if (rem == 3) {
+      sym = -sym;
+    }
+  }
+
+  if (mp_cmp_d (&odd_part, 1) == MP_EQ) {
+    /* odd_part == 1: nothing left to recurse on */
+    *c = sym;
+  } else {
+    /* otherwise recurse on (n mod odd_part | odd_part) */
+    if ((err = mp_mod (n, &odd_part, &n_red)) != MP_OKAY) {
+      goto __E;
+    }
+    if ((err = mp_jacobi (&n_red, &odd_part, &sub_sym)) != MP_OKAY) {
+      goto __E;
+    }
+    *c = sym * sub_sym;
+  }
+
+  /* success; fall through the cleanup chain */
+  err = MP_OKAY;
+__E:mp_clear (&twos);
+__N1:mp_clear (&n_red);
+__A1:mp_clear (&odd_part);
+  return err;
+}
+
+/* End: bn_mp_jacobi.c */
+
+/* Start: bn_mp_karatsuba_mul.c */
+#line 0 "bn_mp_karatsuba_mul.c"
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is library that provides for multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library is designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+#include <tommath.h>
+
+/* c = |a| * |b| using Karatsuba Multiplication using three half size multiplications
+ *
+ * Let B represent the radix [e.g. 2**DIGIT_BIT] and let n represent half of the number of digits in the min(a,b)
+ *
+ * a = a1 * B^n + a0
+ * b = b1 * B^n + b0
+ *
+ * Then, a * b => a1b1 * B^2n + (a0b0 + a1b1 - (a1 - a0)(b1 - b0)) * B^n + a0b0
+ *
+ * Note that a1b1 and a0b0 are used twice and only need to be computed once.  So in total
+ * three half size (half # of digit) multiplications are performed, a0b0, a1b1 and (a1-a0)(b1-b0)
+ *
+ * Note that a multiplication of half the digits requires 1/4th the number of single precision 
+ * multiplications so in total after one call 25% of the single precision multiplications are saved.
+ * Note also that the call to mp_mul can end up back in this function if the a0, a1, b0, or b1 are above
+ * the threshold.  This is known as divide-and-conquer and leads to the famous O(N^lg(3)) or O(N^1.584) work which
+ * is asymptotically lower than the standard O(N^2) that the baseline/comba methods use.  Generally though the 
+ * overhead of this method doesn't pay off until a certain size (N ~ 80) is reached.
+ */
+int
+mp_karatsuba_mul (mp_int * a, mp_int * b, mp_int * c)
+{
+  mp_int  x0, x1, y0, y1, t1, x0y0, x1y1;
+  int     B, err;
+
+  /* Returns MP_OKAY on success.  err starts out as MP_MEM and is only
+   * overwritten once every step has succeeded, so any failing step below
+   * (whatever its own error code) is reported to the caller as MP_MEM.
+   */
+  err = MP_MEM;
+
+  /* min # of digits */
+  B = MIN (a->used, b->used);
+
+  /* now divide in two */
+  B = B / 2;
+
+  /* init the split halves: x0/y0 hold the low B digits, x1/y1 the rest.
+   * Each failure jumps to the label that clears only what was already
+   * initialised.
+   */
+  if (mp_init_size (&x0, B) != MP_OKAY)
+    goto ERR;
+  if (mp_init_size (&x1, a->used - B) != MP_OKAY)
+    goto X0;
+  if (mp_init_size (&y0, B) != MP_OKAY)
+    goto X1;
+  if (mp_init_size (&y1, b->used - B) != MP_OKAY)
+    goto Y0;
+
+  /* init temps */
+  if (mp_init_size (&t1, B * 2) != MP_OKAY)
+    goto Y1;
+  if (mp_init_size (&x0y0, B * 2) != MP_OKAY)
+    goto T1;
+  if (mp_init_size (&x1y1, B * 2) != MP_OKAY)
+    goto X0Y0;
+
+  /* the halves inherit the sign of the number they were split from */
+  x0.sign = x1.sign = a->sign;
+  y0.sign = y1.sign = b->sign;
+
+  /* set the digit counts up front; the digits themselves are copied below */
+  x0.used = y0.used = B;
+  x1.used = a->used - B;
+  y1.used = b->used - B;
+
+  {
+    register int x;
+    register mp_digit *tmpa, *tmpb, *tmpx, *tmpy;
+
+    /* we copy the digits directly instead of using higher level functions
+     * since we also need to shift the digits
+     */
+    tmpa = a->dp;
+    tmpb = b->dp;
+
+    /* low B digits of a and b go to x0 and y0 */
+    tmpx = x0.dp;
+    tmpy = y0.dp;
+    for (x = 0; x < B; x++) {
+      *tmpx++ = *tmpa++;
+      *tmpy++ = *tmpb++;
+    }
+
+    /* remaining high digits of a go to x1 (tmpa continues where it stopped) */
+    tmpx = x1.dp;
+    for (x = B; x < a->used; x++) {
+      *tmpx++ = *tmpa++;
+    }
+
+    /* remaining high digits of b go to y1 */
+    tmpy = y1.dp;
+    for (x = B; x < b->used; x++) {
+      *tmpy++ = *tmpb++;
+    }
+  }
+
+  /* only need to clamp the lower words since by definition the upper words x1/y1 must
+   * have a known number of digits
+   */
+  mp_clamp (&x0);
+  mp_clamp (&y0);
+
+  /* now calc the products x0y0 and x1y1 */
+  if (mp_mul (&x0, &y0, &x0y0) != MP_OKAY)  /* x0 is still read by the first mp_sub below; after that it is reused as scratch [x0==t2] */
+    goto X1Y1;          /* x0y0 = x0*y0 */
+  if (mp_mul (&x1, &y1, &x1y1) != MP_OKAY)
+    goto X1Y1;          /* x1y1 = x1*y1 */
+
+  /* now calc x1-x0 and y1-y0 */
+  if (mp_sub (&x1, &x0, &t1) != MP_OKAY)
+    goto X1Y1;          /* t1 = x1 - x0 */
+  if (mp_sub (&y1, &y0, &x0) != MP_OKAY)
+    goto X1Y1;          /* t2 = y1 - y0 (stored in x0) */
+  if (mp_mul (&t1, &x0, &t1) != MP_OKAY)
+    goto X1Y1;          /* t1 = (x1 - x0) * (y1 - y0) */
+
+  /* middle term: x0y0 + x1y1 - (x1-x0)*(y1-y0) */
+  if (mp_add (&x0y0, &x1y1, &x0) != MP_OKAY)
+    goto X1Y1;          /* t2 = x0y0 + x1y1 */
+  if (mp_sub (&x0, &t1, &t1) != MP_OKAY)
+    goto X1Y1;          /* t1 = x0y0 + x1y1 - (x1-x0)*(y1-y0) */
+
+  /* shift by B */
+  if (mp_lshd (&t1, B) != MP_OKAY)
+    goto X1Y1;          /* t1 = (x0y0 + x1y1 - (x1-x0)*(y1-y0))<<B */
+  if (mp_lshd (&x1y1, B * 2) != MP_OKAY)
+    goto X1Y1;          /* x1y1 = x1y1 << 2*B */
+
+  /* sum the three aligned terms into the result */
+  if (mp_add (&x0y0, &t1, &t1) != MP_OKAY)
+    goto X1Y1;          /* t1 = x0y0 + t1 */
+  if (mp_add (&t1, &x1y1, c) != MP_OKAY)
+    goto X1Y1;          /* c = x0y0 + t1 + x1y1 */
+
+  err = MP_OKAY;
+
+  /* cleanup ladder: falls through from the most recently initialised temp
+   * down to the first one
+   */
+X1Y1:mp_clear (&x1y1);
+X0Y0:mp_clear (&x0y0);
+T1:mp_clear (&t1);
+Y1:mp_clear (&y1);
+Y0:mp_clear (&y0);
+X1:mp_clear (&x1);
+X0:mp_clear (&x0);
+ERR:
+  return err;
+}
+
+/* End: bn_mp_karatsuba_mul.c */
+
+/* Start: bn_mp_karatsuba_sqr.c */
+#line 0 "bn_mp_karatsuba_sqr.c"
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is library that provides for multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library is designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+#include <tommath.h>
+
+/* Karatsuba squaring, computes b = a*a using three half size squarings
+ *
+ * See comments of mp_karatsuba_mul for details.  It is essentially the same algorithm
+ * but merely tuned to perform recursive squarings.
+ */
+int
+mp_karatsuba_sqr (mp_int * a, mp_int * b)
+{
+  mp_int  x0, x1, t1, t2, x0x0, x1x1;
+  int     B, err;
+
+  /* Returns MP_OKAY on success.  err starts out as MP_MEM and is only
+   * overwritten once every step has succeeded, so any failing step below
+   * is reported to the caller as MP_MEM.
+   */
+  err = MP_MEM;
+
+  /* # of digits in the input */
+  B = a->used;
+
+  /* now divide in two */
+  B = B / 2;
+
+  /* init the split halves: x0 holds the low B digits, x1 the rest */
+  if (mp_init_size (&x0, B) != MP_OKAY)
+    goto ERR;
+  if (mp_init_size (&x1, a->used - B) != MP_OKAY)
+    goto X0;
+
+  /* init temps */
+  if (mp_init_size (&t1, a->used * 2) != MP_OKAY)
+    goto X1;
+  if (mp_init_size (&t2, a->used * 2) != MP_OKAY)
+    goto T1;
+  if (mp_init_size (&x0x0, B * 2) != MP_OKAY)
+    goto T2;
+  if (mp_init_size (&x1x1, (a->used - B) * 2) != MP_OKAY)
+    goto X0X0;
+
+  {
+    register int x;
+    register mp_digit *dst, *src;
+
+    src = a->dp;
+
+    /* low B digits of a go to x0 */
+    dst = x0.dp;
+    for (x = 0; x < B; x++) {
+      *dst++ = *src++;
+    }
+
+    /* remaining high digits go to x1 (src continues where it stopped) */
+    dst = x1.dp;
+    for (x = B; x < a->used; x++) {
+      *dst++ = *src++;
+    }
+  }
+
+  x0.used = B;
+  x1.used = a->used - B;
+
+  /* only the low half can have leading zero digits */
+  mp_clamp (&x0);
+
+  /* now calc the products x0*x0 and x1*x1 */
+  if (mp_sqr (&x0, &x0x0) != MP_OKAY)
+    goto X1X1;                  /* x0x0 = x0*x0 */
+  if (mp_sqr (&x1, &x1x1) != MP_OKAY)
+    goto X1X1;                  /* x1x1 = x1*x1 */
+
+  /* now calc (x1-x0)^2 */
+  if (mp_sub (&x1, &x0, &t1) != MP_OKAY)
+    goto X1X1;                  /* t1 = x1 - x0 */
+  if (mp_sqr (&t1, &t1) != MP_OKAY)
+    goto X1X1;                  /* t1 = (x1 - x0) * (x1 - x0) */
+
+  /* middle term: x0x0 + x1x1 - (x1-x0)^2 */
+  if (s_mp_add (&x0x0, &x1x1, &t2) != MP_OKAY)
+    goto X1X1;                  /* t2 = x0x0 + x1x1 */
+  if (mp_sub (&t2, &t1, &t1) != MP_OKAY)
+    goto X1X1;                  /* t1 = x0x0 + x1x1 - (x1-x0)^2 */
+
+  /* shift by B */
+  if (mp_lshd (&t1, B) != MP_OKAY)
+    goto X1X1;                  /* t1 = (x0x0 + x1x1 - (x1-x0)^2)<<B */
+  if (mp_lshd (&x1x1, B * 2) != MP_OKAY)
+    goto X1X1;                  /* x1x1 = x1x1 << 2*B */
+
+  /* sum the three aligned terms into the result */
+  if (mp_add (&x0x0, &t1, &t1) != MP_OKAY)
+    goto X1X1;                  /* t1 = x0x0 + t1 */
+  if (mp_add (&t1, &x1x1, b) != MP_OKAY)
+    goto X1X1;                  /* b = x0x0 + t1 + x1x1 */
+
+  err = MP_OKAY;
+
+  /* cleanup ladder: falls through from the most recently initialised temp
+   * down to the first one
+   */
+X1X1:mp_clear (&x1x1);
+X0X0:mp_clear (&x0x0);
+T2:mp_clear (&t2);
+T1:mp_clear (&t1);
+X1:mp_clear (&x1);
+X0:mp_clear (&x0);
+ERR:
+  return err;
+}
+
+/* End: bn_mp_karatsuba_sqr.c */
+
+/* Start: bn_mp_lcm.c */
+#line 0 "bn_mp_lcm.c"
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is library that provides for multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library is designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+#include <tommath.h>
+
+/* computes least common multiple as |a*b|/(a, b)
+ *
+ * a, b: inputs (not modified unless aliased by c)
+ * c:    receives the result, always stored as non-negative
+ *
+ * Returns MP_OKAY on success or the error code of the first failing
+ * step (init, multiply, gcd or divide).
+ */
+int
+mp_lcm (mp_int * a, mp_int * b, mp_int * c)
+{
+  int     res;
+  mp_int  t;
+
+
+  if ((res = mp_init (&t)) != MP_OKAY) {
+    return res;
+  }
+
+  /* t = a * b; done before the gcd so that c may alias a or b */
+  if ((res = mp_mul (a, b, &t)) != MP_OKAY) {
+    goto __T;
+  }
+
+  /* c = (a, b) */
+  if ((res = mp_gcd (a, b, c)) != MP_OKAY) {
+    goto __T;
+  }
+
+  /* c = a * b / (a, b) */
+  if ((res = mp_div (&t, c, c, NULL)) != MP_OKAY) {
+    goto __T;
+  }
+
+  /* the product carries the combined sign of a and b, but a least common
+   * multiple is non-negative by definition, so drop the sign
+   */
+  c->sign = MP_ZPOS;
+
+__T:mp_clear (&t);
+  return res;
+}
+
+/* End: bn_mp_lcm.c */
+
+/* Start: bn_mp_lshd.c */
+#line 0 "bn_mp_lshd.c"
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is library that provides for multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library is designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+#include <tommath.h>
+
+/* shift left a certain amount of digits
+ *
+ * a: value shifted in place (multiplied by radix^b)
+ * b: number of whole digits to shift by; b <= 0 is a no-op
+ *
+ * Returns MP_OKAY, or the error from mp_grow if a cannot be enlarged.
+ */
+int
+mp_lshd (mp_int * a, int b)
+{
+  int     ix, res;
+
+  /* nothing to do for a non-positive shift count */
+  if (b <= 0) {
+    return MP_OKAY;
+  }
+
+  /* zero shifted by any amount is still zero.  Returning here keeps
+   * a->used == 0 instead of leaving b leading zero digits marked as used.
+   */
+  if (a->used == 0) {
+    return MP_OKAY;
+  }
+
+  /* grow to fit the new digits */
+  if (a->alloc < a->used + b) {
+     if ((res = mp_grow (a, a->used + b)) != MP_OKAY) {
+       return res;
+     }
+  }
+
+  /* increment the used by the shift amount then copy upwards */
+  a->used += b;
+
+  /* move every original digit up by b places, starting at the top so
+   * that no digit is overwritten before it has been copied
+   */
+  for (ix = a->used - 1; ix >= b; ix--) {
+    a->dp[ix] = a->dp[ix - b];
+  }
+
+  /* zero the lower digits */
+  for (ix = 0; ix < b; ix++) {
+    a->dp[ix] = 0;
+  }
+
+  return MP_OKAY;
+}
+
+/* End: bn_mp_lshd.c */
+
+/* Start: bn_mp_mod.c */
+#line 0 "bn_mp_mod.c"
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is library that provides for multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library is designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+#include <tommath.h>
+
+/* c = a mod b, 0 <= c < b
+ *
+ * Divides a by b, keeping only the remainder.  A negative remainder is
+ * folded back into range by adding b to it.  Returns MP_OKAY or the error
+ * code of the failing init/divide/add.
+ */
+int
+mp_mod (mp_int * a, mp_int * b, mp_int * c)
+{
+  mp_int  rem;
+  int     err;
+
+  if ((err = mp_init (&rem)) != MP_OKAY) {
+    return err;
+  }
+
+  err = mp_div (a, b, NULL, &rem);
+  if (err == MP_OKAY) {
+    if (rem.sign == MP_NEG) {
+      /* bring the negative remainder back into range */
+      err = mp_add (b, &rem, c);
+    } else {
+      /* hand the remainder over to c; the old contents of c are
+       * released with rem below
+       */
+      mp_exch (&rem, c);
+    }
+  }
+
+  mp_clear (&rem);
+  return err;
+}
+
+/* End: bn_mp_mod.c */
+
+/* Start: bn_mp_mod_2d.c */
+#line 0 "bn_mp_mod_2d.c"
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is library that provides for multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library is designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+#include <tommath.h>
+
+/* calc a value mod 2^b
+ *
+ * a: input
+ * b: bit count of the modulus; b <= 0 yields c = 0
+ * c: receives a mod 2^b
+ *
+ * Returns MP_OKAY or the error code from mp_copy.
+ */
+int
+mp_mod_2d (mp_int * a, int b, mp_int * c)
+{
+  int     x, res;
+
+
+  /* if b is <= 0 then zero the int */
+  if (b <= 0) {
+    mp_zero (c);
+    return MP_OKAY;
+  }
+
+  /* if the modulus covers every used digit of a then a is already reduced.
+   * The comparison must be >=: with b == a->used * DIGIT_BIT the masking
+   * code below would index c->dp[c->used], one past the used digits.
+   */
+  if (b >= (int) (a->used * DIGIT_BIT)) {
+    res = mp_copy (a, c);
+    return res;
+  }
+
+  /* copy */
+  if ((res = mp_copy (a, c)) != MP_OKAY) {
+    return res;
+  }
+
+  /* zero digits above the last digit of the modulus */
+  for (x = (b / DIGIT_BIT) + ((b % DIGIT_BIT) == 0 ? 0 : 1); x < c->used; x++) {
+    c->dp[x] = 0;
+  }
+  /* clear the digit that is not completely outside/inside the modulus */
+  c->dp[b / DIGIT_BIT] &=
+    (mp_digit) ((((mp_digit) 1) << (((mp_digit) b) % DIGIT_BIT)) - ((mp_digit) 1));
+  mp_clamp (c);
+  return MP_OKAY;
+}
+
+/* End: bn_mp_mod_2d.c */
+
+/* Start: bn_mp_mod_d.c */
+#line 0 "bn_mp_mod_d.c"
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is library that provides for multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library is designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+#include <tommath.h>
+
+/* c = a mod b for a single digit b
+ *
+ * a: input
+ * b: single-digit modulus
+ * c: receives the residue; left untouched on failure
+ *
+ * A negative remainder is folded back into range by adding b.
+ * Returns MP_OKAY or the error code of the failing init/divide/add.
+ */
+int
+mp_mod_d (mp_int * a, mp_digit b, mp_digit * c)
+{
+  mp_int  t, t2;
+  int     res;
+
+
+  if ((res = mp_init (&t)) != MP_OKAY) {
+    return res;
+  }
+
+  if ((res = mp_init (&t2)) != MP_OKAY) {
+    goto __T;
+  }
+
+  /* t = b, t2 = remainder of a / t.  The result of the division must be
+   * checked: ignoring it would report success with a meaningless residue.
+   */
+  mp_set (&t, b);
+  if ((res = mp_div (a, &t, NULL, &t2)) != MP_OKAY) {
+    goto __T2;
+  }
+
+  if (t2.sign == MP_NEG) {
+    if ((res = mp_add_d (&t2, b, &t2)) != MP_OKAY) {
+      goto __T2;
+    }
+  }
+  *c = t2.dp[0];
+  res = MP_OKAY;
+
+__T2:mp_clear (&t2);
+__T:mp_clear (&t);
+  return res;
+}
+
+/* End: bn_mp_mod_d.c */
+
+/* Start: bn_mp_montgomery_calc_normalization.c */
+#line 0 "bn_mp_montgomery_calc_normalization.c"
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is library that provides for multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library is designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+#include <tommath.h>
+
+/* calculates a = B^n mod b for Montgomery reduction
+ * Where B is the base [e.g. 2^DIGIT_BIT].
+ * B^n mod b is computed by first computing
+ * A = B^(n-1) which doesn't require a reduction but a simple OR.
+ * then C = A * B = B^n is computed by performing upto DIGIT_BIT
+ * shifts with subtractions when the result is greater than b.
+ *
+ * The method is slightly modified to shift B unconditionally upto just under
+ * the leading bit of b.  This saves alot of multiple precision shifting.
+ */
+int
+mp_montgomery_calc_normalization (mp_int * a, mp_int * b)
+{
+  int     x, bits, res;
+
+  /* how many bits of last digit does b use */
+  bits = mp_count_bits (b) % DIGIT_BIT;
+
+  if (b->used > 1) {
+    /* compute A = B^(n-1) * 2^(bits-1) */
+    if ((res = mp_2expt (a, (b->used - 1) * DIGIT_BIT + bits - 1)) != MP_OKAY) {
+      return res;
+    }
+  } else {
+    /* single digit modulus: when its top digit is completely full, bits is
+     * zero and the exponent above would be -1.  Start from A = 1 instead
+     * and perform all DIGIT_BIT doublings in the loop below.
+     */
+    mp_set (a, 1);
+    bits = 1;
+  }
+
+  /* now compute C = A * B mod b by repeated doubling, subtracting b
+   * whenever the running value is no longer smaller than b
+   */
+  for (x = bits - 1; x < (int)DIGIT_BIT; x++) {
+    if ((res = mp_mul_2 (a, a)) != MP_OKAY) {
+      return res;
+    }
+    if (mp_cmp_mag (a, b) != MP_LT) {
+      if ((res = s_mp_sub (a, b, a)) != MP_OKAY) {
+        return res;
+      }
+    }
+  }
+
+  return MP_OKAY;
+}
+
+/* End: bn_mp_montgomery_calc_normalization.c */
+
+/* Start: bn_mp_montgomery_reduce.c */
+#line 0 "bn_mp_montgomery_reduce.c"
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is library that provides for multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library is designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+#include <tommath.h>
+
+/* computes a = a * R^-1 (mod m) in place via Montgomery Reduction
+ *
+ * a:  value to reduce; overwritten with the result
+ * m:  modulus
+ * mp: the per-modulus digit multiplied into each low digit of a
+ *     (presumably the value produced by mp_montgomery_setup -- confirm
+ *     against callers)
+ *
+ * Returns MP_OKAY, the result of the fast variant when that is selected,
+ * or the error code from mp_grow / s_mp_sub.
+ */
+int
+mp_montgomery_reduce (mp_int * a, mp_int * m, mp_digit mp)
+{
+  int     ix, res, digs;
+  mp_digit ui;
+
+  /* can the fast reduction [comba] method be used?
+   *
+   * Note that unlike in mp_mul you're safely allowed *less*
+   * than the available columns [255 per default] since carries
+   * are fixed up in the inner loop.
+   */
+  digs = m->used * 2 + 1;
+  if ((digs < MP_WARRAY)
+      && m->used < (1 << ((CHAR_BIT * sizeof (mp_word)) - (2 * DIGIT_BIT)))) {
+    return fast_mp_montgomery_reduce (a, m, mp);
+  }
+
+  /* grow the input as required */
+  if (a->alloc < m->used * 2 + 1) {
+    if ((res = mp_grow (a, m->used * 2 + 1)) != MP_OKAY) {
+      return res;
+    }
+  }
+  /* work on 2n+1 digits; NOTE(review): this assumes the digits above the
+   * previous a->used are already zero -- confirm
+   */
+  a->used = m->used * 2 + 1;
+
+  /* one pass per digit of the modulus */
+  for (ix = 0; ix < m->used; ix++) {
+    /* ui = ai * m' mod b */
+    ui = (a->dp[ix] * mp) & MP_MASK;
+
+    /* a = a + ui * m * b^i */
+    {
+      register int iy;
+      register mp_digit *tmpx, *tmpy, mu;
+      register mp_word r;
+
+      /* aliases: tmpx walks the modulus, tmpy walks a starting at digit ix */
+      tmpx = m->dp;
+      tmpy = a->dp + ix;
+
+      /* mu is the carry between columns */
+      mu = 0;
+      for (iy = 0; iy < m->used; iy++) {
+        r = ((mp_word) ui) * ((mp_word) * tmpx++) + ((mp_word) mu) + ((mp_word) * tmpy);
+        mu = (r >> ((mp_word) DIGIT_BIT));
+        *tmpy++ = (r & ((mp_word) MP_MASK));
+      }
+      /* propagate carries into the digits above the modulus-sized window */
+      while (mu) {
+        *tmpy += mu;
+        mu = (*tmpy >> DIGIT_BIT) & 1;
+        *tmpy++ &= MP_MASK;
+      }
+    }
+  }
+
+  /* A = A/b^n */
+  mp_rshd (a, m->used);
+
+  /* if A >= m then A = A - m */
+  if (mp_cmp_mag (a, m) != MP_LT) {
+    return s_mp_sub (a, m, a);
+  }
+
+  return MP_OKAY;
+}
+
+/* End: bn_mp_montgomery_reduce.c */
+
+/* Start: bn_mp_montgomery_setup.c */
+#line 0 "bn_mp_montgomery_setup.c"
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is library that provides for multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library is designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+#include <tommath.h>
+
+/* sets up Montgomery reduction for the modulus a
+ *
+ * Stores in *mp the negated inverse of the lowest digit of a, taken
+ * modulo 2^DIGIT_BIT.  Returns MP_VAL when that digit is even (no such
+ * inverse exists), otherwise MP_OKAY.
+ */
+int
+mp_montgomery_setup (mp_int * a, mp_digit * mp)
+{
+  mp_digit inv, low;
+
+/* fast inversion mod 2^k
+ *
+ * Based on the fact that
+ *
+ * XA = 1 (mod 2^n)  =>  (X(2-XA)) A = 1 (mod 2^2n)
+ *                   =>  2*X*A - X*X*A*A = 1
+ *                   =>  2*(1) - (1)     = 1
+ */
+  low = a->dp[0];
+
+  /* an even low digit has no inverse modulo a power of two */
+  if ((low & 1) == 0) {
+    return MP_VAL;
+  }
+
+  /* seed: inv*low == 1 mod 2^4 */
+  inv = (((low + 2) & 4) << 1) + low;
+
+  /* every refinement step doubles the number of correct bits */
+  inv *= 2 - low * inv;         /* inv*low == 1 mod 2^8 */
+#if !defined(MP_8BIT)
+  inv *= 2 - low * inv;         /* inv*low == 1 mod 2^16 */
+#endif
+#if defined(MP_64BIT) || !(defined(MP_8BIT) || defined(MP_16BIT))
+  inv *= 2 - low * inv;         /* inv*low == 1 mod 2^32 */
+#endif
+#ifdef MP_64BIT
+  inv *= 2 - low * inv;         /* inv*low == 1 mod 2^64 */
+#endif
+
+  /* *mp = -1/low mod 2^DIGIT_BIT */
+  *mp = (((mp_digit) 1 << ((mp_digit) DIGIT_BIT)) - inv) & MP_MASK;
+
+  return MP_OKAY;
+}
+
+/* End: bn_mp_montgomery_setup.c */
+
+/* Start: bn_mp_mul.c */
+#line 0 "bn_mp_mul.c"
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is library that provides for multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library is designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+#include <tommath.h>
+
+/* high level multiplication (handles sign)
+ *
+ * c = a * b.  Picks Karatsuba, the fast comba multiplier or the baseline
+ * multiplier depending on the operand sizes.  Returns whatever the
+ * selected multiplier returns.
+ */
+int
+mp_mul (mp_int * a, mp_int * b, mp_int * c)
+{
+  int     res, neg;
+
+  /* work out the result sign first, since c may alias a or b */
+  neg = (a->sign == b->sign) ? MP_ZPOS : MP_NEG;
+  if (MIN (a->used, b->used) > KARATSUBA_MUL_CUTOFF) {
+    res = mp_karatsuba_mul (a, b, c);
+  } else {
+
+    /* can we use the fast multiplier?
+     *
+     * The fast multiplier can be used if the output will have less than
+     * MP_WARRAY digits and the number of digits won't affect carry propagation
+     */
+    int     digs = a->used + b->used + 1;
+
+    if ((digs < MP_WARRAY)
+        && MIN(a->used, b->used) <= (1 << ((CHAR_BIT * sizeof (mp_word)) - (2 * DIGIT_BIT)))) {
+      res = fast_s_mp_mul_digs (a, b, c, digs);
+    } else {
+      res = s_mp_mul (a, b, c);
+    }
+
+  }
+
+  /* a zero product must stay positive: tagging it MP_NEG would create a
+   * "negative zero" that sign-based comparisons treat as different from 0
+   */
+  c->sign = (c->used > 0) ? neg : MP_ZPOS;
+  return res;
+}
+
+/* End: bn_mp_mul.c */
+
+/* Start: bn_mp_mul_2.c */
+#line 0 "bn_mp_mul_2.c"
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is library that provides for multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library is designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+#include <tommath.h>
+
+/* b = a*2
+ *
+ * Shifts every digit of a up by one bit, carrying the top bit of each
+ * digit into the next one.  a and b may be the same object.  Returns
+ * MP_OKAY, or the error from mp_grow if b cannot be enlarged.
+ */
+int
+mp_mul_2 (mp_int * a, mp_int * b)
+{
+  int      ix, err, prev_used;
+  mp_digit carry, next_carry;
+
+  /* make room for one possible extra leading digit */
+  if (b->alloc < a->used + 1) {
+    if ((err = mp_grow (b, a->used + 1)) != MP_OKAY) {
+      return err;
+    }
+  }
+
+  /* remember how many digits b held so stale ones can be wiped later */
+  prev_used = b->used;
+  b->used = a->used;
+
+  carry = 0;
+  for (ix = 0; ix < a->used; ix++) {
+    /* the top bit of this digit becomes the carry into the next digit */
+    next_carry = a->dp[ix] >> ((mp_digit)(DIGIT_BIT - 1));
+
+    /* shift this digit up and bring in the carry from the digit below */
+    b->dp[ix] = ((a->dp[ix] << ((mp_digit)1)) | carry) & MP_MASK;
+
+    carry = next_carry;
+  }
+
+  /* a leftover carry becomes a new leading digit, which is always 1 */
+  if (carry != 0) {
+    b->dp[b->used] = 1;
+    ++b->used;
+  }
+
+  /* wipe digits b used before that this call did not write */
+  for (ix = b->used; ix < prev_used; ix++) {
+    b->dp[ix] = 0;
+  }
+
+  b->sign = a->sign;
+  return MP_OKAY;
+}
+
+/* End: bn_mp_mul_2.c */
+
+/* Start: bn_mp_mul_2d.c */
+#line 0 "bn_mp_mul_2d.c"
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is library that provides for multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library is designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+#include <tommath.h>
+
+/* shift left by a certain bit count
+ *
+ * c = a * 2^b.  Whole digits are moved with mp_lshd, the remaining bit
+ * count (less than DIGIT_BIT) is applied digit by digit, and the final
+ * carry, if any, is stored as a new top digit.  Only the digits actually
+ * in use are processed.
+ *
+ * Returns MP_OKAY or the error code from mp_copy / mp_grow / mp_lshd.
+ */
+int
+mp_mul_2d (mp_int * a, int b, mp_int * c)
+{
+  mp_digit d;
+  int      res;
+
+  /* copy */
+  if (a != c) {
+     if ((res = mp_copy (a, c)) != MP_OKAY) {
+       return res;
+     }
+  }
+
+  /* room for the whole-digit shift plus one digit for the final carry */
+  if (c->alloc < (int)(c->used + b/DIGIT_BIT + 1)) {
+     if ((res = mp_grow (c, c->used + b / DIGIT_BIT + 1)) != MP_OKAY) {
+       return res;
+     }
+  }
+
+  /* shift by as many digits in the bit count */
+  if (b >= (int)DIGIT_BIT) {
+    if ((res = mp_lshd (c, b / DIGIT_BIT)) != MP_OKAY) {
+      return res;
+    }
+  }
+
+  /* shift any bit count < DIGIT_BIT */
+  d = (mp_digit) (b % DIGIT_BIT);
+  if (d != 0) {
+    register mp_digit *tmpc, shift, mask, r, rr;
+    register int x;
+
+    /* bitmask for carries */
+    mask = (((mp_digit)1) << d) - 1;
+
+    /* shift for the bits carried out of each digit */
+    shift = DIGIT_BIT - d;
+
+    /* alias */
+    tmpc = c->dp;
+
+    /* carry */
+    r    = 0;
+    for (x = 0; x < c->used; x++) {
+      /* get the higher bits of the current word */
+      rr = (*tmpc >> shift) & mask;
+
+      /* shift the current word and OR in the carry */
+      *tmpc = ((*tmpc << d) | r) & MP_MASK;
+      ++tmpc;
+
+      /* set the carry to the carry bits of the current word */
+      r = rr;
+    }
+
+    /* the bits shifted out of the top digit form a new leading digit */
+    if (r != 0) {
+      c->dp[(c->used)++] = r;
+    }
+  }
+  mp_clamp (c);
+  return MP_OKAY;
+}
+
+/* End: bn_mp_mul_2d.c */
+
+/* Start: bn_mp_mul_d.c */
+#line 0 "bn_mp_mul_d.c"
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is library that provides for multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library is designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+#include <tommath.h>
+
+/* multiply by a digit
+ *
+ * c = a * b for a single digit b.  c may alias a.  The result takes the
+ * sign of a.  Returns MP_OKAY, or the error from mp_grow if c cannot be
+ * enlarged.
+ */
+int
+mp_mul_d (mp_int * a, mp_digit b, mp_int * c)
+{
+  int     res, pa, olduse;
+
+  /* make sure c is big enough to hold a*b */
+  pa = a->used;
+  if (c->alloc < pa + 1) {
+    if ((res = mp_grow (c, pa + 1)) != MP_OKAY) {
+      return res;
+    }
+  }
+
+  /* get the original destinations used count */
+  olduse = c->used;
+
+  /* set the new temporary used count */
+  c->used = pa + 1;
+
+  /* b is unsigned, so the product has the sign of a */
+  c->sign = a->sign;
+
+  {
+    register mp_digit u, *tmpa, *tmpc;
+    register mp_word r;
+    register int ix;
+
+    /* alias for a->dp [source] */
+    tmpa = a->dp;
+
+    /* alias for c->dp [dest] */
+    tmpc = c->dp;
+
+    /* zero carry */
+    u = 0;
+    for (ix = 0; ix < pa; ix++) {
+      /* compute product and carry sum for this term */
+      r = ((mp_word) u) + ((mp_word) * tmpa++) * ((mp_word) b);
+
+      /* mask off higher bits to get a single digit */
+      *tmpc++ = (mp_digit) (r & ((mp_word) MP_MASK));
+
+      /* send carry into next iteration */
+      u = (mp_digit) (r >> ((mp_word) DIGIT_BIT));
+    }
+    /* store final carry [if any] in digit pa */
+    *tmpc++ = u;
+
+    /* now zero digits above the top.  Digits 0..pa have been written, so
+     * only pa+1..olduse-1 can hold stale data; starting the count at pa
+     * would write one digit past the previously used range.
+     */
+    for (ix = pa + 1; ix < olduse; ix++) {
+       *tmpc++ = 0;
+    }
+  }
+
+  mp_clamp (c);
+  return MP_OKAY;
+}
+
+/* End: bn_mp_mul_d.c */
+
+/* Start: bn_mp_mulmod.c */
+#line 0 "bn_mp_mulmod.c"
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is library that provides for multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library is designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+#include <tommath.h>
+
+/* d = a * b (mod c)
+ *
+ * Forms the full product in a temporary and then reduces it modulo c.
+ * Returns MP_OKAY or the error code of the failing init/multiply/reduce.
+ */
+int
+mp_mulmod (mp_int * a, mp_int * b, mp_int * c, mp_int * d)
+{
+  mp_int  prod;
+  int     err;
+
+  if ((err = mp_init (&prod)) != MP_OKAY) {
+    return err;
+  }
+
+  /* only reduce when the multiplication went through */
+  err = mp_mul (a, b, &prod);
+  if (err == MP_OKAY) {
+    err = mp_mod (&prod, c, d);
+  }
+
+  mp_clear (&prod);
+  return err;
+}
+
+/* End: bn_mp_mulmod.c */
+
+/* Start: bn_mp_multi.c */
+#line 0 "bn_mp_multi.c"
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is library that provides for multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library is designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+#include <tommath.h>
+#include <stdarg.h>
+
+/* initialise a NULL-terminated list of mp_int pointers
+ *
+ * mp, ...: the mp_int objects to initialise; the list must end with NULL
+ *
+ * Returns MP_OKAY when every mp_init succeeded.  On the first failure all
+ * objects initialised so far are cleared again and MP_MEM is returned.
+ */
+int mp_init_multi(mp_int *mp, ...) 
+{
+    mp_err res = MP_OKAY;      /* Assume ok until proven otherwise */
+    int n = 0;                 /* Number of ok inits */
+    mp_int* cur_arg = mp;
+    va_list args;
+
+    va_start(args, mp);        /* init args to next argument from caller */
+    while (cur_arg != NULL) {
+        if (mp_init(cur_arg) != MP_OKAY) {
+            /* Oops - error! Back-track and mp_clear what we already
+               succeeded in init-ing, then return error.
+            */
+            va_list clean_args;
+            
+            /* end the current list */
+            va_end(args);
+            
+            /* now start cleaning up */            
+            cur_arg = mp;
+            va_start(clean_args, mp);
+            while (n--) {
+                mp_clear(cur_arg);
+                cur_arg = va_arg(clean_args, mp_int*);
+            }
+            va_end(clean_args);
+
+            /* return right here: args has already been ended above, and
+               falling out of the loop would call va_end on it a second time
+            */
+            return MP_MEM;
+        }
+        n++;
+        cur_arg = va_arg(args, mp_int*);
+    }
+    va_end(args);
+    return res;                /* every init succeeded */
+}
+
+/* clear a NULL-terminated list of mp_int pointers
+ *
+ * mp, ...: the mp_int objects to clear; the list must end with NULL
+ */
+void mp_clear_multi(mp_int *mp, ...) 
+{
+    mp_int* cur;
+    va_list ap;
+
+    va_start(ap, mp);
+    for (cur = mp; cur != NULL; cur = va_arg(ap, mp_int*)) {
+        mp_clear(cur);
+    }
+    va_end(ap);
+}
+
+/* End: bn_mp_multi.c */
+
+/* Start: bn_mp_n_root.c */
+#line 0 "bn_mp_n_root.c"
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is library that provides for multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library is designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+#include <tommath.h>
+
+/* find the n'th root of an integer
+ *
+ * Result found such that (c)^b <= a and (c+1)^b > a
+ *
+ * This algorithm uses Newton's approximation x[i+1] = x[i] - f(x[i])/f'(x[i])
+ * which will find the root in log(N) time where each step involves a fair bit.  This
+ * is not meant to find huge roots [square and cube at most].
+ *
+ * a  - value to take the root of; its sign is forced positive while the
+ *      function runs and is restored on every path that changed it
+ * b  - which root to take
+ * c  - receives the root, carrying the sign of a
+ *
+ * Returns MP_VAL for an even root of a negative number, otherwise MP_OKAY
+ * or the error code of the first helper that failed.
+ *
+ * NOTE(review): b == 0 is not rejected here although "b - 1" is used as an
+ * exponent below - confirm callers never pass it.
+ */
+int
+mp_n_root (mp_int * a, mp_digit b, mp_int * c)
+{
+  mp_int  t1, t2, t3;
+  int     res, neg;
+
+  /* input must be positive if b is even */
+  if ((b & 1) == 0 && a->sign == MP_NEG) {
+    return MP_VAL;
+  }
+
+  if ((res = mp_init (&t1)) != MP_OKAY) {
+    return res;
+  }
+
+  if ((res = mp_init (&t2)) != MP_OKAY) {
+    goto __T1;
+  }
+
+  if ((res = mp_init (&t3)) != MP_OKAY) {
+    goto __T2;
+  }
+
+  /* if a is negative fudge the sign but keep track */
+  neg = a->sign;
+  a->sign = MP_ZPOS;
+
+  /* t2 = 2 */
+  mp_set (&t2, 2);
+
+  do {
+    /* t1 = t2 */
+    if ((res = mp_copy (&t2, &t1)) != MP_OKAY) {
+      goto __T3;
+    }
+
+    /* t2 = t1 - ((t1^b - a) / (b * t1^(b-1))) */
+    if ((res = mp_expt_d (&t1, b - 1, &t3)) != MP_OKAY) {   /* t3 = t1^(b-1) */
+      goto __T3;
+    }
+
+    /* numerator */
+    if ((res = mp_mul (&t3, &t1, &t2)) != MP_OKAY) {        /* t2 = t1^b */
+      goto __T3;
+    }
+
+    if ((res = mp_sub (&t2, a, &t2)) != MP_OKAY) {          /* t2 = t1^b - a */
+      goto __T3;
+    }
+
+    if ((res = mp_mul_d (&t3, b, &t3)) != MP_OKAY) {        /* t3 = t1^(b-1) * b  */
+      goto __T3;
+    }
+
+    if ((res = mp_div (&t2, &t3, &t3, NULL)) != MP_OKAY) {  /* t3 = (t1^b - a)/(b * t1^(b-1)) */
+      goto __T3;
+    }
+
+    if ((res = mp_sub (&t1, &t3, &t2)) != MP_OKAY) {
+      goto __T3;
+    }
+  }
+  while (mp_cmp (&t1, &t2) != MP_EQ);
+
+  /* result can be off by a few so check */
+  for (;;) {
+    if ((res = mp_expt_d (&t1, b, &t2)) != MP_OKAY) {
+      goto __T3;
+    }
+
+    if (mp_cmp (&t2, a) == MP_GT) {
+      if ((res = mp_sub_d (&t1, 1, &t1)) != MP_OKAY) {
+        goto __T3;
+      }
+    } else {
+      break;
+    }
+  }
+
+  /* set the result */
+  mp_exch (&t1, c);
+
+  /* set the sign of the result */
+  c->sign = neg;
+
+  res = MP_OKAY;
+
+__T3:
+  /* Give a its original sign back.  Every jump to this label happens
+   * after the sign was fudged, so this also covers the error paths,
+   * which previously returned with a left positive.  When c aliases a
+   * this just repeats the assignment made to c above.
+   */
+  a->sign = neg;
+  mp_clear (&t3);
+__T2:mp_clear (&t2);
+__T1:mp_clear (&t1);
+  return res;
+}
+
+/* End: bn_mp_n_root.c */
+
+/* Start: bn_mp_neg.c */
+#line 0 "bn_mp_neg.c"
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is library that provides for multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library is designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+#include <tommath.h>
+
+/* b = -a
+ *
+ * Copies a into b and flips the sign.  Zero keeps the positive sign so
+ * that negating it never yields a zero flagged MP_NEG.
+ *
+ * Returns MP_OKAY, or the error from mp_copy.
+ */
+int
+mp_neg (mp_int * a, mp_int * b)
+{
+  int     res;
+  if ((res = mp_copy (a, b)) != MP_OKAY) {
+    return res;
+  }
+  if (mp_iszero (b) == 0) {
+    /* non-zero: invert the sign of the source */
+    b->sign = (a->sign == MP_ZPOS) ? MP_NEG : MP_ZPOS;
+  } else {
+    /* -0 == 0 */
+    b->sign = MP_ZPOS;
+  }
+  return MP_OKAY;
+}
+
+/* End: bn_mp_neg.c */
+
+/* Start: bn_mp_or.c */
+#line 0 "bn_mp_or.c"
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is library that provides for multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library is designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+#include <tommath.h>
+
+/* c = a | b, computed digit by digit.
+ *
+ * The operand with more used digits is copied into a temporary; the
+ * digits of the other operand are then OR-ed into its low end and the
+ * temporary is swapped into c.
+ *
+ * Returns MP_OKAY, or the error from mp_init_copy.
+ */
+int
+mp_or (mp_int * a, mp_int * b, mp_int * c)
+{
+  mp_int  acc, *longer, *shorter;
+  int     err, i, n;
+
+  /* pick which operand seeds the result and which one is merged in */
+  if (a->used > b->used) {
+    longer  = a;
+    shorter = b;
+  } else {
+    longer  = b;
+    shorter = a;
+  }
+
+  if ((err = mp_init_copy (&acc, longer)) != MP_OKAY) {
+    return err;
+  }
+
+  /* only the digits both operands have need combining */
+  n = shorter->used;
+  for (i = 0; i < n; i++) {
+    acc.dp[i] |= shorter->dp[i];
+  }
+
+  mp_clamp (&acc);
+  mp_exch (c, &acc);
+  mp_clear (&acc);
+  return MP_OKAY;
+}
+
+/* End: bn_mp_or.c */
+
+/* Start: bn_mp_prime_fermat.c */
+#line 0 "bn_mp_prime_fermat.c"
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is library that provides for multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library is designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+#include <tommath.h>
+
+/* Performs a single Fermat test of "a" to the base "b".
+ *
+ * For a prime "a", Fermat's little theorem gives b^a == b (mod a), so
+ * the function computes b^a mod a and compares the outcome with b.
+ *
+ * *result is set to 1 when the two are equal and to 0 otherwise (also
+ * when an error is returned).
+ *
+ * Returns MP_OKAY, or the error from mp_init / mp_exptmod.
+ */
+int
+mp_prime_fermat (mp_int * a, mp_int * b, int *result)
+{
+  mp_int  power;
+  int     err;
+
+  /* pessimistic default */
+  *result = 0;
+
+  if ((err = mp_init (&power)) != MP_OKAY) {
+    return err;
+  }
+
+  /* power = b^a mod a; only compare when that succeeded */
+  err = mp_exptmod (b, a, a, &power);
+  if (err == MP_OKAY && mp_cmp (&power, b) == MP_EQ) {
+    *result = 1;
+  }
+
+  mp_clear (&power);
+  return err;
+}
+
+/* End: bn_mp_prime_fermat.c */
+
+/* Start: bn_mp_prime_is_divisible.c */
+#line 0 "bn_mp_prime_is_divisible.c"
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is library that provides for multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library is designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+#include <tommath.h>
+
+/* Determines whether an integer is divisible by one of the primes in
+ * __prime_tab (PRIME_SIZE entries).
+ *
+ * *result is set to 1 when "a" equals a table prime or leaves a zero
+ * residue modulo one of them, and to 0 otherwise.
+ *
+ * Returns MP_OKAY, or the error from mp_mod_d.
+ */
+int
+mp_prime_is_divisible (mp_int * a, int *result)
+{
+  int      status, i;
+  mp_digit rem;
+
+  /* assume "not divisible" until a hit is found */
+  *result = 0;
+
+  i = 0;
+  while (i < PRIME_SIZE) {
+    /* a itself is this table prime */
+    if (mp_cmp_d (a, __prime_tab[i]) == MP_EQ) {
+      *result = 1;
+      break;
+    }
+
+    /* rem = a mod __prime_tab[i] */
+    status = mp_mod_d (a, __prime_tab[i], &rem);
+    if (status != MP_OKAY) {
+      return status;
+    }
+
+    /* the table prime divides a */
+    if (rem == 0) {
+      *result = 1;
+      break;
+    }
+
+    ++i;
+  }
+
+  return MP_OKAY;
+}
+
+/* End: bn_mp_prime_is_divisible.c */
+
+/* Start: bn_mp_prime_is_prime.c */
+#line 0 "bn_mp_prime_is_prime.c"
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is library that provides for multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library is designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+#include <tommath.h>
+
+/* Probabilistic primality check: table lookup, trial division, then
+ * "t" rounds of Miller-Rabin using the first t entries of __prime_tab
+ * as bases.
+ *
+ * Probability of error after t rounds is no more than
+ * (1/4)^t when 1 <= t <= PRIME_SIZE
+ *
+ * *result is set to 1 if "a" is probably prime and to 0 otherwise.
+ *
+ * Returns MP_VAL when t is outside 1..PRIME_SIZE, MP_OKAY on a completed
+ * test, or the error code of the helper that failed.
+ */
+int
+mp_prime_is_prime (mp_int * a, int t, int *result)
+{
+  mp_int  base;
+  int     round, err, flag;
+
+  /* default to no */
+  *result = 0;
+
+  /* the bases come out of the prime table, so t cannot exceed it */
+  if (t < 1 || t > PRIME_SIZE) {
+    return MP_VAL;
+  }
+
+  /* a table prime is accepted immediately */
+  for (round = 0; round < PRIME_SIZE; round++) {
+    if (mp_cmp_d (a, __prime_tab[round]) == MP_EQ) {
+      *result = 1;
+      return MP_OKAY;
+    }
+  }
+
+  /* anything a table prime divides is rejected */
+  if ((err = mp_prime_is_divisible (a, &flag)) != MP_OKAY) {
+    return err;
+  }
+  if (flag == 1) {
+    return MP_OKAY;
+  }
+
+  /* Miller-Rabin rounds */
+  if ((err = mp_init (&base)) != MP_OKAY) {
+    return err;
+  }
+
+  for (round = 0; round < t; round++) {
+    mp_set (&base, __prime_tab[round]);
+
+    /* stop on an error or on the first base proving a composite */
+    err = mp_prime_miller_rabin (a, &base, &flag);
+    if (err != MP_OKAY || flag == 0) {
+      break;
+    }
+  }
+
+  /* the loop only runs to completion when every round passed */
+  if (round == t) {
+    *result = 1;
+  }
+
+  mp_clear (&base);
+  return err;
+}
+
+/* End: bn_mp_prime_is_prime.c */
+
+/* Start: bn_mp_prime_miller_rabin.c */
+#line 0 "bn_mp_prime_miller_rabin.c"
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is library that provides for multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library is designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+#include <tommath.h>
+
+/* Miller-Rabin test of "a" to the base of "b" as described in
+ * HAC pp. 139 Algorithm 4.24
+ *
+ * Sets result to 0 if definitely composite or 1 if probably prime.
+ * Randomly the chance of error is no more than 1/4 and often
+ * very much lower.
+ *
+ * a == 1 is reported as not prime (result 0, MP_OKAY) without running
+ * the test.
+ *
+ * Returns MP_OKAY when the test ran, or the error code of the helper
+ * that failed (result is then 0).
+ */
+int
+mp_prime_miller_rabin (mp_int * a, mp_int * b, int *result)
+{
+  mp_int  n1, y, r;
+  int     s, j, err;
+
+  /* default */
+  *result = 0;
+
+  /* get n1 = a - 1 */
+  if ((err = mp_init_copy (&n1, a)) != MP_OKAY) {
+    return err;
+  }
+  if ((err = mp_sub_d (&n1, 1, &n1)) != MP_OKAY) {
+    goto __N1;
+  }
+
+  /* a == 1 gives n1 == 0, which cannot be written as 2^s * r with r odd:
+   * halving zero stays zero, so the loop below would never end.  One is
+   * not prime, so leave result at 0; err is MP_OKAY from mp_sub_d.
+   */
+  if (mp_iszero (&n1) != 0) {
+    goto __N1;
+  }
+
+  /* set 2^s * r = n1 */
+  if ((err = mp_init_copy (&r, &n1)) != MP_OKAY) {
+    goto __N1;
+  }
+  s = 0;
+  while (mp_iseven (&r) == 1) {
+    ++s;
+    if ((err = mp_div_2 (&r, &r)) != MP_OKAY) {
+      goto __R;
+    }
+  }
+
+  /* compute y = b^r mod a */
+  if ((err = mp_init (&y)) != MP_OKAY) {
+    goto __R;
+  }
+  if ((err = mp_exptmod (b, &r, a, &y)) != MP_OKAY) {
+    goto __Y;
+  }
+
+  /* if y != 1 and y != n1 do */
+  if (mp_cmp_d (&y, 1) != MP_EQ && mp_cmp (&y, &n1) != MP_EQ) {
+    j = 1;
+    /* while j <= s-1 and y != n1 */
+    while ((j <= (s - 1)) && mp_cmp (&y, &n1) != MP_EQ) {
+      if ((err = mp_sqrmod (&y, a, &y)) != MP_OKAY) {
+        goto __Y;
+      }
+
+      /* if y == 1 then composite */
+      if (mp_cmp_d (&y, 1) == MP_EQ) {
+        goto __Y;
+      }
+
+      ++j;
+    }
+
+    /* if y != n1 then composite */
+    if (mp_cmp (&y, &n1) != MP_EQ) {
+      goto __Y;
+    }
+  }
+
+  /* probably prime now */
+  *result = 1;
+__Y:mp_clear (&y);
+__R:mp_clear (&r);
+__N1:mp_clear (&n1);
+  return err;
+}
+
+/* End: bn_mp_prime_miller_rabin.c */
+
+/* Start: bn_mp_prime_next_prime.c */
+#line 0 "bn_mp_prime_next_prime.c"
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is library that provides for multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library is designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+#include <tommath.h>
+
+/* Advances "a" in place to the next prime above its current value,
+ * testing each odd candidate with mp_prime_is_prime and "t" trials
+ * of Miller-Rabin.
+ *
+ * Returns MP_OKAY once a probable prime is found, or the first error
+ * reported by mp_add_d / mp_prime_is_prime.
+ */
+int mp_prime_next_prime(mp_int *a, int t)
+{
+   int err, is_prime;
+
+   /* first candidate: +1 from an even start, +2 from an odd one */
+   if (mp_iseven(a) == 1) {
+      err = mp_add_d(a, 1, a);
+   } else {
+      err = mp_add_d(a, 2, a);
+   }
+   if (err != MP_OKAY) {
+      return err;
+   }
+
+   /* walk the odd numbers until one passes the test */
+   while ((err = mp_prime_is_prime(a, t, &is_prime)) == MP_OKAY && is_prime != 1) {
+      if ((err = mp_add_d(a, 2, a)) != MP_OKAY) {
+         return err;
+      }
+   }
+
+   /* MP_OKAY when a prime was hit, otherwise the test's error code */
+   return err;
+}
+
+
+/* End: bn_mp_prime_next_prime.c */
+
+/* Start: bn_mp_rand.c */
+#line 0 "bn_mp_rand.c"
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is library that provides for multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library is designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+#include <tommath.h>
+
+/* makes a pseudo-random int of a given size
+ *
+ * a      - receives the value (zero when digits <= 0)
+ * digits - number of digits wanted; the most significant one is
+ *          chosen non-zero so the result really has that many
+ *
+ * Randomness comes from rand(), so the output is only as good as the
+ * C library generator and its seed.
+ *
+ * Returns MP_OKAY, or the error from mp_add_d / mp_lshd.
+ */
+int
+mp_rand (mp_int * a, int digits)
+{
+  int     res;
+  mp_digit d;
+
+  mp_zero (a);
+  if (digits <= 0) {
+    return MP_OKAY;
+  }
+
+  /* first place a random non-zero digit; mask before testing so the
+   * check is made on the bits that fit into one digit */
+  do {
+    d = ((mp_digit) abs (rand ())) & MP_MASK;
+  } while (d == 0);
+
+  if ((res = mp_add_d (a, d, a)) != MP_OKAY) {
+    return res;
+  }
+
+  /* one digit is already in place, so append digits - 1 more
+   * (counting down from "digits" here produced digits + 1 in total) */
+  while (--digits > 0) {
+    if ((res = mp_lshd (a, 1)) != MP_OKAY) {
+      return res;
+    }
+
+    if ((res = mp_add_d (a, ((mp_digit) abs (rand ())) & MP_MASK, a)) != MP_OKAY) {
+      return res;
+    }
+  }
+
+  return MP_OKAY;
+}
+
+/* End: bn_mp_rand.c */
+
+/* Start: bn_mp_read_signed_bin.c */
+#line 0 "bn_mp_read_signed_bin.c"
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is library that provides for multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library is designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+#include <tommath.h>
+
+/* Reads a signed big-endian byte string into a.
+ *
+ * b[0] is the sign marker (0 means positive, anything else negative);
+ * the remaining c - 1 bytes are the magnitude and are handed to
+ * mp_read_unsigned_bin.
+ *
+ * Returns MP_OKAY, or the error from mp_read_unsigned_bin.
+ */
+int
+mp_read_signed_bin (mp_int * a, unsigned char *b, int c)
+{
+  int     err;
+
+  /* magnitude first: everything after the sign byte */
+  err = mp_read_unsigned_bin (a, b + 1, c - 1);
+  if (err != MP_OKAY) {
+    return err;
+  }
+
+  /* then apply the sign marker */
+  if (b[0] == (unsigned char) 0) {
+    a->sign = MP_ZPOS;
+  } else {
+    a->sign = MP_NEG;
+  }
+  return MP_OKAY;
+}
+
+/* End: bn_mp_read_signed_bin.c */
+
+/* Start: bn_mp_read_unsigned_bin.c */
+#line 0 "bn_mp_read_unsigned_bin.c"
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is library that provides for multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library is designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+#include <tommath.h>
+
+/* Reads an unsigned char array into a; the most significant byte is
+ * stored first [big endian].
+ *
+ * a - receives the value (zeroed first)
+ * b - input bytes
+ * c - number of bytes to read; nothing is read when c <= 0
+ *
+ * Returns MP_OKAY, or the error from mp_mul_2d.
+ */
+int
+mp_read_unsigned_bin (mp_int * a, unsigned char *b, int c)
+{
+  int     res;
+  mp_zero (a);
+  while (c-- > 0) {
+    /* make room for the next byte: a = a * 2^8 */
+    if ((res = mp_mul_2d (a, 8, a)) != MP_OKAY) {
+      return res;
+    }
+
+    if (DIGIT_BIT != 7) {
+      /* the whole byte goes into the lowest digit */
+      a->dp[0] |= *b++;
+      /* count the digit just written; the final mp_clamp presumably
+       * trims any excess - TODO confirm against mp_clamp */
+      a->used += 1;
+    } else {
+      /* 7-bit digits: low seven bits go to digit 0, the top bit of the
+       * byte becomes bit 0 of digit 1 */
+      a->dp[0] = (*b & MP_MASK);
+      a->dp[1] |= ((*b++ >> 7U) & 1);
+      a->used += 2;
+    }
+  }
+  mp_clamp (a);
+  return MP_OKAY;
+}
+
+/* End: bn_mp_read_unsigned_bin.c */
+
+/* Start: bn_mp_reduce.c */
+#line 0 "bn_mp_reduce.c"
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is library that provides for multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library is designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+#include <tommath.h>
+
+/* Pre-calculates the value required for Barrett reduction.
+ *
+ * For a given modulus "b" it calculates, in "a",
+ * 2^(2 * DIGIT_BIT * b->used) divided by b.
+ *
+ * Returns MP_OKAY, or the error from mp_2expt / mp_div.
+ */
+int
+mp_reduce_setup (mp_int * a, mp_int * b)
+{
+  int     status;
+
+  /* a = 2^(2 * DIGIT_BIT * used digits of b) */
+  status = mp_2expt (a, b->used * 2 * DIGIT_BIT);
+
+  /* a = a / b, quotient only */
+  if (status == MP_OKAY) {
+    status = mp_div (a, b, a, NULL);
+  }
+  return status;
+}
+
+/* reduces x mod m, assumes 0 < x < m^2, mu is precomputed via mp_reduce_setup
+ * From HAC pp.604 Algorithm 14.42
+ *
+ * x  - value to reduce; overwritten with the result
+ * m  - modulus
+ * mu - Barrett constant for m
+ *
+ * In the step comments below, "b" is the digit radix and "k" is m->used.
+ *
+ * Returns MP_OKAY, or the error code of the first helper that failed
+ * (x may then hold an intermediate value).
+ */
+int
+mp_reduce (mp_int * x, mp_int * m, mp_int * mu)
+{
+  mp_int  q;
+  int     res, um = m->used;
+
+  /* q starts out as a working copy of x */
+  if ((res = mp_init_copy (&q, x)) != MP_OKAY) {
+    return res;
+  }
+
+  /* q1 = x / b^(k-1)  */
+  mp_rshd (&q, um - 1);         
+
+  /* q2 = q1 * mu.  According to HAC this optimization is ok: only the
+   * high digits of the product are needed, so the partial multiplier is
+   * used unless m has more than 2^(DIGIT_BIT-1) digits */
+  if (((unsigned long) m->used) > (((mp_digit)1) << (DIGIT_BIT - 1))) {
+    if ((res = mp_mul (&q, mu, &q)) != MP_OKAY) {
+      goto CLEANUP;
+    }
+  } else {
+    if ((res = s_mp_mul_high_digs (&q, mu, &q, um - 1)) != MP_OKAY) {
+      goto CLEANUP;
+    }
+  }
+
+  /* q3 = q2 / b^(k+1) */
+  mp_rshd (&q, um + 1);         
+
+  /* x = x mod b^(k+1), quick (no division) */
+  if ((res = mp_mod_2d (x, DIGIT_BIT * (um + 1), x)) != MP_OKAY) {
+    goto CLEANUP;
+  }
+
+  /* q = q * m mod b^(k+1), quick (no division) */
+  if ((res = s_mp_mul_digs (&q, m, &q, um + 1)) != MP_OKAY) {
+    goto CLEANUP;
+  }
+
+  /* x = x - q */
+  if ((res = mp_sub (x, &q, x)) != MP_OKAY) {
+    goto CLEANUP;
+  }
+
+  /* If x < 0, add b^(k+1) to it */
+  if (mp_cmp_d (x, 0) == MP_LT) {
+    /* q is reused as scratch space to build b^(k+1) */
+    mp_set (&q, 1);
+    if ((res = mp_lshd (&q, um + 1)) != MP_OKAY)
+      goto CLEANUP;
+    if ((res = mp_add (x, &q, x)) != MP_OKAY)
+      goto CLEANUP;
+  }
+
+  /* Back off if it's too big */
+  while (mp_cmp (x, m) != MP_LT) {
+    if ((res = s_mp_sub (x, m, x)) != MP_OKAY) {
+      break;
+    }
+  }
+
+/* single exit: release the temporary on success and on error */
+CLEANUP:
+  mp_clear (&q);
+
+  return res;
+}
+
+/* End: bn_mp_reduce.c */
+
+/* Start: bn_mp_rshd.c */
+#line 0 "bn_mp_rshd.c"
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is library that provides for multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library is designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+#include <tommath.h>
+
+/* Shifts a right by b whole digits, in place.
+ *
+ * A shift count of zero or less leaves a untouched; a count that
+ * covers every used digit zeroes a.
+ */
+void
+mp_rshd (mp_int * a, int b)
+{
+  int     i, kept;
+
+  /* nothing to do for non-positive shift counts */
+  if (b <= 0) {
+    return;
+  }
+
+  /* every used digit falls off the low end */
+  if (a->used <= b) {
+    mp_zero (a);
+    return;
+  }
+
+  /* number of digits that survive the shift */
+  kept = a->used - b;
+
+  /* slide the surviving digits down: digit i + b becomes digit i */
+  for (i = 0; i < kept; i++) {
+    a->dp[i] = a->dp[i + b];
+  }
+
+  /* blank the vacated positions at the top */
+  for (i = kept; i < a->used; i++) {
+    a->dp[i] = 0;
+  }
+
+  mp_clamp (a);
+}
+
+/* End: bn_mp_rshd.c */
+
+/* Start: bn_mp_set.c */
+#line 0 "bn_mp_set.c"
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is library that provides for multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library is designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+#include <tommath.h>
+
+/* Sets a to the single digit b.
+ *
+ * b is masked with MP_MASK first; a ends up with one used digit, or
+ * none when the masked value is zero.
+ */
+void
+mp_set (mp_int * a, mp_digit b)
+{
+  mp_digit low = b & MP_MASK;
+
+  mp_zero (a);
+  a->dp[0] = low;
+  if (low != 0) {
+    a->used = 1;
+  } else {
+    a->used = 0;
+  }
+}
+
+/* End: bn_mp_set.c */
+
+/* Start: bn_mp_set_int.c */
+#line 0 "bn_mp_set_int.c"
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is library that provides for multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library is designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+#include <tommath.h>
+
+/* Sets a to a 32-bit constant.
+ *
+ * a - receives the value (zeroed first)
+ * b - source value; its 32 low bits are consumed four at a time,
+ *     most significant nibble first
+ *
+ * Returns MP_OKAY, or the error from mp_mul_2d.
+ */
+int
+mp_set_int (mp_int * a, unsigned int b)
+{
+  int     x, res;
+
+  mp_zero (a);
+  /* set four bits at a time */
+  for (x = 0; x < 8; x++) {
+    /* shift the number up four bits */
+    if ((res = mp_mul_2d (a, 4, a)) != MP_OKAY) {
+      return res;
+    }
+
+    /* OR in the top four bits of the source */
+    a->dp[0] |= (b >> 28) & 15;
+
+    /* shift the source up to the next four bits */
+    b <<= 4;
+
+    /* ensure that digits are not clamped off: used is deliberately
+     * over-counted here and corrected by the mp_clamp after the loop */
+    a->used += 32 / DIGIT_BIT + 2;
+  }
+  mp_clamp (a);
+  return MP_OKAY;
+}
+
+/* End: bn_mp_set_int.c */
+
+/* Start: bn_mp_shrink.c */
+#line 0 "bn_mp_shrink.c"
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is library that provides for multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library is designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+#include <tommath.h>
+
+/* Shrinks the digit buffer of a so that it holds exactly a->used digits.
+ *
+ * Nothing is done when the buffer already has that size, or when a has
+ * no used digits (a zero-byte realloc may free the buffer and return
+ * NULL, which would leave a without storage).
+ *
+ * Returns MP_OKAY on success.  Returns MP_MEM when realloc fails; a is
+ * then left unchanged and still owns its original buffer.
+ */
+int
+mp_shrink (mp_int * a)
+{
+  mp_digit *tmp;
+
+  if (a->alloc != a->used && a->used > 0) {
+    /* keep the old pointer until realloc has succeeded: assigning the
+     * result straight to a->dp would leak the buffer on failure */
+    if ((tmp = OPT_CAST realloc (a->dp, sizeof (mp_digit) * a->used)) == NULL) {
+      return MP_MEM;
+    }
+    a->dp    = tmp;
+    a->alloc = a->used;
+  }
+  return MP_OKAY;
+}
+
+/* End: bn_mp_shrink.c */
+
+/* Start: bn_mp_signed_bin_size.c */
+#line 0 "bn_mp_signed_bin_size.c"
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is library that provides for multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library is designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+#include <tommath.h>
+
+/* Returns the number of bytes the signed binary form of a occupies:
+ * the unsigned size plus one.
+ */
+int
+mp_signed_bin_size (mp_int * a)
+{
+  /* one extra byte on top of the unsigned size */
+  return mp_unsigned_bin_size (a) + 1;
+}
+
+/* End: bn_mp_signed_bin_size.c */
+
+/* Start: bn_mp_sqr.c */
+#line 0 "bn_mp_sqr.c"
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is library that provides for multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library is designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+#include <tommath.h>
+
+/* Computes b = a * a, picking the squaring routine by operand size.
+ *
+ * Above KARATSUBA_SQR_CUTOFF digits Karatsuba squaring is used.  Below
+ * that, the fast routine is used when the operand passes the two size
+ * checks, and the baseline routine otherwise.  The sign of b is set
+ * positive whatever the outcome.
+ *
+ * Returns the result code of the routine that was called.
+ */
+int
+mp_sqr (mp_int * a, mp_int * b)
+{
+  int     status;
+
+  if (a->used > KARATSUBA_SQR_CUTOFF) {
+    status = mp_karatsuba_sqr (a, b);
+  } else if ((a->used * 2 + 1) < 512 && a->used < (1 << (sizeof(mp_word) * CHAR_BIT - 2*DIGIT_BIT - 1))) {
+    /* both size checks passed: the fast routine can be used */
+    status = fast_s_mp_sqr (a, b);
+  } else {
+    status = s_mp_sqr (a, b);
+  }
+
+  /* a square is never negative */
+  b->sign = MP_ZPOS;
+  return status;
+}
+
+/* End: bn_mp_sqr.c */
+
+/* Start: bn_mp_sqrmod.c */
+#line 0 "bn_mp_sqrmod.c"
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is library that provides for multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library is designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+#include <tommath.h>
+
+/* c = a * a (mod b)
+ *
+ * The square is built in a temporary and then reduced into c.
+ *
+ * Returns MP_OKAY, or the error from mp_init / mp_sqr / mp_mod.
+ */
+int
+mp_sqrmod (mp_int * a, mp_int * b, mp_int * c)
+{
+  mp_int  square;
+  int     status;
+
+  if ((status = mp_init (&square)) != MP_OKAY) {
+    return status;
+  }
+
+  /* reduce only if the squaring itself worked */
+  status = mp_sqr (a, &square);
+  if (status == MP_OKAY) {
+    status = mp_mod (&square, b, c);
+  }
+
+  mp_clear (&square);
+  return status;
+}
+
+/* End: bn_mp_sqrmod.c */
+
+/* Start: bn_mp_sub.c */
+#line 0 "bn_mp_sub.c"
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is library that provides for multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library is designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+#include <tommath.h>
+
+/* High level subtraction c = a - b that takes the signs into account.
+ *
+ * The magnitude work is delegated to s_mp_add / s_mp_sub; this function
+ * only decides which of them to call and which sign c gets.
+ *
+ * Returns the result code of the low level routine.
+ */
+int
+mp_sub (mp_int * a, mp_int * b, mp_int * c)
+{
+  int     sign_a = a->sign;
+  int     sign_b = b->sign;
+
+  /* Opposite signs: (+a) - (-b) or (-a) - (+b).  The magnitudes add up
+   * and the result keeps the sign of the first operand. */
+  if (sign_a != sign_b) {
+    c->sign = sign_a;
+    return s_mp_add (a, b, c);
+  }
+
+  /* Same sign and |a| < |b|: the difference |b| - |a| gets the sign
+   * opposite to that of the first operand. */
+  if (mp_cmp_mag (a, b) == MP_LT) {
+    c->sign = (sign_a == MP_ZPOS) ? MP_NEG : MP_ZPOS;
+    return s_mp_sub (b, a, c);
+  }
+
+  /* Same sign and |a| >= |b|: the difference |a| - |b| keeps the sign
+   * of the first operand. */
+  c->sign = sign_a;
+  return s_mp_sub (a, b, c);
+}
+
+
+/* End: bn_mp_sub.c */
+
+/* Start: bn_mp_sub_d.c */
+#line 0 "bn_mp_sub_d.c"
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is library that provides for multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library is designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+#include <tommath.h>
+
+/* Single digit subtraction: c = a - b.
+ *
+ * The digit is loaded into a temporary mp_int and the work is handed
+ * to mp_sub.
+ *
+ * Returns MP_OKAY, or the error from mp_init / mp_sub.
+ */
+int
+mp_sub_d (mp_int * a, mp_digit b, mp_int * c)
+{
+  mp_int  digit;
+  int     status;
+
+  status = mp_init (&digit);
+  if (status != MP_OKAY) {
+    return status;
+  }
+
+  /* digit = b, then c = a - digit */
+  mp_set (&digit, b);
+  status = mp_sub (a, &digit, c);
+
+  mp_clear (&digit);
+  return status;
+}
+
+/* End: bn_mp_sub_d.c */
+
+/* Start: bn_mp_submod.c */
+#line 0 "bn_mp_submod.c"
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is library that provides for multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library is designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+#include <tommath.h>
+
+/* d = a - b (mod c)
+ *
+ * The difference is built in a temporary and then reduced into d.
+ *
+ * Returns MP_OKAY, or the error from mp_init / mp_sub / mp_mod.
+ */
+int
+mp_submod (mp_int * a, mp_int * b, mp_int * c, mp_int * d)
+{
+  mp_int  diff;
+  int     status;
+
+  if ((status = mp_init (&diff)) != MP_OKAY) {
+    return status;
+  }
+
+  /* reduce only if the subtraction itself worked */
+  status = mp_sub (a, b, &diff);
+  if (status == MP_OKAY) {
+    status = mp_mod (&diff, c, d);
+  }
+
+  mp_clear (&diff);
+  return status;
+}
+
+/* End: bn_mp_submod.c */
+
+/* Start: bn_mp_to_signed_bin.c */
+#line 0 "bn_mp_to_signed_bin.c"
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is library that provides for multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library is designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+#include <tommath.h>
+
+/* store in signed [big endian] format */
+int
+mp_to_signed_bin (mp_int * a, unsigned char *b)
+{
+  /* the magnitude is stored after the leading sign byte */
+  int     err = mp_to_unsigned_bin (a, b + 1);
+
+  if (err == MP_OKAY) {
+    b[0] = (unsigned char) ((a->sign == MP_ZPOS) ? 0 : 1);   /* 0: MP_ZPOS, 1: otherwise */
+  }
+  return err;
+}
+
+/* End: bn_mp_to_signed_bin.c */
+
+/* Start: bn_mp_to_unsigned_bin.c */
+#line 0 "bn_mp_to_unsigned_bin.c"
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is library that provides for multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library is designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+#include <tommath.h>
+
+/* store in unsigned [big endian] format */
+int
+mp_to_unsigned_bin (mp_int * a, unsigned char *b)
+{
+  mp_int  work;
+  int     err, count = 0;
+
+  /* peel bytes off a scratch copy so that a is left untouched */
+  err = mp_init_copy (&work, a);
+  if (err != MP_OKAY) {
+    return err;
+  }
+  /* emit the least significant byte first, dropping 8 bits per pass */
+  while (err == MP_OKAY && mp_iszero (&work) == 0) {
+    if (DIGIT_BIT == 7) {
+      b[count++] = (unsigned char) (work.dp[0] | ((work.dp[1] & 0x01) << 7));
+    } else {
+      b[count++] = (unsigned char) (work.dp[0] & 255);
+    }
+    err = mp_div_2d (&work, 8, &work, NULL);
+  }
+  if (err == MP_OKAY) {
+    bn_reverse (b, count);   /* bytes came out little endian; flip them */
+  }
+  mp_clear (&work);
+  return err;
+}
+
+/* End: bn_mp_to_unsigned_bin.c */
+
+/* Start: bn_mp_unsigned_bin_size.c */
+#line 0 "bn_mp_unsigned_bin_size.c"
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is library that provides for multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library is designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+#include <tommath.h>
+
+/* get the size for an unsigned equivalent */
+int
+mp_unsigned_bin_size (mp_int * a)
+{
+  int     bits = mp_count_bits (a);
+  return bits / 8 + ((bits % 8 == 0) ? 0 : 1);   /* round up to whole bytes */
+}
+
+/* End: bn_mp_unsigned_bin_size.c */
+
+/* Start: bn_mp_xor.c */
+#line 0 "bn_mp_xor.c"
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is library that provides for multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library is designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+#include <tommath.h>
+
+/* XOR two ints together */
+int
+mp_xor (mp_int * a, mp_int * b, mp_int * c)
+{
+  mp_int  acc, *longer, *shorter;
+  int     err, i;
+
+  /* order the operands by digit count; on a tie b is treated as the longer one */
+  if (a->used > b->used) {
+    longer = a;
+    shorter = b;
+  } else {
+    longer = b;
+    shorter = a;
+  }
+  /* start from a copy of the longer operand ... */
+  err = mp_init_copy (&acc, longer);
+  if (err != MP_OKAY) {
+    return err;
+  }
+  /* ... and fold in every digit of the shorter one */
+  for (i = 0; i < shorter->used; i++) {
+    acc.dp[i] ^= shorter->dp[i];
+  }
+  mp_clamp (&acc);
+  mp_exch (c, &acc);          /* swap the result into c; acc takes c's previous contents */
+  mp_clear (&acc);
+  return MP_OKAY;
+}
+
+/* End: bn_mp_xor.c */
+
+/* Start: bn_mp_zero.c */
+#line 0 "bn_mp_zero.c"
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is library that provides for multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library is designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+#include <tommath.h>
+
+/* set to zero */
+void
+mp_zero (mp_int * a)
+{
+  memset (a->dp, 0, a->alloc * sizeof (mp_digit));   /* wipes all alloc digits, not just used */
+  a->used = 0;
+  a->sign = MP_ZPOS;
+}
+
+/* End: bn_mp_zero.c */
+
+/* Start: bn_prime_tab.c */
+#line 0 "bn_prime_tab.c"
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is library that provides for multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library is designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+#include <tommath.h>
+const mp_digit __prime_tab[] = {   /* ascending primes; entry count must match PRIME_SIZE in tommath.h */
+  0x0002, 0x0003, 0x0005, 0x0007, 0x000B, 0x000D, 0x0011, 0x0013,
+  0x0017, 0x001D, 0x001F, 0x0025, 0x0029, 0x002B, 0x002F, 0x0035,
+  0x003B, 0x003D, 0x0043, 0x0047, 0x0049, 0x004F, 0x0053, 0x0059,
+  0x0061, 0x0065, 0x0067, 0x006B, 0x006D, 0x0071, 0x007F,
+#ifndef MP_8BIT   /* MP_8BIT builds stop after the 31 primes up to 0x7F above */
+  0x0083,
+  0x0089, 0x008B, 0x0095, 0x0097, 0x009D, 0x00A3, 0x00A7, 0x00AD,
+  0x00B3, 0x00B5, 0x00BF, 0x00C1, 0x00C5, 0x00C7, 0x00D3, 0x00DF,
+  0x00E3, 0x00E5, 0x00E9, 0x00EF, 0x00F1, 0x00FB, 0x0101, 0x0107,
+  0x010D, 0x010F, 0x0115, 0x0119, 0x011B, 0x0125, 0x0133, 0x0137,
+
+  0x0139, 0x013D, 0x014B, 0x0151, 0x015B, 0x015D, 0x0161, 0x0167,
+  0x016F, 0x0175, 0x017B, 0x017F, 0x0185, 0x018D, 0x0191, 0x0199,
+  0x01A3, 0x01A5, 0x01AF, 0x01B1, 0x01B7, 0x01BB, 0x01C1, 0x01C9,
+  0x01CD, 0x01CF, 0x01D3, 0x01DF, 0x01E7, 0x01EB, 0x01F3, 0x01F7,
+  0x01FD, 0x0209, 0x020B, 0x021D, 0x0223, 0x022D, 0x0233, 0x0239,
+  0x023B, 0x0241, 0x024B, 0x0251, 0x0257, 0x0259, 0x025F, 0x0265,
+  0x0269, 0x026B, 0x0277, 0x0281, 0x0283, 0x0287, 0x028D, 0x0293,
+  0x0295, 0x02A1, 0x02A5, 0x02AB, 0x02B3, 0x02BD, 0x02C5, 0x02CF,
+
+  0x02D7, 0x02DD, 0x02E3, 0x02E7, 0x02EF, 0x02F5, 0x02F9, 0x0301,
+  0x0305, 0x0313, 0x031D, 0x0329, 0x032B, 0x0335, 0x0337, 0x033B,
+  0x033D, 0x0347, 0x0355, 0x0359, 0x035B, 0x035F, 0x036D, 0x0371,
+  0x0373, 0x0377, 0x038B, 0x038F, 0x0397, 0x03A1, 0x03A9, 0x03AD,
+  0x03B3, 0x03B9, 0x03C7, 0x03CB, 0x03D1, 0x03D7, 0x03DF, 0x03E5,
+  0x03F1, 0x03F5, 0x03FB, 0x03FD, 0x0407, 0x0409, 0x040F, 0x0419,
+  0x041B, 0x0425, 0x0427, 0x042D, 0x043F, 0x0443, 0x0445, 0x0449,
+  0x044F, 0x0455, 0x045D, 0x0463, 0x0469, 0x047F, 0x0481, 0x048B,
+
+  0x0493, 0x049D, 0x04A3, 0x04A9, 0x04B1, 0x04BD, 0x04C1, 0x04C7,
+  0x04CD, 0x04CF, 0x04D5, 0x04E1, 0x04EB, 0x04FD, 0x04FF, 0x0503,
+  0x0509, 0x050B, 0x0511, 0x0515, 0x0517, 0x051B, 0x0527, 0x0529,
+  0x052F, 0x0551, 0x0557, 0x055D, 0x0565, 0x0577, 0x0581, 0x058F,
+  0x0593, 0x0595, 0x0599, 0x059F, 0x05A7, 0x05AB, 0x05AD, 0x05B3,
+  0x05BF, 0x05C9, 0x05CB, 0x05CF, 0x05D1, 0x05D5, 0x05DB, 0x05E7,
+  0x05F3, 0x05FB, 0x0607, 0x060D, 0x0611, 0x0617, 0x061F, 0x0623,
+  0x062B, 0x062F, 0x063D, 0x0641, 0x0647, 0x0649, 0x064D, 0x0653
+#endif            /* !MP_8BIT: 256 entries in total */
+};
+
+/* End: bn_prime_tab.c */
+
+/* Start: bn_radix.c */
+#line 0 "bn_radix.c"
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is library that provides for multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library is designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+#include <tommath.h>
+
+/* chars used in radix conversions: the character at index i is the digit with value i (0..63) */
+static const char *s_rmap = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz+/";
+
+/* read a string [ASCII] in a given radix (2..64, else MP_VAL); an optional leading
+ * '-' negates; parsing stops at the first character that is not a digit of the radix */
+int
+mp_read_radix (mp_int * a, char *str, int radix)
+{
+  int     y, res, neg;
+  char    ch;
+
+  if (radix < 2 || radix > 64) {
+    return MP_VAL;
+  }
+  if (*str == '-') {
+    ++str;
+    neg = MP_NEG;
+  } else {
+    neg = MP_ZPOS;
+  }
+  mp_zero (a);
+  while (*str) {
+    /* radices up to 36 only use 0-9A-Z, so fold case; toupper needs an unsigned char value */
+    ch = (char) ((radix <= 36) ? toupper ((unsigned char) *str) : *str);
+    for (y = 0; y < 64; y++) {
+      if (ch == s_rmap[y]) {
+        break;
+      }
+    }
+    if (y >= radix) {
+      break;                    /* not a digit of this radix: stop parsing */
+    }
+    if ((res = mp_mul_d (a, (mp_digit) radix, a)) != MP_OKAY) {
+      return res;
+    }
+    if ((res = mp_add_d (a, (mp_digit) y, a)) != MP_OKAY) {
+      return res;
+    }
+    ++str;
+  }
+  if (mp_iszero (a) == 0) {     /* never produce a negative zero ("-", "-0") */
+    a->sign = neg;
+  }
+  return MP_OKAY;
+}
+
+/* stores a bignum as a NUL terminated ASCII string in a given radix (2..64);
+ * str needs room for mp_radix_size (a, radix) bytes; zero is written as "0" */
+int
+mp_toradix (mp_int * a, char *str, int radix)
+{
+  int     res, digs;
+  mp_int  t;
+  mp_digit d;
+  char   *_s = str;
+
+  if (radix < 2 || radix > 64) {
+    return MP_VAL;
+  }
+  if (mp_iszero (a) != 0) {     /* the digit loop below emits nothing for zero */
+    *str++ = '0';
+    *str = '\0';
+    return MP_OKAY;
+  }
+  if ((res = mp_init_copy (&t, a)) != MP_OKAY) {
+    return res;
+  }
+  if (t.sign == MP_NEG) {       /* emit the sign, then convert the magnitude */
+    ++_s;
+    *str++ = '-';
+    t.sign = MP_ZPOS;
+  }
+
+  digs = 0;
+  while (mp_iszero (&t) == 0) {
+    if ((res = mp_div_d (&t, (mp_digit) radix, &t, &d)) != MP_OKAY) {
+      mp_clear (&t);
+      return res;
+    }
+    *str++ = s_rmap[d];
+    ++digs;
+  }
+  bn_reverse ((unsigned char *)_s, digs);   /* digits came out least significant first */
+  *str++ = '\0';
+  mp_clear (&t);
+  return MP_OKAY;
+}
+
+/* returns size of ASCII representation (sign, digits, NUL) or 0 on failure */
+int
+mp_radix_size (mp_int * a, int radix)
+{
+  int     res, digs;
+  mp_int  t;
+  mp_digit d;
+
+  if (radix < 2 || radix > 64) {
+    return 0;
+  }
+  if (mp_iszero (a) != 0) {     /* zero is written as "0": one digit plus the NUL */
+    return 2;
+  }
+  if (radix == 2) {             /* special case for binary: one character per bit */
+    return mp_count_bits (a) + (a->sign == MP_NEG ? 1 : 0) + 1;
+  }
+  if ((res = mp_init_copy (&t, a)) != MP_OKAY) {
+    return 0;
+  }
+
+  digs = (t.sign == MP_NEG) ? 1 : 0;   /* room for the '-' sign */
+  t.sign = MP_ZPOS;
+  while (mp_iszero (&t) == 0) {
+    if ((res = mp_div_d (&t, (mp_digit) radix, &t, &d)) != MP_OKAY) {
+      mp_clear (&t);
+      return 0;
+    }
+    ++digs;
+  }
+  mp_clear (&t);
+  return digs + 1;
+}
+
+/* read a bigint from a file stream in ASCII; radix must be 2..64 (else MP_VAL);
+ * reading stops at, and consumes, the first character that is not a digit */
+int mp_fread(mp_int *a, int radix, FILE *stream)
+{
+   int err, ch, neg, y;
+
+   /* s_rmap only has 64 digits; a larger radix would index past its end */
+   if (radix < 2 || radix > 64) {
+      return MP_VAL;
+   }
+   mp_zero(a);
+
+   /* if first digit is - then set negative */
+   ch = fgetc(stream);
+   if (ch == '-') {
+      neg = MP_NEG;
+      ch = fgetc(stream);
+   } else {
+      neg = MP_ZPOS;
+   }
+   for (;;) {
+      /* find y in the radix map; EOF is negative so it never matches */
+      for (y = 0; y < radix; y++) {
+          if (s_rmap[y] == ch) {
+             break;
+          }
+      }
+      if (y == radix) {
+         break;
+      }
+      /* shift up and add */
+      if ((err = mp_mul_d(a, radix, a)) != MP_OKAY) {
+         return err;
+      }
+      if ((err = mp_add_d(a, y, a)) != MP_OKAY) {
+         return err;
+      }
+      ch = fgetc(stream);
+   }
+   if (mp_cmp_d(a, 0) != MP_EQ) {   /* zero stays non-negative even after a '-' */
+      a->sign = neg;
+   }
+   return MP_OKAY;
+}
+
+int mp_fwrite(mp_int *a, int radix, FILE *stream)
+{
+   char *text;
+   int status, size, i;
+
+   /* a size of 0 is how mp_radix_size reports a failure */
+   size = mp_radix_size(a, radix);
+   if (size == 0) {
+      return MP_VAL;
+   }
+
+   text = malloc(size);
+   if (text == NULL) {
+      return MP_MEM;
+   }
+
+   status = mp_toradix(a, text, radix);
+   if (status == MP_OKAY) {
+      /* emit all size bytes of the buffer, one character at a time */
+      for (i = 0; i < size; i++) {
+         if (fputc(text[i], stream) == EOF) {
+            status = MP_VAL;
+            break;
+         }
+      }
+   }
+
+   free(text);
+   return status;
+}
+
+
+/* End: bn_radix.c */
+
+/* Start: bn_reverse.c */
+#line 0 "bn_reverse.c"
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is library that provides for multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library is designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+#include <tommath.h>
+
+/* reverse an array, used for radix code */
+void
+bn_reverse (unsigned char *s, int len)
+{
+  int     lo, hi;
+  unsigned char swap;
+
+  /* walk two cursors inward from both ends, exchanging bytes until they
+   * meet or cross; the middle byte of an odd-length buffer stays put and
+   * a len below 2 leaves the buffer untouched
+   */
+  for (lo = 0, hi = len - 1; lo < hi; ++lo, --hi) {
+    swap  = s[lo];
+    s[lo] = s[hi];
+    s[hi] = swap;
+  }
+}
+
+/* End: bn_reverse.c */
+
+/* Start: bn_s_mp_add.c */
+#line 0 "bn_s_mp_add.c"
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is library that provides for multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library is designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+#include <tommath.h>
+
+/* low level addition of the digit arrays of a and b into c (no sign is read here), HAC pp.594, Alg. 14.7 */
+int
+s_mp_add (mp_int * a, mp_int * b, mp_int * c)
+{
+  mp_int *x;
+  int     olduse, res, min, max;
+
+  /* find sizes: min/max are the smaller/larger digit count of the
+   * two inputs and "x" points to the input with the most digits
+   */
+  if (a->used > b->used) {
+    min = b->used;
+    max = a->used;
+    x = a;
+  } else {
+    min = a->used;
+    max = b->used;
+    x = b;
+  }
+
+  /* init result: the sum needs up to max digits plus one for the final carry */
+  if (c->alloc < max + 1) {
+    if ((res = mp_grow (c, max + 1)) != MP_OKAY) {
+      return res;
+    }
+  }
+
+  /* get old used digit count and set new one */
+  olduse = c->used;
+  c->used = max + 1;
+
+  /* digit loops, scoped so the working pointers can be register locals */
+  {
+    register mp_digit u, *tmpa, *tmpb, *tmpc;
+    register int i;
+
+    /* alias for digit pointers (taken after mp_grow above may have run) */
+
+    /* first input */
+    tmpa = a->dp;
+
+    /* second input */
+    tmpb = b->dp;
+
+    /* destination */
+    tmpc = c->dp;
+
+    /* zero the carry */
+    u = 0;
+    for (i = 0; i < min; i++) {
+      /* Compute the sum at one digit, T[i] = A[i] + B[i] + U */
+      *tmpc = *tmpa++ + *tmpb++ + u;
+
+      /* U = carry bit of T[i] */
+      u = *tmpc >> ((mp_digit)DIGIT_BIT);
+
+      /* take away carry bit from T[i] */
+      *tmpc++ &= MP_MASK;
+    }
+
+    /* now add in the higher digits, if any: only the longer
+     * input "x" has digits left at positions min..max-1
+     */
+    if (min != max) {
+      for (; i < max; i++) {
+        /* T[i] = X[i] + U */
+        *tmpc = x->dp[i] + u;
+
+        /* U = carry bit of T[i] */
+        u = *tmpc >> ((mp_digit)DIGIT_BIT);
+
+        /* take away carry bit from T[i] */
+        *tmpc++ &= MP_MASK;
+      }
+    }
+
+    /* the final carry becomes digit number max */
+    *tmpc++ = u;
+
+    /* zero stale digits left above the new used count by c's previous value */
+    for (i = c->used; i < olduse; i++) {
+      *tmpc++ = 0;
+    }
+  }
+
+  mp_clamp (c);
+  return MP_OKAY;
+}
+
+/* End: bn_s_mp_add.c */
+
+/* Start: bn_s_mp_mul_digs.c */
+#line 0 "bn_s_mp_mul_digs.c"
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is library that provides for multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library is designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+#include <tommath.h>
+
+/* multiplies |a| * |b| and only computes up to digs digits of the result.
+ * HAC pp. 595, Algorithm 14.12, modified so the caller controls how
+ * many digits of output are created; the product is swapped into c.
+ */
+int
+s_mp_mul_digs (mp_int * a, mp_int * b, mp_int * c, int digs)
+{
+  mp_int  t;
+  int     res, pa, pb, ix, iy;
+  mp_digit u;
+  mp_word r;
+  mp_digit tmpx, *tmpt, *tmpy;
+
+  /* can we use the fast multiplier? (digs and the shorter operand must be small enough) */
+  if (((digs) < MP_WARRAY) &&
+      MIN (a->used, b->used) < 
+          (1 << ((CHAR_BIT * sizeof (mp_word)) - (2 * DIGIT_BIT)))) {
+    return fast_s_mp_mul_digs (a, b, c, digs);
+  }
+
+  if ((res = mp_init_size (&t, digs)) != MP_OKAY) {
+    return res;
+  }
+  t.used = digs;   /* presumably mp_init_size zeroes the digits, as they are accumulated into below -- TODO confirm */
+
+  /* compute the digits of the product directly */
+  pa = a->used;
+  for (ix = 0; ix < pa; ix++) {
+    /* set the carry to zero */
+    u = 0;
+
+    /* limit ourselves to making digs digits of output */
+    pb = MIN (b->used, digs - ix);
+
+    /* setup some aliases */
+    /* copy of the digit from a used within the nested loop */
+    tmpx = a->dp[ix];
+    
+    /* an alias for the destination shifted ix places */
+    tmpt = t.dp + ix;
+    
+    /* an alias for the digits of b */
+    tmpy = b->dp;
+
+    /* compute the columns of the output and propagate the carry */
+    for (iy = 0; iy < pb; iy++) {
+      /* compute the column as a mp_word */
+      r = ((mp_word) *tmpt) + 
+          ((mp_word) tmpx) * ((mp_word) * tmpy++) + 
+          ((mp_word) u);
+
+      /* the new column is the lower part of the result */
+      *tmpt++ = (mp_digit) (r & ((mp_word) MP_MASK));
+
+      /* get the carry word from the result */
+      u = (mp_digit) (r >> ((mp_word) DIGIT_BIT));
+    }
+    /* set carry if it is placed below digs; otherwise it is dropped as truncated output */
+    if (ix + iy < digs) {
+      *tmpt = u;
+    }
+  }
+
+  mp_clamp (&t);
+  mp_exch (&t, c);
+
+  mp_clear (&t);
+  return MP_OKAY;
+}
+
+/* End: bn_s_mp_mul_digs.c */
+
+/* Start: bn_s_mp_mul_high_digs.c */
+#line 0 "bn_s_mp_mul_high_digs.c"
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is library that provides for multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library is designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+#include <tommath.h>
+
+/* multiplies |a| * |b| and does not compute the lower digs digits
+ * [meant to get the higher part of the product]; the result is swapped into c
+ */
+int
+s_mp_mul_high_digs (mp_int * a, mp_int * b, mp_int * c, int digs)
+{
+  mp_int  t;
+  int     res, pa, pb, ix, iy;
+  mp_digit u;
+  mp_word r;
+  mp_digit tmpx, *tmpt, *tmpy;
+
+
+  /* can we use the fast multiplier? (full product and the shorter operand must be small enough) */
+  if (((a->used + b->used + 1) < MP_WARRAY)
+      && MIN (a->used, b->used) < (1 << ((CHAR_BIT * sizeof (mp_word)) - (2 * DIGIT_BIT)))) {
+    return fast_s_mp_mul_high_digs (a, b, c, digs);
+  }
+
+  if ((res = mp_init_size (&t, a->used + b->used + 1)) != MP_OKAY) {
+    return res;
+  }
+  t.used = a->used + b->used + 1;
+
+  pa = a->used;
+  pb = b->used;
+  for (ix = 0; ix < pa; ix++) {
+    /* clear the carry */
+    u = 0;
+
+    /* left hand side of A[ix] * B[iy] */
+    tmpx = a->dp[ix];
+
+    /* alias to where the digits will be stored: column ix + iy starts at digs */
+    tmpt = &(t.dp[digs]);
+
+    /* NOTE(review): digs - ix is negative once ix > digs (read before b->dp); confirm callers keep a->used <= digs + 1 */
+    tmpy = b->dp + (digs - ix);
+
+    for (iy = digs - ix; iy < pb; iy++) {
+      /* calculate the double precision result */
+      r = ((mp_word) * tmpt) + ((mp_word) tmpx) * ((mp_word) * tmpy++) + ((mp_word) u);
+
+      /* get the lower part */
+      *tmpt++ = (mp_digit) (r & ((mp_word) MP_MASK));
+
+      /* carry the carry */
+      u = (mp_digit) (r >> ((mp_word) DIGIT_BIT));
+    }
+    *tmpt = u;
+  }
+  mp_clamp (&t);
+  mp_exch (&t, c);
+  mp_clear (&t);
+  return MP_OKAY;
+}
+
+/* End: bn_s_mp_mul_high_digs.c */
+
+/* Start: bn_s_mp_sqr.c */
+#line 0 "bn_s_mp_sqr.c"
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is library that provides for multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library is designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+#include <tommath.h>
+
+/* low level squaring, b = a*a, HAC pp.596-597, Algorithm 14.16; built in a temporary and swapped into b */
+int
+s_mp_sqr (mp_int * a, mp_int * b)
+{
+  mp_int  t;
+  int     res, ix, iy, pa;
+  mp_word r, u;
+  mp_digit tmpx, *tmpt;
+
+  pa = a->used;
+  if ((res = mp_init_size (&t, pa + pa + 1)) != MP_OKAY) {
+    return res;
+  }
+  t.used = pa + pa + 1;
+
+  for (ix = 0; ix < pa; ix++) {
+    /* first calculate the digit at 2*ix: the square term A[ix] * A[ix] */
+    /* calculate double precision result */
+    r = ((mp_word) t.dp[ix + ix]) + ((mp_word) a->dp[ix]) * ((mp_word) a->dp[ix]);
+
+    /* store lower part in result */
+    t.dp[ix + ix] = (mp_digit) (r & ((mp_word) MP_MASK));
+
+    /* get the carry */
+    u = (r >> ((mp_word) DIGIT_BIT));
+
+    /* left hand side of A[ix] * A[iy] */
+    tmpx = a->dp[ix];
+
+    /* alias for where to store the results */
+    tmpt = &(t.dp[ix + ix + 1]);
+    for (iy = ix + 1; iy < pa; iy++) {
+      /* first calculate the product */
+      r = ((mp_word) tmpx) * ((mp_word) a->dp[iy]);
+
+      /* now calculate the double precision result, note we use
+       * addition instead of *2 since its easier to optimize
+       */
+      r = ((mp_word) * tmpt) + r + r + ((mp_word) u);
+
+      /* store lower part */
+      *tmpt++ = (mp_digit) (r & ((mp_word) MP_MASK));
+
+      /* get carry */
+      u = (r >> ((mp_word) DIGIT_BIT));
+    }
+    r = ((mp_word) * tmpt) + u;
+    *tmpt = (mp_digit) (r & ((mp_word) MP_MASK));
+    u = (r >> ((mp_word) DIGIT_BIT));
+    /* propagate upwards. NOTE(review): each step adds 1 whatever u holds; assumes the carry here is at most 1 -- confirm */
+    ++tmpt;
+    while (u != ((mp_word) 0)) {
+      r = ((mp_word) * tmpt) + ((mp_word) 1);
+      *tmpt++ = (mp_digit) (r & ((mp_word) MP_MASK));
+      u = (r >> ((mp_word) DIGIT_BIT));
+    }
+  }
+
+  mp_clamp (&t);
+  mp_exch (&t, b);
+  mp_clear (&t);
+  return MP_OKAY;
+}
+
+/* End: bn_s_mp_sqr.c */
+
+/* Start: bn_s_mp_sub.c */
+#line 0 "bn_s_mp_sub.c"
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is library that provides for multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library is designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+#include <tommath.h>
+
+/* low level subtraction c = a - b on the digit arrays (assumes |a| > |b|), HAC pp.595 Algorithm 14.9 */
+int
+s_mp_sub (mp_int * a, mp_int * b, mp_int * c)
+{
+  int     olduse, res, min, max;
+
+  /* find sizes: b supplies the low min digits, a supplies all max digits */
+  min = b->used;
+  max = a->used;
+
+  /* init result: c must be able to hold max digits */
+  if (c->alloc < max) {
+    if ((res = mp_grow (c, max)) != MP_OKAY) {
+      return res;
+    }
+  }
+  olduse = c->used;
+  c->used = max;
+
+  /* sub digits from lower part */
+  {
+    register mp_digit u, *tmpa, *tmpb, *tmpc;
+    register int i;
+
+    /* alias for digit pointers (taken after mp_grow above may have run) */
+    tmpa = a->dp;
+    tmpb = b->dp;
+    tmpc = c->dp;
+
+    /* set carry (the borrow) to zero */
+    u = 0;
+    for (i = 0; i < min; i++) {
+      /* T[i] = A[i] - B[i] - U */
+      *tmpc = *tmpa++ - *tmpb++ - u;
+
+      /* U = carry bit of T[i]
+       * Note this saves performing an AND operation since
+       * if a carry does occur it will propagate all the way to the
+       * MSB.  As a result a single shift is required to get the carry
+       */
+      u = *tmpc >> ((mp_digit)(CHAR_BIT * sizeof (mp_digit) - 1));
+
+      /* Clear carry from T[i] */
+      *tmpc++ &= MP_MASK;
+    }
+
+    /* now copy higher words if any, e.g. if A has more digits than B */
+    for (; i < max; i++) {
+      /* T[i] = A[i] - U */
+      *tmpc = *tmpa++ - u;
+
+      /* U = carry bit of T[i] */
+      u = *tmpc >> ((mp_digit)(CHAR_BIT * sizeof (mp_digit) - 1));
+
+      /* Clear carry from T[i] */
+      *tmpc++ &= MP_MASK;
+    }
+
+    /* zero stale digits left above the new used count by c's previous value */
+    for (i = c->used; i < olduse; i++) {
+      *tmpc++ = 0;
+    }
+  }
+
+  mp_clamp (c);
+  return MP_OKAY;
+}
+
+/* End: bn_s_mp_sub.c */
+
+/* EOF */
diff --git a/tommath.h b/tommath.h
index cfd9da1..0d56f02 100644
--- a/tommath.h
+++ b/tommath.h
@@ -1,11 +1,11 @@
 /* LibTomMath, multiple-precision integer library -- Tom St Denis
  *
- * LibTomMath is library that provides for multiple-precision 
+ * LibTomMath is library that provides for multiple-precision
  * integer arithmetic as well as number theoretic functionality.
- * 
+ *
  * The library is designed directly after the MPI library by
- * Michael Fromberger but has been written from scratch with 
- * additional optimizations in place.  
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
  *
  * The library is free for all purposes without any express
  * guarantee it works.
@@ -34,18 +34,18 @@ extern "C" {
 
 #else
 
-/* C on the other hand dosen't care */
-#define  OPT_CAST  
+/* C on the other hand doesn't care */
+#define  OPT_CAST
 
 #endif
 
-/* some default configurations.  
+/* some default configurations.
  *
- * A "mp_digit" must be able to hold DIGIT_BIT + 1 bits 
- * A "mp_word" must be able to hold 2*DIGIT_BIT + 1 bits 
+ * A "mp_digit" must be able to hold DIGIT_BIT + 1 bits
+ * A "mp_word" must be able to hold 2*DIGIT_BIT + 1 bits
  *
- * At the very least a mp_digit must be able to hold 7 bits 
- * [any size beyond that is ok provided it overflow the data type]
+ * At the very least a mp_digit must be able to hold 7 bits
+ * [any size beyond that is ok provided it doesn't overflow the data type]
  */
 #ifdef MP_8BIT
    typedef unsigned char      mp_digit;
@@ -53,7 +53,21 @@ extern "C" {
 #elif defined(MP_16BIT)
    typedef unsigned short     mp_digit;
    typedef unsigned long      mp_word;
+#elif defined(MP_64BIT)
+   /* for GCC only on supported platforms */
+#ifndef CRYPT
+   typedef unsigned long long ulong64;
+   typedef signed long long   long64;
+#endif
+
+   typedef ulong64            mp_digit;
+   typedef unsigned long      mp_word __attribute__ ((mode(TI)));
+
+   #define DIGIT_BIT          60
 #else
+   /* this is the default case, 28-bit digits */
+   
+   /* this is to make porting into LibTomCrypt easier :-) */
 #ifndef CRYPT
    #ifdef _MSC_VER
       typedef unsigned __int64   ulong64;
@@ -61,23 +75,24 @@ extern "C" {
    #else
       typedef unsigned long long ulong64;
       typedef signed long long   long64;
-   #endif   
-#endif   
+   #endif
+#endif
 
-   /* default case */
    typedef unsigned long      mp_digit;
    typedef ulong64            mp_word;
-  
-   #define DIGIT_BIT          28
-#endif  
 
+   #define DIGIT_BIT          28
+#endif
+
+/* otherwise the bits per digit is calculated automatically from the size of a mp_digit */
 #ifndef DIGIT_BIT
    #define DIGIT_BIT     ((CHAR_BIT * sizeof(mp_digit) - 1))  /* bits per digit */
 #endif
 
+
 #define MP_DIGIT_BIT     DIGIT_BIT
 #define MP_MASK          ((((mp_digit)1)<<((mp_digit)DIGIT_BIT))-((mp_digit)1))
-#define MP_DIGIT_MAX     MP_MASK   
+#define MP_DIGIT_MAX     MP_MASK
 
 /* equalities */
 #define MP_LT        -1   /* less than */
@@ -99,7 +114,14 @@ extern int KARATSUBA_MUL_CUTOFF,
            KARATSUBA_SQR_CUTOFF,
            MONTGOMERY_EXPT_CUTOFF;
 
-#define MP_PREC                 64      /* default digits of precision */
+/* various build options */
+#define MP_PREC                 64      /* default digits of precision (must be power of two) */
+
+/* define this to use lower memory usage routines (exptmods mostly) */
+/* #define MP_LOW_MEM */
+
+/* size of comba arrays, should be at least 2 * 2**(BITS_PER_WORD - BITS_PER_DIGIT*2) */
+#define MP_WARRAY               (1 << (sizeof(mp_word) * CHAR_BIT - 2 * DIGIT_BIT + 1))
 
 typedef struct  {
     int used, alloc, sign;
@@ -118,6 +140,12 @@ int mp_init(mp_int *a);
 /* free a bignum */
 void mp_clear(mp_int *a);
 
+/* init a null terminated series of arguments */
+int mp_init_multi(mp_int *mp, ...);
+
+/* clear a null terminated series of arguments */
+void mp_clear_multi(mp_int *mp, ...);
+
 /* exchange two ints */
 void mp_exch(mp_int *a, mp_int *b);
 
@@ -143,7 +171,7 @@ void mp_zero(mp_int *a);
 void mp_set(mp_int *a, mp_digit b);
 
 /* set a 32-bit const */
-int mp_set_int(mp_int *a, unsigned long b);
+int mp_set_int(mp_int *a, unsigned int b);
 
 /* copy, b = a */
 int mp_copy(mp_int *a, mp_int *b);
@@ -162,22 +190,22 @@ void mp_rshd(mp_int *a, int b);
 /* left shift by "b" digits */
 int mp_lshd(mp_int *a, int b);
 
-/* c = a / 2^b */
+/* c = a / 2**b */
 int mp_div_2d(mp_int *a, int b, mp_int *c, mp_int *d);
 
 /* b = a/2 */
 int mp_div_2(mp_int *a, mp_int *b);
 
-/* c = a * 2^b */
+/* c = a * 2**b */
 int mp_mul_2d(mp_int *a, int b, mp_int *c);
 
 /* b = a*2 */
 int mp_mul_2(mp_int *a, mp_int *b);
 
-/* c = a mod 2^d */
+/* c = a mod 2**d */
 int mp_mod_2d(mp_int *a, int b, mp_int *c);
 
-/* computes a = 2^b */
+/* computes a = 2**b */
 int mp_2expt(mp_int *a, int b);
 
 /* makes a pseudo-random int of a given size */
@@ -216,7 +244,7 @@ int mp_sub(mp_int *a, mp_int *b, mp_int *c);
 /* c = a * b */
 int mp_mul(mp_int *a, mp_int *b, mp_int *c);
 
-/* b = a^2 */
+/* b = a*a  */
 int mp_sqr(mp_int *a, mp_int *b);
 
 /* a/b => cb + d == a */
@@ -242,7 +270,7 @@ int mp_mul_d(mp_int *a, mp_digit b, mp_int *c);
 /* a/b => cb + d == a */
 int mp_div_d(mp_int *a, mp_digit b, mp_int *c, mp_digit *d);
 
-/* c = a^b */
+/* c = a**b */
 int mp_expt_d(mp_int *a, mp_digit b, mp_int *c);
 
 /* c = a mod b, 0 <= c < b  */
@@ -271,7 +299,7 @@ int mp_gcd(mp_int *a, mp_int *b, mp_int *c);
 /* c = [a, b] or (a*b)/(a, b) */
 int mp_lcm(mp_int *a, mp_int *b, mp_int *c);
 
-/* finds one of the b'th root of a, such that |c|^b <= |a| 
+/* finds one of the b'th root of a, such that |c|**b <= |a|
  *
  * returns error if a < 0 and b is even
  */
@@ -288,7 +316,7 @@ int mp_reduce_setup(mp_int *a, mp_int *b);
 
 /* Barrett Reduction, computes a (mod b) with a precomputed value c
  *
- * Assumes that 0 < a <= b^2, note if 0 > a > -(b^2) then you can merely
+ * Assumes that 0 < a <= b*b, note if 0 > a > -(b*b) then you can merely
  * compute the reduction as -1 * mp_reduce(mp_abs(a)) [pseudo code].
  */
 int mp_reduce(mp_int *a, mp_int *b, mp_int *c);
@@ -296,12 +324,12 @@ int mp_reduce(mp_int *a, mp_int *b, mp_int *c);
 /* setups the montgomery reduction */
 int mp_montgomery_setup(mp_int *a, mp_digit *mp);
 
-/* computes a = B^n mod b without division or multiplication useful for 
+/* computes a = B**n mod b without division or multiplication useful for
  * normalizing numbers in a Montgomery system.
  */
 int mp_montgomery_calc_normalization(mp_int *a, mp_int *b);
 
-/* computes xR^-1 == x (mod N) via Montgomery Reduction */
+/* computes x/R == x (mod N) via Montgomery Reduction */
 int mp_montgomery_reduce(mp_int *a, mp_int *m, mp_digit mp);
 
 /* returns 1 if a is a valid DR modulus */
@@ -313,32 +341,38 @@ void mp_dr_setup(mp_int *a, mp_digit *d);
 /* reduces a modulo b using the Diminished Radix method */
 int mp_dr_reduce(mp_int *a, mp_int *b, mp_digit mp);
 
-/* d = a^b (mod c) */
+/* d = a**b (mod c) */
 int mp_exptmod(mp_int *a, mp_int *b, mp_int *c, mp_int *d);
 
 /* ---> Primes <--- */
-#define PRIME_SIZE	256	/* number of primes */
 
-/* table of first 256 primes */
+/* number of primes */
+#ifdef MP_8BIT
+   #define PRIME_SIZE      31
+#else
+   #define PRIME_SIZE      256
+#endif
+
+/* table of first PRIME_SIZE primes */
 extern const mp_digit __prime_tab[];
 
-/* result=1 if a is divisible by one of the first 256 primes */
+/* result=1 if a is divisible by one of the first PRIME_SIZE primes */
 int mp_prime_is_divisible(mp_int *a, int *result);
 
-/* performs one Fermat test of "a" using base "b".  
- * Sets result to 0 if composite or 1 if probable prime 
+/* performs one Fermat test of "a" using base "b".
+ * Sets result to 0 if composite or 1 if probable prime
  */
 int mp_prime_fermat(mp_int *a, mp_int *b, int *result);
 
 /* performs one Miller-Rabin test of "a" using base "b".
- * Sets result to 0 if composite or 1 if probable prime 
+ * Sets result to 0 if composite or 1 if probable prime
  */
 int mp_prime_miller_rabin(mp_int *a, mp_int *b, int *result);
 
 /* performs t rounds of Miller-Rabin on "a" using the first
  * t prime bases.  Also performs an initial sieve of trial
  * division.  Determines if "a" is prime with probability
- * of error no more than (1/4)^t.
+ * of error no more than (1/4)**t.
  *
  * Sets result to 1 if probably prime, 0 otherwise
  */
@@ -365,6 +399,9 @@ int mp_read_radix(mp_int *a, char *str, int radix);
 int mp_toradix(mp_int *a, char *str, int radix);
 int mp_radix_size(mp_int *a, int radix);
 
+int mp_fread(mp_int *a, int radix, FILE *stream);
+int mp_fwrite(mp_int *a, int radix, FILE *stream);
+
 #define mp_read_raw(mp, str, len) mp_read_signed_bin((mp), (str), (len))
 #define mp_raw_size(mp)           mp_signed_bin_size(mp)
 #define mp_toraw(mp, str)         mp_to_signed_bin((mp), (str))
diff --git a/tommath.src b/tommath.src
new file mode 100644
index 0000000..f04f324
--- /dev/null
+++ b/tommath.src
@@ -0,0 +1,2459 @@
+\documentclass[b5paper]{book}
+\usepackage{makeidx}
+\usepackage{amssymb}
+\usepackage{color}
+\usepackage{alltt}
+\usepackage{graphicx}
+\usepackage{layout}
+\def\union{\cup}
+\def\intersect{\cap}
+\def\getsrandom{\stackrel{\rm R}{\gets}}
+\def\cross{\times}
+\def\cat{\hspace{0.5em} \| \hspace{0.5em}}
+\def\catn{$\|$}
+\def\divides{\hspace{0.3em} | \hspace{0.3em}}
+\def\nequiv{\not\equiv}
+\def\approx{\raisebox{0.2ex}{\mbox{\small $\sim$}}}
+\def\lcm{{\rm lcm}}
+\def\gcd{{\rm gcd}}
+\def\log{{\rm log}}
+\def\ord{{\rm ord}}
+\def\abs{{\mathit{abs}}}
+\def\rep{{\mathit{rep}}}
+\def\mod{{\mathit\ mod\ }}
+\renewcommand{\pmod}[1]{\ ({\rm mod\ }{#1})}
+\newcommand{\floor}[1]{\left\lfloor{#1}\right\rfloor}
+\newcommand{\ceil}[1]{\left\lceil{#1}\right\rceil}
+\def\Or{{\rm\ or\ }}
+\def\And{{\rm\ and\ }}
+\def\iff{\hspace{1em}\Longleftrightarrow\hspace{1em}}
+\def\implies{\Rightarrow}
+\def\undefined{{\rm ``undefined"}}
+\def\Proof{\vspace{1ex}\noindent {\bf Proof:}\hspace{1em}}
+\let\oldphi\phi
+\def\phi{\varphi}
+\def\Pr{{\rm Pr}}
+\newcommand{\str}[1]{{\mathbf{#1}}}
+\def\F{{\mathbb F}}
+\def\N{{\mathbb N}}
+\def\Z{{\mathbb Z}}
+\def\R{{\mathbb R}}
+\def\C{{\mathbb C}}
+\def\Q{{\mathbb Q}}
+\definecolor{DGray}{gray}{0.5}
+\newcommand{\url}[1]{\mbox{$<${#1}$>$}}
+\newcommand{\emailaddr}[1]{\mbox{$<${#1}$>$}}
+\def\twiddle{\raisebox{0.3ex}{\mbox{\tiny $\sim$}}}
+\def\gap{\vspace{0.5ex}}
+\makeindex
+\begin{document}
+\frontmatter
+\pagestyle{empty}
+\title{Multiple-Precision Integer Arithmetic, \\ A Case Study Involving the LibTomMath Project \\ - DRAFT - }
+\author{\mbox{
+%\begin{small}
+\begin{tabular}{c}
+Tom St Denis \\
+Algonquin College \\
+\\
+Mads Rasmussen \\
+Open Communications Security \\
+\\
+Gregory Rose \\
+Qualcomm \\
+\end{tabular}
+%\end{small}
+}
+}
+\maketitle
+This text in its entirety is copyrighted \copyright{}2003 by Tom St Denis.  It may not be redistributed 
+electronically or otherwise without the sole permission of the author.  The text is freely redistributable as long as
+it is packaged along with the LibTomMath project in a non-commercial project.  Contact the
+author for other redistribution rights.
+
+This text corresponds to the v0.17 release of the LibTomMath project.
+
+\begin{alltt}
+Tom St Denis
+111 Banning Rd
+Ottawa, Ontario
+K2L 1C3
+Canada
+
+Phone: 1-613-836-3160
+Email: tomstdenis@iahu.ca
+\end{alltt}
+
+This text is formatted to the international B5 paper size of 176mm wide by 250mm tall using the \LaTeX{} 
+{\em book} macro package and the Perl {\em booker} package.
+
+\tableofcontents
+\listoffigures
+\chapter*{Preface}
+Blah.
+
+\mainmatter
+\pagestyle{headings}
+\chapter{Introduction}
+\section{Multiple Precision Arithmetic}
+\subsection{The Need for Multiple Precision Arithmetic}
+The most prevalent use for multiple precision arithmetic (\textit{often referred to as bignum math}) is within public
+key cryptography.   Algorithms such as RSA, Diffie-Hellman and Elliptic Curve Cryptography require large integers in order to 
+resist known cryptanalytic attacks.  Typical modern programming languages such as C and Java only provide small 
+single-precision data types which are incapable of precisely representing integers which are often hundreds of bits long.
+
+For example, consider multiplying $1,234,567$ by $9,876,543$ in C with an ``unsigned long'' data type.  With an 
+x86 machine the result is $4,136,875,833$ while the true result is $12,193,254,061,881$.  The original inputs 
+were approximately $21$ and $24$ bits respectively.  If the C language cannot multiply two relatively small values 
+together precisely how does anyone expect it to multiply two values which are considerably larger?
+
+Most advancements in fast multiple precision arithmetic stem from the desire for faster cryptographic primitives.  However, cryptography
+is not the only field of study that can benefit from fast large integer routines.  Another auxiliary use for multiple precision integers is 
+high precision floating point data types.  The basic IEEE standard floating point type is made up of an integer mantissa $q$ and an exponent $e$.  
+Numbers are given in the form $n = q \cdot b^e$ where $b = 2$ is convention.  Since IEEE is meant to be implemented in 
+hardware the precision of the mantissa is often fairly small (\textit{roughly 23 bits}).  Since the mantissa is merely an 
+integer a large multiple precision integer could be used.  In effect very high precision floating point arithmetic 
+could be performed.  This would be useful where scientific applications must minimize the total output error over long simulations.  
+
+\subsection{Multiple Precision Arithmetic}
+\index{multiple precision}
+Multiple precision arithmetic attempts to solve the shortcomings of single precision data types such as those from
+the C and Java programming languages.  In essence multiple precision arithmetic is a set of operations that can be 
+performed on members of an algebraic group whose precision is not fixed.  The algorithms when implemented to be multiple
+precision can allow a developer to work with any practical precision required.
+
+Typically the arithmetic is performed over the ring of integers denoted by a $\Z$ and referred to casually as ``bignum'' 
+routines.  However, it is possible to have rings of polynomials as well typically denoted by $\Z/p\Z \left [ X \right ]$ 
+which could have variable precision (\textit{or degree}).  This text will discuss implementation of the former, however,
+implementing polynomial basis routines should be relatively easy after reading this text.
+
+\subsection{Benefits of Multiple Precision Arithmetic}
+\index{precision} \index{accuracy}
+Precision is defined loosely as how close a given representation is to the real value.  Accuracy is defined as the 
+reproducibility of the result.  For example, the calculation $1/3 = 0.25$ is imprecise but can be accurate provided 
+it is reproducible.
+
+The benefit of multiple precision representations over single precision representations is that 
+often no precision is lost while representing the result of an operation which requires excess precision.  For example, 
+the multiplication of two $n$-bit integers requires at least $2n$ bits to represent the result.  A multiple precision 
+system would augment the precision of the destination to accommodate the result while a single precision system would
+truncate excess bits to maintain a fixed level of precision.
+
+Multiple precision representations allow for the precision to be very high (\textit{if not exacting}) but at a cost of
+modest computer resources.  The only reasonable case where a multiple precision system will lose precision is when
+emulating a floating point data type.  However, with multiple precision integer arithmetic no precision is lost.
+
+\subsection{Basis of Operations}
+At the heart of all multiple precision integer operations are the ``long-hand'' algorithms we all learnt as children 
+in grade school.  For example, to multiply $1,234$ by $981$ the student is not taught to memorize the times table for 
+$1,234$ instead they are taught how to long-multiply.  That is to multiply each column using simple single digit 
+multiplications and add the resulting products by column.  The representation that most are familiar with is known as 
+decimal or formally as radix-10. A radix-$n$ representation simply means there are $n$ possible values per digit.  
+For example, binary would be a radix-2 representation.
+
+In essence computer based multiple precision arithmetic is very much the same.  The most notable difference is the usage
+of a binary friendly radix.  That is to use a radix of the form $2^k$ where $k$ is typically the size of a machine 
+register.  Also occasionally more optimal algorithms are used to perform certain operations such as multiplication and 
+squaring instead of traditional long-hand algorithms.
+
+\section{Purpose of This Text}
+The purpose of this text is to instruct the reader regarding how to implement multiple precision algorithms.  That is 
+to not only explain the core theoretical algorithms but also the various ``house keeping'' tasks that are neglected by
+authors of other texts on the subject.  Texts such as Knuth's ``The Art of Computer Programming, vol 2.'' and the 
+Handbook of Applied Cryptography (\textit{HAC}) give considerably detailed explanations of the theoretical aspects of 
+the algorithms and very little regarding the practical aspects.  
+
+That is how an algorithm is explained and how it is actually implemented are two very different 
+realities.  For example, algorithm 14.7 on page 594 of HAC lists a relatively simple algorithm for performing multiple 
+precision integer addition.  However, what the description lacks is any discussion concerning the fact that the two 
+integer inputs may be of differing magnitudes.  Similarly the division routine (\textit{Algorithm 14.20, pp. 598}) 
+does not discuss how to handle sign or handle the dividend's decreasing magnitude in the main loop (\textit{Step \#3}).
+
+As well as the numerous practical oversights both of the texts do not discuss several key optimal algorithms required 
+such as ``Comba'' and Karatsuba multipliers and fast modular inversion.  These optimal algorithms are considerably
+vital to achieve any form of useful performance in non-trivial applications.  
+
+To solve this problem the focus of this text is on the practical aspects of implementing the algorithms that 
+constitute a multiple precision integer package with light cursory discussions on the theoretical aspects.  As a case 
+study the ``LibTomMath''\footnote{Available freely at http://math.libtomcrypt.org} package is used to demonstrate 
+algorithms with implementations that have been field tested and work very well.
+
+\section{Discussion and Notation}
+\subsection{Notation}
+A multiple precision integer of $n$-digits shall be denoted as $x = (x_{n-1} \ldots x_1 x_0)_{ \beta }$ to be the 
+multiple precision notation for the integer $x \equiv \sum_{i=0}^{n-1} x_i\beta^i$.  The elements of the array $x$ are
+said to be the radix $\beta$ digits of the integer.  For example, $x = (15,0,7)_{\beta}$ would represent the 
+integer $15\cdot\beta^2 + 0\cdot\beta^1 + 7\cdot\beta^0$.  
+
+A ``mp\_int'' shall refer to a composite structure which contains the digits of the integer as well as auxiliary data
+required to manipulate the data.  These additional members are discussed in ~BASICOP~.  For the purposes of this text
+a ``multiple precision integer'' and a ``mp\_int'' are synonymous.
+
+\index{single-precision} \index{double-precision} \index{mp\_digit} \index{mp\_word}
+For the purposes of this text a single-precision variable must be able to represent integers in the range $0 \le x < 2 \beta$ while
+a double-precision variable must be able to represent integers in the range $0 \le x < 2 \beta^2$.  Within the source code that will be
+presented the data type \textbf{mp\_digit} will represent a single-precision type while \textbf{mp\_word} will represent a 
+double-precision type.  In several algorithms (\textit{notably the Comba routines}) temporary results 
+will be stored in double-precision arrays.  For the purposes of this text $x_j$ will refer to the 
+$j$'th digit of a single-precision array and $\hat x_j$ will refer to the $j$'th digit of a double-precision
+array.
+
+\subsection{Work Effort}
+\index{big-O}
+To measure the efficiency of various algorithms a modified big-O notation is used.  In this system all 
+single precision operations are considered to have the same cost\footnote{Except where explicitly noted.}.  
+That is a single precision addition, multiplication and division are assumed to take the same time to 
+complete.  While this is generally not true in practice it will simplify the discussions considerably.
+
+Some algorithms have slight advantages over others which is why some constants will not be removed in 
+the notation.  For example, a normal multiplication requires $O(n^2)$ work while a squaring requires 
+$O({{n^2 + n}\over 2})$ work.  In standard big-O notation these would be said to be equivalent.  However, in the 
+context of this text the magnitude of the inputs will not approach an infinite size.  This means the conventional limit 
+notation wisdom does not apply to the cancellation of constants.
+
+Throughout the discussions various ``work levels'' will be discussed.  These levels are the $O(1)$,
+$O(n)$, $O(n^2)$, ..., $O(n^k)$ work efforts.  For example, operations at the $O(n^k)$ ``level'' are said to be
+executed more frequently than operations at the $O(n^m)$ ``level'' when $k > m$.  Obviously most optimizations will pay
+off the most at the higher levels since they represent the bulk of the effort required.  
+
+\section{Exercises}
+Within the more advanced chapters a section will be set aside to give the reader some challenging exercises.  These exercises are not 
+designed to be prize winning problems yet instead to be thought provoking.  Wherever possible the problems are forward minded stating 
+problems that will be answered in subsequent chapters.  The reader is encouraged to finish the exercises as they appear to get a 
+better understanding of the subject material.  
+
+Similar to the exercises of \cite{TAOCPV2} as explained on pp.\textit{ix} these exercises are given a scoring system.  However, unlike 
+\cite{TAOCPV2} the problems do not get nearly as hard as often.  The scoring of these exercises ranges from one (\textit{the easiest}) to
+five (\textit{the hardest}).  The following table summarizes the scoring.
+
+\vspace{5mm}
+\begin{tabular}{cl}
+$\left [ 1 \right ]$ & An easy problem that should only take the reader a matter of \\
+                     & minutes to solve.  Usually does not involve much computer time. \\
+                     & \\
+$\left [ 2 \right ]$ & An easy problem that involves a marginal amount of computer \\
+                     & time usage.  Usually requires a program to be written to \\
+                     & solve the problem. \\
+                     & \\
+$\left [ 3 \right ]$ & A moderately hard problem that requires a non-trivial amount \\
+                     & of work.  Usually involves trivial research and development of \\
+                     & new theory from the perspective of a student. \\
+                     & \\
+$\left [ 4 \right ]$ & A moderately hard problem that involves a non-trivial amount \\
+                     & of work and research.  The solution to which will demonstrate \\
+                     & a higher mastery of the subject matter. \\
+                     & \\
+$\left [ 5 \right ]$ & A hard problem that involves concepts that are non-trivial.  \\
+                     & Solutions to these problems will demonstrate a complete mastery \\
+                     & of the given subject. \\
+                     & \\
+\end{tabular}
+
+Essentially problems at the first level are meant to be simple questions that the reader can answer quickly without programming a solution or
+devising new theory.  These problems are quick tests to see if the material is understood.  Problems at the second level are also
+designed to be easy but will require a program or algorithm to be implemented to arrive at the answer.  
+
+Problems at the third level are meant to be a bit more difficult.  Often the answer is fairly obvious but arriving at an exacting solution
+requires some thought and skill.  These problems will almost always involve devising a new algorithm or implementing a variation of
+another algorithm.
+
+Problems at the fourth level are meant to be even more difficult as well as involve some research.  The reader will most likely not know
+the answer right away nor will this text provide the exact details of the answer (\textit{or at least not until a subsequent chapter}).  Problems
+at the fifth level are meant to be the hardest problems relative to all the other problems in the chapter.  People who can correctly 
+answer fifth level problems have a mastery of the subject matter at hand.
+
+Often problems will be tied together.  The purpose of this is to start a chain of thought that will be discussed in future chapters.  The reader
+is encouraged to answer the follow-up problems and try to draw the relevance of problems.
+
+\chapter{Introduction to LibTomMath}
+
+\section{What is the LibTomMath?}
+LibTomMath is a free and open source multiple precision number theoretic library written in portable ISO C
+source code.  By portable it is meant that the library does not contain any code that is platform dependent or otherwise
+problematic to use on any given platform.  The library has been successfully tested under numerous operating systems 
+including Solaris, MacOS, Windows, Linux, PalmOS and on standalone hardware such as the Gameboy Advance.  The 
+library is designed to contain enough functionality to be able to develop number theoretic applications such as public 
+key cryptosystems.
+
+\section{Goals of the LibTomMath}
+
+Even though the library is written entirely in portable ISO C considerable care has been taken to 
+optimize the algorithm implementations within the library.  Specifically the code has been written to work well with
+the GNU C Compiler (\textit{GCC}) on both x86 and ARMv4 processors.  Wherever possible optimal 
+algorithms (\textit{such as Karatsuba multiplication, sliding window exponentiation and Montgomery reduction.}) have 
+been provided to make the library as efficient as possible.  Even with the optimal and sometimes specialized 
+algorithms that have been included the API has been kept as simple as possible.  Often generic place holder routines 
+will make use of specialized algorithms automatically without the developer's attention.  One such example
+is the generic multiplication algorithm \textbf{mp\_mul()} which will automatically use Karatsuba multiplication if the 
+inputs are of a specific size.
+
+Making LibTomMath as efficient as possible is not the only goal of the LibTomMath project.  Ideally the library should 
+be source compatible with another popular library which makes it more attractive for developers to use.  In this case the
+MPI library was used as an API template for all the basic functions.
+
+The project is also meant to act as a learning tool for students.  The logic being that no easy to follow ``bignum'' 
+library exists which can be used to teach computer science students how to perform fast and reliable multiple precision 
+arithmetic.  To this end the source code has been given quite a few comments and algorithm discussion points.  Often 
+where applicable routines have more comments than lines of code.
+
+\section{Choice of LibTomMath}
+LibTomMath was chosen as the case study of this text not only because the author of both projects is one and the same but
+for more worthy reasons.  Other libraries such as GMP, MPI, LIP and OpenSSL have multiple precision 
+integer arithmetic routines but would not be ideal for this text for numerous reasons as will be explained in the 
+following sub-sections.
+
+\subsection{Code Base}
+The LibTomMath code base is all portable ISO C source code.  This means that there are no platform dependent conditional
+segments of code littered throughout the source.  This clean and uncluttered approach to the library means that a
+developer can more readily ascertain the true intent of a given section of source code without trying to keep track of
+what conditional code will be used.
+
+The code base of LibTomMath is also exceptionally well organized.  Each function is in its own separate source code file 
+which allows the reader to find a given function very fast.  When compiled with GCC for the x86 processor the entire 
+library is a mere 87,760 bytes (\textit{$116,182$ bytes for ARMv4 processors}).  This includes every single function 
+LibTomMath provides from basic arithmetic to various number theoretic functions such as modular exponentiation, various 
+reduction algorithms and Jacobi symbol computation.  
+
+By comparison MPI which has fewer number theoretic functions than LibTomMath compiled with the same conditions is 
+45,429 bytes (\textit{$54,536$ for ARMv4}).  GMP which has a rather large collection of functions with the default 
+configuration on an x86 Athlon is 2,950,688 bytes.  Note that while LibTomMath has fewer functions than GMP it has been
+used as the sole basis for several public key cryptosystems without having to seek additional outside functions
+to supplement the library.
+
+\subsection{API Simplicity}
+LibTomMath is designed after the MPI library and shares the API design.  Quite often programs that use MPI will build 
+with LibTomMath without change. The function names are relatively straight forward as to what they perform.  Almost all of the 
+functions share the same parameter passing convention; the few minor exceptions exist for good reasons, as will be discussed.  
+The learning curve is fairly shallow with the API provided which is an extremely valuable benefit for the 
+student and developer alike.  
+
+The LIP library is an example of a library with an API that is awkward to work with.  LIP uses function names that are often ``compressed'' to 
+illegible short hand.  LibTomMath does not share this fault.
+
+\subsection{Optimizations}
+While LibTomMath is certainly not the fastest library (\textit{GMP often beats LibTomMath by a factor of two}) it does
+feature a set of optimal algorithms for tasks ranging from modular reduction to squaring.  GMP and LIP also feature
+such optimizations while MPI only uses baseline algorithms with no optimizations.
+
+LibTomMath is almost always an order of magnitude faster than the MPI library at computationally expensive tasks such as modular
+exponentiation.  In the grand scheme of ``bignum'' libraries LibTomMath is faster than the average library and usually  
+slower than the best libraries such as GMP and OpenSSL by a small factor.
+
+\subsection{Portability and Stability}
+LibTomMath will build ``out of the box'' on any platform equipped with a modern version of the GNU C Compiler 
+(\textit{GCC}).  This means that without changes the library will build without configuration or setting up any 
+variables.  LIP and MPI will build ``out of the box'' as well but have numerous known bugs.  Most notably the author of 
+MPI is not working on his library anymore.  
+
+GMP requires a configuration script to run and will not build out of the box.   GMP and LibTomMath are still in active
+development and are very stable across a variety of platforms.
+
+\subsection{Choice}
+LibTomMath is a relatively compact, well documented, highly optimized and portable library which seems only natural for
+the case study of this text.  Various source files from the LibTomMath project will be included within the text.  However, the 
+reader is encouraged to download their own copy of the library to actually be able to work with the library.  
+
+\chapter{Getting Started}
+MARK,BASICOP
+\section{Library Basics}
+To get the ``ball rolling'' so to speak a primitive data type and a series of primitive algorithms must be established.  First a data
+type that will hold the information required to maintain a multiple precision integer must be designed.  With this basic data type a series
+of low level algorithms for initializing, clearing, growing and clamping integers can be developed to form the basis of the entire
+package of algorithms.
+
+\section{The mp\_int structure}
+First the data type for storing multiple precision integers must be designed.  This data type must be able to hold information to 
+maintain an array of digits, how many are actually used in the representation and the sign.  The ISO C standard does not provide for 
+any such data type but it does provide for making composite data types known as structures.  The following is the structure definition 
+used within LibTomMath.
+
+\index{mp\_int}
+\begin{verbatim}
+typedef struct  {
+    int used, alloc, sign;
+    mp_digit *dp;
+} mp_int;
+\end{verbatim}
+
+The \textbf{used} parameter denotes how many digits of the array \textbf{dp} are actually being used.  The array 
+\textbf{dp} holds the digits that represent the integer desired.  The \textbf{alloc} parameter denotes how 
+many digits are available in the array to use by functions before it has to increase in size.  When the \textbf{used} count 
+of a result would exceed the \textbf{alloc} count all LibTomMath routines will automatically increase the size of the 
+array to accommodate the precision of the result.  The \textbf{sign} parameter denotes the sign as either zero/positive 
+(\textbf{MP\_ZPOS}) or negative (\textbf{MP\_NEG}).  
+
+\section{Argument Passing}
+A convention of argument passing must be adopted early on in the development of any library.  Making the function prototypes
+consistent will help eliminate many headaches in the future as the library grows to significant complexity.  In LibTomMath the multiple precision 
+integer functions accept parameters from left to right as pointers to mp\_int structures.  That means that the source operands are 
+placed on the left and the destination on the right.   Consider the following examples.
+
+\begin{verbatim}
+   mp_mul(&a, &b, &c);   /* c = a * b */
+   mp_add(&a, &b, &a);   /* a = a + b */
+   mp_sqr(&a, &b);       /* b = a * a */
+\end{verbatim}
+
+The left to right order is a fairly natural way to implement the functions since it lets the developer read aloud the
+functions and make sense of them.  For example, the first function would read ``multiply a and b and store in c''.
+
+Certain libraries (\textit{LIP by Lenstra for instance}) accept parameters the other way around.  That is the destination
+on the left and arguments on the right.  In truth it is entirely a matter of preference.  
+
+Another very useful design consideration is whether to allow argument sources to also be a destination.  For example, the
+second example (\textit{mp\_add}) adds $a$ to $b$ and stores in $a$.  This is an important feature to implement since it
+allows the higher up functions to cut down on the number of variables.  However, to implement this feature specific
+care has to be given to ensure the destination is not written before the source is fully read.
+
+\section{Return Values}
+A well implemented library, no matter what its purpose, should trap as many runtime errors as possible and return them to the 
+caller.  By catching runtime errors a library can be guaranteed to prevent undefined behaviour within reason.  In a multiple precision 
+library the only errors that are bound to occur are related to inappropriate inputs (\textit{division by zero for instance}) or 
+memory allocation errors.
+
+In LibTomMath any function that can cause a runtime error will return an error as an \textbf{int} data type with one of the 
+following values.
+
+\index{MP\_OKAY} \index{MP\_VAL} \index{MP\_MEM}
+\begin{center}
+\begin{tabular}{|l|l|}
+\hline \textbf{Value} & \textbf{Meaning} \\
+\hline \textbf{MP\_OKAY} & The function was successful \\
+\hline \textbf{MP\_VAL}  & One of the input value(s) was invalid \\
+\hline \textbf{MP\_MEM}  & The function ran out of heap memory \\
+\hline
+\end{tabular}
+\end{center}
+
+When an error is detected within a function it should free any memory it allocated and return as soon as possible.  The goal
+is to leave the system in the same state the system was in when the function was called.  Error checking with this style of API is fairly simple.
+
+\begin{verbatim}
+   int err;
+   if ((err = mp_add(&a, &b, &c)) != MP_OKAY) {
+      printf("Error: %d\n", err);
+      exit(EXIT_FAILURE);
+   }
+\end{verbatim}
+
+The GMP library uses C style \textit{signals} to flag errors which is of questionable use.  Not all errors are fatal 
+and it is not ideal to force developers to have signal handlers for such cases.
+
+\section{Initialization and Clearing}
+The logical starting point when actually writing multiple precision integer functions is the initialization and 
+clearing of the integers.  These two functions will be used by far the most throughout the algorithms whenever 
+temporary integers are required.
+
+Given the basic mp\_int structure an initialization routine must first allocate memory to hold the digits of
+the integer.  Often it is optimal to allocate a sufficiently large pre-set number of digits even considering
+the initial integer will represent zero.  If only a single digit were allocated quite a few re-allocations
+would occur for the majority of inputs.  There exists a tradeoff between how many default digits to allocate
+and how many re-allocations are tolerable.  
+
+If the memory for the digits has been successfully allocated then the rest of the members of the structure must
+be initialized.  Since the initial state is to represent a zero integer the digits allocated must all be zeroed.  The
+\textbf{used} count is set to zero and \textbf{sign} is set to \textbf{MP\_ZPOS}.
+
+\subsection{Initializing an mp\_int}
+To initialize an mp\_int the mp\_init algorithm shall be used.  The purpose of this algorithm is to allocate 
+the memory required and initialize the integer to a default representation of zero.
+
+\begin{figure}[here]
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{mp\_init}. \\
+\textbf{Input}.   An mp\_int $a$ \\
+\textbf{Output}.  Allocate memory for the digits and set to a zero state. \\
+\hline \\
+1.  Allocate memory for \textbf{MP\_PREC} digits. \\
+2.  If the allocation failed then return(\textit{MP\_MEM}) \\
+3.  for $n$ from $0$ to $MP\_PREC - 1$ do  \\
+\hspace{3mm}3.1  $a_n \leftarrow 0$\\
+4.  $a.sign \leftarrow MP\_ZPOS$\\
+5.  $a.used \leftarrow 0$\\
+6.  $a.alloc \leftarrow MP\_PREC$\\
+7.  Return(\textit{MP\_OKAY})\\
+\hline
+\end{tabular}
+\end{center}
+\caption{Algorithm mp\_init}
+\end{figure}
+
+\textbf{Algorithm mp\_init.}
+The \textbf{MP\_PREC} variable is a simple constant used to dictate minimal precision of allocated integers.  It is ideally at least equal to $32$ but 
+can be any reasonable power of two.  Step one and two allocate the memory and account for it.  If the allocation fails the algorithm returns
+immediately to signal the failure.  Step three will ensure that all the digits are in the default state of zero.  Finally steps 
+four through six set the default settings of the \textbf{sign}, \textbf{used} and \textbf{alloc} members of the mp\_int structure.
+
+EXAM,bn_mp_init.c
+
+The \textbf{OPT\_CAST} type cast on line @22,OPT_CAST@ is designed to allow C++ compilers to build the code out of
+the box.  Microsoft C V5.00 is known to cause problems without the cast.  Also note that if the memory
+allocation fails the other members of the mp\_int will be in an undefined state.  The code from 
+line @29,a->used@ to line @31,a->sign@ sets the default state for a mp\_int which is zero, positive and no used digits.
+
+\subsection{Clearing an mp\_int}
+When an mp\_int is no longer required the memory allocated for it can be cleared from the heap with 
+the mp\_clear algorithm.
+
+\begin{figure}[here]
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{mp\_clear}. \\
+\textbf{Input}.   An mp\_int $a$ \\
+\textbf{Output}.  The memory for $a$ is cleared. \\
+\hline \\
+1.  If $a$ has been previously freed then return(\textit{MP\_OKAY}). \\
+2.  Free the digits of $a$ and mark $a$ as freed. \\
+3.  $a.used \leftarrow 0$ \\
+4.  $a.alloc \leftarrow 0$ \\
+5.  Return(\textit{MP\_OKAY}). \\
+\hline
+\end{tabular}
+\end{center}
+\caption{Algorithm mp\_clear}
+\end{figure}
+
+\textbf{Algorithm mp\_clear.}
+In steps one and two the memory for the digits is only freed if it has not been previously released.  
+This is more of concern for the implementation since it is used to prevent ``double-free'' errors.  It also helps catch
+code errors where mp\_ints are used after being cleared.  Similarly steps three and four set the 
+\textbf{used} and \textbf{alloc} to known values which would be easy to spot during debugging.  For example, if an mp\_int is expected
+to be non-zero and its \textbf{used} member observed to be zero (\textit{due to being cleared}) then an obvious bug in the code has been
+spotted.
+
+EXAM,bn_mp_clear.c
+
+The \textbf{if} statement on line @21,a->dp != NULL@ prevents the heap from being corrupted if a user double-frees an 
+mp\_int.  For example, a trivial case of this bug would be as follows.
+
+\begin{verbatim}
+mp_int a;
+mp_init(&a);
+mp_clear(&a);
+mp_clear(&a);
+\end{verbatim}
+
+Without that check the code would try to free the memory allocated for the digits twice which will cause most standard C
+libraries to cause a fault.  Also by setting the pointer to \textbf{NULL} it helps debug code that may inadvertently 
+free the mp\_int before it is truly not needed.  The allocated digits are set to zero before being freed on line @24,memset@.  
+This is ideal for cryptographic situations where the mp\_int is a secret parameter.
+
+The following snippet is an example of using both the init and clear functions.  
+
+\begin{small}
+\begin{verbatim}
+#include <tommath.h>
+#include <stdio.h>
+#include <stdlib.h>
+int main(void)
+{
+   mp_int num;
+   int err;
+   
+   /* init the bignum */
+   if ((err = mp_init(&num)) != MP_OKAY) {
+      printf("Error: %d\n", err);
+      return EXIT_FAILURE;
+   }
+   
+   /* do work with it ... */
+   
+   /* clear up */
+   mp_clear(&num);
+   
+   return EXIT_SUCCESS;
+}
+\end{verbatim}
+\end{small}
+
+\section{Other Initialization Routines}
+
+It is often helpful to have specialized initialization algorithms to simplify the design of other algorithms.  For example, an 
+initialization followed by a copy is a common operation when temporary copies of integers are required.  It is quite
+beneficial to have a series of simple helper functions available.
+
+\subsection{Initializing Variable Sized mp\_int Structures}
+Occasionally the number of digits required will be known in advance of an initialization.  In these
+cases the mp\_init\_size algorithm can be of use.  The purpose of this algorithm is similar to mp\_init except that 
+it will allocate \textit{at least} a specified number of digits.  This is ideal to prevent re-allocations when the 
+input size is known.
+
+\newpage\begin{figure}[here]
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{mp\_init\_size}. \\
+\textbf{Input}.   An mp\_int $a$ and the requested number of digits $b$\\
+\textbf{Output}.  $a$ is initialized to hold at least $b$ digits. \\
+\hline \\
+1.  $u \leftarrow b\mbox{ (mod }MP\_PREC\mbox{)}$ \\
+2.  $v \leftarrow b + 2 \cdot MP\_PREC - u$ \\
+3.  Allocate $v$ digits. \\
+4.  If the allocation failed then return(\textit{MP\_MEM}). \\
+5.  for $n$ from $0$ to $v - 1$ do \\
+\hspace{3mm}5.1  $a_n \leftarrow 0$ \\
+6.  $a.sign \leftarrow MP\_ZPOS$\\
+7.  $a.used \leftarrow 0$\\
+8.  $a.alloc \leftarrow v$\\
+9.  Return(\textit{MP\_OKAY})\\
+\hline
+\end{tabular}
+\end{center}
+\caption{Algorithm mp\_init\_size}
+\end{figure}
+
+\textbf{Algorithm mp\_init\_size.}
+The value of $v$ is calculated to be at least the requested amount of digits $b$ plus additional padding.  The padding is calculated
+to be at least \textbf{MP\_PREC} digits plus enough digits to make the digit count a multiple of \textbf{MP\_PREC}.  This padding is used to 
+prevent trivial allocations from becoming a bottleneck in the rest of the algorithms that depend on this.
+
+EXAM,bn_mp_init_size.c
+
+Line @23,MP_PREC@ will ensure that the number of digits actually allocated is padded up to the next multiple of 
+\textbf{MP\_PREC} plus an additional \textbf{MP\_PREC}.  This ensures that the number of allocated digits is 
+always greater than the amount requested.  As a result it prevents many trivial memory allocations.  The value of 
+\textbf{MP\_PREC} is defined in ``tommath.h'' and must be a power of two.
+
+\subsection{Creating a Clone}
+Another common sequence of operations is to make a local temporary copy of an argument.  To initialize then copy a mp\_int will be known as 
+creating a clone.  This is useful within functions that need to modify an integer argument but do not wish to actually modify the original copy.  
+The mp\_init\_copy algorithm will perform this very task.
+
+\begin{figure}[here]
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{mp\_init\_copy}. \\
+\textbf{Input}.   An mp\_int $a$ and $b$\\
+\textbf{Output}.  $a$ is initialized to be a copy of $b$. \\
+\hline \\
+1.  Init $a$.  (\textit{hint: use mp\_init}) \\
+2.  If the init of $a$ was unsuccessful return(\textit{MP\_MEM}) \\
+3.  Copy $b$ to $a$.  (\textit{hint: use mp\_copy}) \\
+4.  Return the status of the copy operation. \\
+\hline
+\end{tabular}
+\end{center}
+\caption{Algorithm mp\_init\_copy}
+\end{figure}
+
+\textbf{Algorithm mp\_init\_copy.}
+This algorithm will initialize a mp\_int variable and copy another previously initialized mp\_int variable into it.  The algorithm will
+detect when the initialization fails and returns the error to the calling algorithm.  As such this algorithm will perform two operations
+in one step.  
+
+EXAM,bn_mp_init_copy.c
+
+This will initialize \textbf{a} and make it a verbatim copy of the contents of \textbf{b}.  Note that 
+\textbf{a} will have its own memory allocated which means that \textbf{b} may be cleared after the call
+and \textbf{a} will be left intact.  
+
+\subsection{Multiple Integer Initializations}
+Occasionally a function will require a series of mp\_int data types to be made available.  The mp\_init\_multi algorithm
+is provided to simplify such cases.  The purpose of this algorithm is to initialize a variable length array of mp\_int 
+structures at once.  As a result algorithms that require multiple integers only have to use 
+one algorithm to initialize all the mp\_int variables.
+
+\begin{figure}[here]
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{mp\_init\_multi}. \\
+\textbf{Input}.   Variable length array of mp\_int variables of length $k$. \\
+\textbf{Output}.  The array is initialized such that each mp\_int is ready to use. \\
+\hline \\
+1.  for $n$ from 0 to $k - 1$ do \\
+\hspace{+3mm}1.1.  Initialize the $n$'th mp\_int (\textit{hint: use mp\_init}) \\
+\hspace{+3mm}1.2.  If initialization failed then do \\
+\hspace{+6mm}1.2.1.  for $j$ from $0$ to $n - 1$ do \\
+\hspace{+9mm}1.2.1.1.  Free the $j$'th mp\_int (\textit{hint: use mp\_clear}) \\
+\hspace{+6mm}1.2.2.   Return(\textit{MP\_MEM}) \\
+2.  Return(\textit{MP\_OKAY}) \\
+\hline
+\end{tabular}
+\end{center}
+\caption{Algorithm mp\_init\_multi}
+\end{figure}
+
+\textbf{Algorithm mp\_init\_multi.}
+The algorithm will initialize the array of mp\_int variables one at a time.  As soon as a runtime error is detected (\textit{step 1.2}) all of
+the previously initialized variables are cleared.  The goal is an ``all or nothing'' initialization which allows for quick recovery from runtime 
+errors.
+
+\subsection{Multiple Integer Clearing}
+Similarly to clear a variable length list of mp\_int structures the mp\_clear\_multi algorithm will be used.
+
+EXAM,bn_mp_multi.c
+
+Consider the following snippet which demonstrates how to use both routines.
+\begin{small}
+\begin{verbatim}
+#include <tommath.h>
+#include <stdio.h>
+#include <stdlib.h>
+int main(void)
+{
+   mp_int num1, num2, num3;
+   int err;
+   
+   if ((err = mp_init_multi(&num1, &num2, &num3, NULL)) != MP_OKAY) {
+      printf("Error: %d\n", err);
+      return EXIT_FAILURE;
+   }
+   
+   /* at this point num1/num2/num3 are ready */
+   
+   /* free them */
+   mp_clear_multi(&num1, &num2, &num3, NULL);
+   
+   return EXIT_SUCCESS;
+}
+\end{verbatim}
+\end{small}
+
+\section{Maintenance}
+A small useful collection of mp\_int maintenance functions will also prove useful.  
+
+\subsection{Augmenting Integer Precision}
+When storing a value in an mp\_int sufficient digits must be available to accommodate the entire value without
+loss of precision.  Quite often the size of the array given by the \textbf{alloc} member is large enough to simply
+increase the \textbf{used} digit count.  However, when the size of the array is too small it must be re-sized 
+appropriately to accommodate the result.  The mp\_grow algorithm will provide this functionality.
+
+\begin{figure}[here]
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{mp\_grow}. \\
+\textbf{Input}.   An mp\_int $a$ and an integer $b$. \\
+\textbf{Output}.  $a$ is expanded to accommodate $b$ digits. \\
+\hline \\
+1.  if $a.alloc \ge b$ then return(\textit{MP\_OKAY}) \\
+2.  $u \leftarrow b\mbox{ (mod }MP\_PREC\mbox{)}$ \\
+3.  $v \leftarrow b + 2 \cdot MP\_PREC - u$ \\
+4.  Re-Allocate the array of digits $a$ to size $v$ \\
+5.  If the allocation failed then return(\textit{MP\_MEM}). \\
+6.  for $n$ from $a.alloc$ to $v - 1$ do  \\
+\hspace{+3mm}6.1  $a_n \leftarrow 0$ \\
+7.  $a.alloc \leftarrow v$ \\
+8.  Return(\textit{MP\_OKAY}) \\
+\hline
+\end{tabular}
+\end{center}
+\caption{Algorithm mp\_grow}
+\end{figure}
+
+\textbf{Algorithm mp\_grow.}
+Step one will prevent a re-allocation from being performed if it was not required.  This is useful to prevent mp\_ints
+from growing excessively in code that erroneously calls mp\_grow.  Similar to mp\_init\_size the requested digit count
+is padded to provide more digits than requested.  
+
+In step four it is assumed that the reallocation leaves the lower $a.alloc$ digits intact.  Much akin to how the 
+\textit{realloc} function from the standard C library works.  Since the newly allocated digits are assumed to contain
+undefined values they are also initially zeroed.
+
+EXAM,bn_mp_grow.c
+
+The first step is to see if we actually need to perform a re-allocation at all.  This is tested for on line 
+@24,a->alloc < size@.  Similar to mp\_init\_size the same code on line @26,MP_PREC - 1@ was used to resize the 
+digits requested.  A simple for loop from line @34,a->alloc@ to line @38,}@ will zero all digits that were above the 
+old \textbf{alloc} limit to make sure the integer is in a known state.
+
+\subsection{Clamping Excess Digits}
+When a function anticipates a result will be $n$ digits it is simpler to assume this is true within the body of 
+the function.  For example, a multiplication of an $i$ digit number by a $j$ digit number produces a result of at most 
+$i + j$ digits.  It is entirely possible that the result is $i + j - 1$ digits though, with no final carry into the last 
+position.  However, suppose the destination had to be first expanded (\textit{via mp\_grow}) to accommodate $i + j - 1$
+digits then further expanded to accommodate the final carry.  That would be a considerable waste of time since heap
+operations are relatively slow.
+
+The ideal solution is to always assume the result is $i + j$ digits and fix up the \textbf{used} count after the function
+terminates.  This way a single heap operation (\textit{at most}) is required.  However, if the result was not checked
+there would be an excess high order zero digit.  
+
+For example, suppose the product of two integers was $x_n = (0x_{n-1}x_{n-2}...x_0)_{\beta}$.  The leading zero digit 
+will not contribute to the precision of the result.  In fact, through subsequent operations more leading zero digits would
+accumulate to the point the size of the integer would be prohibitive.  As a result even though the precision is very 
+low the representation is excessively large.  
+
+The mp\_clamp algorithm is designed to solve this very problem.  It will trim leading zeros by decrementing the 
+\textbf{used} count until a non-zero leading digit is found.  Also in this system, zero is considered to be a positive 
+number which means that if the \textbf{used} count is decremented to zero the sign must be set to \textbf{MP\_ZPOS}.
+
+\begin{figure}[here]
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{mp\_clamp}. \\
+\textbf{Input}.   An mp\_int $a$ \\
+\textbf{Output}.  Any excess leading zero digits of $a$ are removed \\
+\hline \\
+1.  while $a.used > 0$ and $a_{a.used - 1} = 0$ do \\
+\hspace{+3mm}1.1  $a.used \leftarrow a.used - 1$ \\
+2.  if $a.used = 0$ then do \\
+\hspace{+3mm}2.1  $a.sign \leftarrow MP\_ZPOS$ \\
+\hline \\
+\end{tabular}
+\end{center}
+\caption{Algorithm mp\_clamp}
+\end{figure}
+
+\textbf{Algorithm mp\_clamp.}
+As can be expected this algorithm is very simple.  The loop on step one is intended to iterate only once or twice at
+the most.  For example, for cases where there is not a carry to fill the last position.  Step two fixes the sign for 
+when all of the digits are zero to ensure that the mp\_int is valid at all times.
+
+EXAM,bn_mp_clamp.c
+
+Note on line @27,while@ how the test for the \textbf{used} count is made on the left of the \&\& operator.  In the C programming
+language the terms to \&\& are evaluated left to right with a boolean short-circuit if any condition fails.  This is 
+important since if the \textbf{used} is zero the test on the right would fetch below the array.  That is obviously 
+undesirable.  The parenthesis on line @28,a->used@ is used to make sure the \textbf{used} count is decremented and not
+the pointer ``a''.  
+
+\section*{Exercises}
+\begin{tabular}{cl}
+$\left [ 1 \right ]$ & Discuss the relevance of the \textbf{used} member of the mp\_int structure. \\
+                     & \\
+$\left [ 1 \right ]$ & Discuss the consequences of not using padding when performing allocations.  \\
+                     & \\
+$\left [ 2 \right ]$ & Estimate an ideal value for \textbf{MP\_PREC} when performing 1024-bit RSA \\
+                     & encryption when $\beta = 2^{28}$.  \\
+                     & \\
+$\left [ 1 \right ]$ & Discuss the relevance of the algorithm mp\_clamp.  What does it prevent? \\
+                     & \\
+$\left [ 1 \right ]$ & Give an example of when the algorithm  mp\_init\_copy might be useful. \\
+                     & \\
+\end{tabular}
+
+
+\chapter{Basic Operations}
+\section{Copying an Integer}
+After the various house-keeping routines are in place, simple algorithms can be designed to take advantage of them.  Being able
+to make a verbatim copy of an integer is a very useful function to have.  To copy an integer the mp\_copy algorithm will be used.
+
+\newpage\begin{figure}[here]
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{mp\_copy}. \\
+\textbf{Input}.  An mp\_int $a$ and $b$. \\
+\textbf{Output}.  Store a copy of $a$ in $b$. \\
+\hline \\
+1.  Check if $a$ and $b$ point to the same location in memory. \\
+2.  If true then return(\textit{MP\_OKAY}). \\
+3.  If $b.alloc < a.used$ then grow $b$ to $a.used$ digits.  (\textit{hint: use mp\_grow}) \\
+4.  If failed to grow then return(\textit{MP\_MEM}). \\
+5.  for $n$ from 0 to $a.used - 1$ do \\
+\hspace{3mm}5.1  $b_{n} \leftarrow a_{n}$ \\
+6.  if $a.used < b.used$ then \\ 
+\hspace{3mm}6.1.  for $n$ from $a.used$ to $b.used - 1$ do \\
+\hspace{6mm}6.1.1  $b_{n} \leftarrow 0$ \\
+7.  $b.used \leftarrow a.used$ \\
+8.  $b.sign \leftarrow a.sign$ \\
+9.  return(\textit{MP\_OKAY}) \\
+\hline
+\end{tabular}
+\end{center}
+\caption{Algorithm mp\_copy}
+\end{figure}
+
+\textbf{Algorithm mp\_copy.}
+Step 1 and 2 make sure that the two mp\_ints are unique.  This allows the user to call the copy function with
+potentially the same input and not waste time.  Step 3 and 4 ensure that the destination is large enough to
+hold a copy of the input $a$.  Note that the \textbf{used} member of $b$ may be smaller than the \textbf{used}
+member of $a$ but a memory re-allocation is only required if the \textbf{alloc} member of $b$ is smaller.  This
+prevents trivial memory reallocations.
+
+Step 5 copies the digits from $a$ to $b$ while step 6 ensures that if initially $\vert b \vert > \vert a \vert$,
+the leading digits of $b$ will be zeroed.  Finally steps 7 and 8 copy the \textbf{used} and \textbf{sign} members over 
+which completes the copy operation.
+
+EXAM,bn_mp_copy.c
+
+Source lines @23,if dst ==@-@31,}@ do the initial house keeping.  That is to see if the input is unique and if so to 
+make sure there is enough room.  If not enough space is available it returns the error and leaves the destination variable
+intact.
+
+The inner loop of the copy operation is contained between lines @34,{@ and @50,}@.  Many LibTomMath routines are designed with this source code style
+in mind, making aliases to shorten lengthy pointers (\textit{see line @38,->@ and @39,->@}) for rapid use.  Also the
+use of nested braces creates a simple way to denote various portions of code that reside on various work levels.  Here, the copy loop is at the 
+$O(n)$ level.  
+
+\section{Zeroing an Integer}
+Resetting an mp\_int to the default state is a common step in many algorithms.  The mp\_zero algorithm will be the algorithm used to
+perform this task.
+
+\begin{figure}[here]
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{mp\_zero}. \\
+\textbf{Input}.   An mp\_int $a$ \\
+\textbf{Output}.  Zero the contents of $a$ \\
+\hline \\
+1.  $a.used \leftarrow 0$ \\
+2.  $a.sign \leftarrow$ MP\_ZPOS \\
+3.  for $n$ from 0 to $a.alloc - 1$ do \\
+\hspace{3mm}3.1  $a_n \leftarrow 0$ \\
+\hline
+\end{tabular}
+\end{center}
+\caption{Algorithm mp\_zero}
+\end{figure}
+
+\textbf{Algorithm mp\_zero.}
+This algorithm simply resets a mp\_int to the default state.  
+
+EXAM,bn_mp_zero.c
+
+After the function is completed, all of the digits are zeroed, the \textbf{used} count is zeroed and the 
+\textbf{sign} variable is set to \textbf{MP\_ZPOS}.
+
+\section{Sign Manipulation}
+\subsection{Absolute Value}
+With the mp\_int representation of an integer, calculating the absolute value is trivial.  The mp\_abs algorithm will compute
+the absolute value of an mp\_int.
+
+\begin{figure}[here]
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{mp\_abs}. \\
+\textbf{Input}.   An mp\_int $a$ \\
+\textbf{Output}.  Computes $b = \vert a \vert$ \\
+\hline \\
+1.  Copy $a$ to $b$.  (\textit{hint: use mp\_copy}) \\
+2.  If the copy failed return(\textit{MP\_MEM}). \\
+3.  $b.sign \leftarrow MP\_ZPOS$ \\
+4.  Return(\textit{MP\_OKAY}) \\
+\hline
+\end{tabular}
+\end{center}
+\caption{Algorithm mp\_abs}
+\end{figure}
+
+\textbf{Algorithm mp\_abs.}
+This algorithm computes the absolute value of an mp\_int input.  As can be expected the algorithm is very trivial.
+
+EXAM,bn_mp_abs.c
+
+\subsection{Integer Negation}
+With the mp\_int representation of an integer, calculating the negation is also trivial.  The mp\_neg algorithm will compute
+the negative of an mp\_int input.
+
+\newpage\begin{figure}[here]
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{mp\_neg}. \\
+\textbf{Input}.   An mp\_int $a$ \\
+\textbf{Output}.  Computes $b = -a$ \\
+\hline \\
+1.  Copy $a$ to $b$.  (\textit{hint: use mp\_copy}) \\
+2.  If the copy failed return(\textit{MP\_MEM}). \\
+3.  If $a.sign = MP\_ZPOS$ then do \\
+\hspace{3mm}3.1  $b.sign = MP\_NEG$. \\
+4.  else do \\
+\hspace{3mm}4.1  $b.sign = MP\_ZPOS$. \\
+5.  Return(\textit{MP\_OKAY}) \\
+\hline
+\end{tabular}
+\end{center}
+\caption{Algorithm mp\_neg}
+\end{figure}
+
+\textbf{Algorithm mp\_neg.}
+This algorithm computes the negation of an input.  
+
+EXAM,bn_mp_neg.c
+
+\section{Small Constants}
+\subsection{Setting Small Constants}
+Often a mp\_int must be set to a relatively small value such as $1$ or $2$.  For these cases the mp\_set algorithm is useful.
+
+\newpage\begin{figure}
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{mp\_set}. \\
+\textbf{Input}.   An mp\_int $a$ and a digit $b$ \\
+\textbf{Output}.  Make $a$ equivalent to $b$ \\
+\hline \\
+1.  Zero $a$ (\textit{hint: use mp\_zero}). \\
+2.  $a_0 \leftarrow b \mbox{ (mod }\beta\mbox{)}$ \\
+3.  $a.used \leftarrow  \left \lbrace \begin{array}{ll}
+                              1 &  \mbox{if }a_0 > 0 \\
+                              0 &  \mbox{if }a_0 = 0 
+                              \end{array} \right .$ \\
+\hline                              
+\end{tabular}
+\end{center}
+\caption{Algorithm mp\_set}
+\end{figure}
+
+\textbf{Algorithm mp\_set.}
+This algorithm sets a mp\_int to a small single digit value.  Step number 1 ensures that the integer is reset to the default state.  The
+single digit is set (\textit{modulo $\beta$}) and the \textbf{used} count is adjusted accordingly.
+
+EXAM,bn_mp_set.c
+
+Line @21,mp_zero@ calls mp\_zero() to clear the mp\_int and reset the sign.  Line @22,MP_MASK@ actually copies the digit 
+into the least significant location.  Note the usage of a new constant \textbf{MP\_MASK}.  This constant is used to quickly
+reduce an integer modulo $\beta$.  Since $\beta = 2^k$ it suffices to perform a binary AND with $MP\_MASK = 2^k - 1$ to perform
+the reduction.  Finally line @23,a->used@ will set the \textbf{used} member with respect to the digit actually set. This function 
+will always make the integer positive.
+
+One important limitation of this function is that it will only set one digit.  The size of a digit is not fixed, meaning source that uses 
+this function should take that into account.  The define \textbf{DIGIT\_BIT} in ``tommath.h'' 
+defines how many bits per digit are available.  Generally at least seven bits are guaranteed to be available per 
+digit.  This means that trivially small constants can be set using this function.
+
+\subsection{Setting Large Constants}
+To overcome the limitations of the mp\_set algorithm the mp\_set\_int algorithm is provided.  It accepts a ``long''
+data type as input and will always treat it as a 32-bit integer.
+
+\begin{figure}[here]
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{mp\_set\_int}. \\
+\textbf{Input}.   An mp\_int $a$ and a ``long'' integer $b$ \\
+\textbf{Output}.  Make $a$ equivalent to $b$ \\
+\hline \\
+1.  Zero $a$ (\textit{hint: use mp\_zero}) \\
+2.  for $n$ from 0 to 7 do \\
+\hspace{3mm}2.1  $a \leftarrow a \cdot 16$ (\textit{hint: use mp\_mul2d}) \\
+\hspace{3mm}2.2  $u \leftarrow \lfloor b / 2^{4(7 - n)} \rfloor \mbox{ (mod }16\mbox{)}$\\
+\hspace{3mm}2.3  $a_0 \leftarrow a_0 + u$ \\
+\hspace{3mm}2.4  $a.used \leftarrow a.used + \lfloor 32 / lg(\beta) \rfloor + 1$ \\
+3.  Clamp excess used digits (\textit{hint: use mp\_clamp}) \\
+\hline
+\end{tabular}
+\end{center}
+\caption{Algorithm mp\_set\_int}
+\end{figure}
+
+\textbf{Algorithm mp\_set\_int.}
+The algorithm performs eight iterations of a simple loop where in each iteration four bits from the source are added to the 
+mp\_int.  Step 2.1 will multiply the current result by sixteen making room for four more bits.  In step 2.2 the
+next four bits from the source are extracted.  The four bits are added to the mp\_int and the \textbf{used} digit count is 
+incremented.  The \textbf{used} digit counter is incremented since if any of the leading digits were zero the mp\_int would have
+zero digits used and the newly added four bits would be ignored.
+
+Excess zero digits are trimmed in steps 2.1 and 3 by using higher level algorithms mp\_mul2d and mp\_clamp.
+
+EXAM,bn_mp_set_int.c
+
+This function sets four bits of the number at a time to handle all practical \textbf{DIGIT\_BIT} sizes.  The weird
+addition on line @38,a->used@ ensures that the newly added in bits are added to the number of digits.  While it may not 
+seem obvious as to why the digit counter does not grow exceedingly large it is because of the shift on line @27,mp_mul_2d@ 
+as well as the  call to mp\_clamp() on line @40,mp_clamp@.  Both functions will clamp excess leading digits which keeps 
+the number of used digits low.
+
+\section{Comparisons}
+\subsection{Unsigned Comparisons}
+Comparing a multiple precision integer is performed with the exact same algorithm used to compare two decimal numbers.  For example,
+to compare $1,234$ to $1,264$ the digits are extracted by their positions.  That is we compare $1 \cdot 10^3 + 2 \cdot 10^2 + 3 \cdot 10^1 + 4 \cdot 10^0$
+to $1 \cdot 10^3 + 2 \cdot 10^2 + 6 \cdot 10^1 + 4 \cdot 10^0$ by comparing single digits at a time starting with the highest magnitude 
+positions.  If any leading digit of one integer is greater than a digit in the same position of another integer then obviously it must be greater.  
+
+The first comparison routine that will be developed is the unsigned magnitude compare which will perform a comparison based on the digits of two
+mp\_int variables alone.  It will ignore the sign of the two inputs.  Such a function is useful when an absolute comparison is required or if the 
+signs are known to agree in advance.
+
+To facilitate working with the results of the comparison functions three constants are required.  
+
+\begin{figure}[here]
+\begin{center}
+\begin{tabular}{|r|l|}
+\hline \textbf{Constant} & \textbf{Meaning} \\
+\hline \textbf{MP\_GT} & Greater Than \\
+\hline \textbf{MP\_EQ} & Equal To \\
+\hline \textbf{MP\_LT} & Less Than \\
+\hline
+\end{tabular}
+\end{center}
+\caption{Comparison Return Codes}
+\end{figure}
+
+\begin{figure}[here]
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{mp\_cmp\_mag}. \\
+\textbf{Input}.   Two mp\_ints $a$ and $b$.  \\
+\textbf{Output}.  Unsigned comparison results ($a$ to the left of $b$). \\
+\hline \\
+1.  If $a.used > b.used$ then return(\textit{MP\_GT}) \\
+2.  If $a.used < b.used$ then return(\textit{MP\_LT}) \\
+3.  for n from $a.used - 1$ to 0 do \\
+\hspace{+3mm}3.1  if $a_n > b_n$ then return(\textit{MP\_GT}) \\
+\hspace{+3mm}3.2  if $a_n < b_n$ then return(\textit{MP\_LT}) \\
+4.  Return(\textit{MP\_EQ}) \\
+\hline
+\end{tabular}
+\end{center}
+\caption{Algorithm mp\_cmp\_mag}
+\end{figure}
+
+\textbf{Algorithm mp\_cmp\_mag.}
+By saying ``$a$ to the left of $b$'' it is meant that the comparison is with respect to $a$, that is if $a$ is greater than $b$ it will return
+\textbf{MP\_GT} and similar with respect to when $a = b$ and $a < b$.  The first two steps compare the number of digits used in both $a$ and $b$.  
+Obviously if the digit counts differ there would be an imaginary zero digit in the smaller number where the leading digit of the larger number is.  
+If both have the same number of digits then the actual digits themselves must be compared starting at the leading digit.  
+
+By step three both inputs must have the same number of digits so it's safe to start from either $a.used - 1$ or $b.used - 1$ and count down to
+the zero'th digit.  If after all of the digits have been compared and no difference found the algorithm simply returns \textbf{MP\_EQ}.
+
+EXAM,bn_mp_cmp_mag.c
+
+The two if statements on lines @24,if@ and @28,if@ compare the number of digits in the two inputs.  These two are performed before all of the digits
+are compared since it is a very cheap test to perform and can potentially save considerable time.  The implementation given is also not valid 
+without those two statements.  $b.alloc$ may be smaller than $a.used$, meaning that undefined values will be read from $b$ past the end of the 
+array of digits.
+
+\subsection{Signed Comparisons}
+Comparing with sign considerations is also fairly critical in several routines (\textit{division for example}).  Based on an unsigned magnitude 
+comparison a trivial signed comparison algorithm can be written.
+
+\newpage\begin{figure}[here]
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{mp\_cmp}. \\
+\textbf{Input}.   Two mp\_ints $a$ and $b$ \\
+\textbf{Output}.  Signed Comparison Results ($a$ to the left of $b$) \\
+\hline \\
+1.  if $a.sign = MP\_NEG$ and $b.sign = MP\_ZPOS$ then return(\textit{MP\_LT}) \\
+2.  if $a.sign = MP\_ZPOS$ and $b.sign = MP\_NEG$ then return(\textit{MP\_GT}) \\
+3.  if $a.sign = MP\_NEG$ then \\
+\hspace{+3mm}3.1  Return the unsigned comparison of $b$ and $a$ (\textit{hint: use mp\_cmp\_mag}) \\
+4.  Otherwise \\
+\hspace{+3mm}4.1  Return the unsigned comparison of $a$ and $b$ \\
+\hline
+\end{tabular}
+\end{center}
+\caption{Algorithm mp\_cmp}
+\end{figure}
+
+\textbf{Algorithm mp\_cmp.}
+The first two steps compare the signs of the two inputs.  If the signs do not agree then it can return right away with the appropriate 
+comparison code.  When the signs are equal the digits of the inputs must be compared to determine the correct result.  In step 
+three the unsigned comparison flips the order of the arguments since they are both negative.  For instance, if $-a > -b$ then 
+$\vert a \vert < \vert b \vert$.  Step number four will compare the two when they are both positive.
+
+EXAM,bn_mp_cmp.c
+
+The two if statements on lines @22,if@ and @26,if@ perform the initial sign comparison.  If the signs are not equal then whichever
+has the positive sign is larger.   At line @30,if@, the inputs are compared based on magnitudes.  If the signs were both negative then 
+the unsigned comparison is performed in the opposite direction (\textit{line @31,mp_cmp_mag@}).  Otherwise, the signs are assumed to 
+be both positive and a forward direction unsigned comparison is performed.
+
+\section*{Exercises}
+\begin{tabular}{cl}
+$\left [ 2 \right ]$ & Modify algorithm mp\_set\_int to accept as input a variable length array of bits. \\
+                     & \\
+$\left [ 3 \right ]$ & Give the probability that algorithm mp\_cmp\_mag will have to compare $k$ digits  \\
+                     & of two random integers (of equal magnitude) before a difference is found. \\
+                     & \\
+$\left [ 1 \right ]$ & Suggest a simple method to speed up the implementation of mp\_cmp\_mag based  \\
+                     & on the observations made in the previous problem. \\
+                     &
+\end{tabular}
+
+\chapter{Basic Arithmetic}
+\section{Building Blocks}
+At this point algorithms for initialization, de-initialization, zeroing, copying, comparing and setting small constants have been 
+established.  The next logical set of algorithms to develop are the addition, subtraction and digit movement algorithms.  These 
+algorithms make use of the lower level algorithms and are the crucial building blocks for the multipliers.  It is very important that these 
+algorithms are highly optimized.  On their own they are simple $O(n)$ algorithms but they can be called from higher level algorithms 
+which easily places them at $O(n^2)$ or even $O(n^3)$ work levels.  
+
+MARK,SHIFTS
+All nine algorithms within this chapter make use of the logical bit shift operations denoted by $<<$ and $>>$ for left and right 
+logical shifts respectively.  A logical shift is analogous to sliding the decimal point of radix-10 representations.  For example, the real 
+number $0.9345$ is equivalent to $93.45\%$ which is found by sliding the decimal point two places to the right (\textit{multiplying by $10^2$}).  
+Mathematically a logical shift is equivalent to a division or multiplication by a power of two.  
+For example, $a << k = a \cdot 2^k$ while $a >> k = \lfloor a/2^k \rfloor$.
+
+One significant difference between a logical shift and the way decimals are shifted is that digits below the zero'th position are removed
+from the number.  For example, consider $1101_2 >> 1$ using decimal notation this would produce $110.1_2$.  However, with a logical shift the 
+result is $110_2$.  
+
+\section{Addition and Subtraction}
+In normal fixed precision arithmetic negative numbers are easily represented by subtraction from the modulus.  For example, with 32-bit integers
+$a - b\mbox{ (mod }2^{32}\mbox{)}$ is the same as $a + (2^{32} - b) \mbox{ (mod }2^{32}\mbox{)}$  since $2^{32} \equiv 0 \mbox{ (mod }2^{32}\mbox{)}$.  
+As a result subtraction can be performed with a trivial series of logical operations and an addition.
+
+However, in multiple precision arithmetic negative numbers are not represented in the same way.  Instead a sign flag is used to keep track of the
+sign of the integer.  As a result signed addition and subtraction are actually implemented as conditional usage of lower level addition or 
+subtraction algorithms with the sign fixed up appropriately.
+
+The lower level algorithms will add or subtract integers without regard to the sign flag.  That is they will add or subtract the magnitude of
+the integers respectively.
+
+\subsection{Low Level Addition}
+An unsigned addition of multiple precision integers is performed with the same long-hand algorithm used to add decimal numbers.  That is to add the 
+trailing digits first and propagate the resulting carry upwards.  Since this is a lower level algorithm the name will have a ``s\_'' prefix.  
+Historically that convention stems from the MPI library where ``s\_'' stood for static functions that were hidden from the developer entirely.
+
+\newpage
+\begin{figure}[!here]
+\begin{center}
+\begin{small}
+\begin{tabular}{l}
+\hline Algorithm \textbf{s\_mp\_add}. \\
+\textbf{Input}.   Two mp\_ints $a$ and $b$ \\
+\textbf{Output}.  The unsigned addition $c = \vert a \vert + \vert b \vert$. \\
+\hline \\
+1.  if $a.used > b.used$ then \\
+\hspace{+3mm}1.1  $min \leftarrow b.used$ \\
+\hspace{+3mm}1.2  $max \leftarrow a.used$ \\
+\hspace{+3mm}1.3  $x   \leftarrow a$ \\
+2.  else  \\
+\hspace{+3mm}2.1  $min \leftarrow a.used$ \\
+\hspace{+3mm}2.2  $max \leftarrow b.used$ \\
+\hspace{+3mm}2.3  $x   \leftarrow b$ \\
+3.  If $c.alloc < max + 1$ then grow $c$ to hold at least $max + 1$ digits (\textit{hint: use mp\_grow}) \\
+4.  If failed to grow $c$ return(\textit{MP\_MEM}) \\
+5.  $oldused \leftarrow c.used$ \\
+6.  $c.used \leftarrow max + 1$ \\
+7.  $u \leftarrow 0$ \\
+8.  for $n$ from $0$ to $min - 1$ do \\
+\hspace{+3mm}8.1  $c_n \leftarrow a_n + b_n + u$ \\
+\hspace{+3mm}8.2  $u \leftarrow c_n >> lg(\beta)$ \\
+\hspace{+3mm}8.3  $c_n \leftarrow c_n \mbox{ (mod }\beta\mbox{)}$ \\
+9.  if $min \ne max$ then do \\
+\hspace{+3mm}9.1  for $n$ from $min$ to $max - 1$ do \\
+\hspace{+6mm}9.1.1  $c_n \leftarrow x_n + u$ \\
+\hspace{+6mm}9.1.2  $u \leftarrow c_n >> lg(\beta)$ \\
+\hspace{+6mm}9.1.3  $c_n \leftarrow c_n \mbox{ (mod }\beta\mbox{)}$ \\
+10.  $c_{max} \leftarrow u$ \\
+11.  if $oldused > max$ then \\
+\hspace{+3mm}11.1  for $n$ from $max + 1$ to $oldused - 1$ do \\
+\hspace{+6mm}11.1.1  $c_n \leftarrow 0$ \\
+12.  Clamp excess digits in $c$.  (\textit{hint: use mp\_clamp}) \\
+13.  Return(\textit{MP\_OKAY}) \\
+\hline
+\end{tabular}
+\end{small}
+\end{center}
+\caption{Algorithm s\_mp\_add}
+\end{figure}
+
+\textbf{Algorithm s\_mp\_add.}
+This algorithm is loosely based on algorithm 14.7 of \cite[pp. 594]{HAC} but has been extended to allow the inputs to have different magnitudes.  
+Coincidentally the description of algorithm A in \cite[pp. 266]{TAOCPV2} shares the same flaw as that from \cite{HAC}.  Even the MIX pseudo 
+machine code presented  \cite[pp. 266-267]{TAOCPV2} is incapable of handling inputs which are of different magnitudes.
+
+Steps 1 and 2 will sort the two inputs based on their \textbf{used} digit count.  This allows the inputs to have varying magnitudes which not 
+only makes it more efficient than the trivial algorithm presented in the other references but more flexible.  The variable $min$ is given the lowest 
+digit count while $max$ is given the highest digit count.  If both inputs have the same \textbf{used} digit count both $min$ and $max$ are 
+set to the same.  The variable $x$ is an \textit{alias} for the largest input and not meant to be a copy of it.  After the inputs are sorted steps 
+3 and 4 will ensure that the destination $c$ can accommodate the result.  The old \textbf{used} count from $c$ is copied to $oldused$ and the 
+new count is set to $max + 1$.  
+
+At step 7 the carry variable $u$ is set to zero and the first leg of the addition loop can begin.  The first step of the loop (\textit{8.1}) adds
+digits from the two inputs together along with the carry variable $u$.  The following step extracts the carry bit by shifting the result of the
+preceding step right $lg(\beta)$ positions.  The shift to extract the carry is similar to how carry extraction works with decimal addition.
+
+Consider adding $77$ to $65$, the first addition of the first column is $7 + 5$ which produces the result $12$.  The trailing digit of the result
+is $2 \equiv 12 \mbox{ (mod }10\mbox{)}$ and the carry is found by dividing (\textit{and ignoring the remainder}) $12$ by the radix or in this case $10$.  The
+division and multiplication of $10$ is simply a logical shift right or left respectively of the digits.  In other words the carry can be extracted
+by shifting one digit to the right.
+
+Note that $lg()$ is simply the base two logarithm such that $lg(2^k) = k$.  This implies that $lg(\beta)$ is the number of bits in a radix-$\beta$ 
+digit.  Therefore, a logical shift right of the single digit by $lg(\beta)$ will extract the carry.  The final step of the  loop reduces the digit 
+modulo the radix $\beta$ to ensure it is in range.
+
+After step 8 the smallest input (\textit{or both if they are the same magnitude}) has been exhausted.  Step 9 decides whether
+the inputs were of equal magnitude.  If not then another loop similar to that in step 8 must be executed.  The loop at step
+number 9.1 differs from the previous loop since it only adds the mp\_int $x$ along with the carry.  
+
+Step 10 finishes the addition phase by copying the final carry to the highest location in the result $c_{max}$.  Step 11 ensures that 
+leading digits that were originally present in $c$ are cleared.  Finally excess leading digits are clamped and the algorithm returns success.
+
+EXAM,bn_s_mp_add.c
+
+Lines @27,if@ to @35,}@ perform the initial sorting of the inputs and determine the $min$ and $max$ variables.  Note that $x$ is a pointer to an 
+mp\_int assigned to the largest input, in effect it is a local alias.  Lines @37,init@ to @42,}@ ensure that the destination is grown to 
+accommodate the result of the addition. 
+
+Similar to the implementation of mp\_copy this function uses the braced code and local aliases coding style.  The three aliases on 
+lines @56,tmpa@, @59,tmpb@ and @62,tmpc@ are for the two inputs and destination respectively.  These aliases are used to ensure the
+compiler does not have to dereference $a$, $b$ or $c$ (respectively) to access the digits of the respective mp\_int.
+
+The initial carry $u$ is cleared on line @65,u = 0@, note that $u$ is of type mp\_digit which ensures type compatibility within the 
+implementation.  The initial addition loop begins on line @66,for@ and ends on line @75,}@.  Similarly the conditional addition loop
+begins on line @81,for@ and ends on line @90,}@.  The addition is finished with the final carry being stored in $tmpc$ on line @94,tmpc++@.  
+Note the ``++'' operator on the same line.  After line @94,tmpc++@ $tmpc$ will point to the $c.used$'th digit of the mp\_int $c$.  This is useful
+for the next loop on lines @97,for@ to @99,}@ which set any old upper digits to zero.
+
+\subsection{Low Level Subtraction}
+The low level unsigned subtraction algorithm is very similar to the low level unsigned addition algorithm.  The principal difference is that the
+unsigned subtraction algorithm requires the result to be positive.  That is when computing $a - b$ the condition $\vert a \vert \ge \vert b\vert$ must 
+be met for this algorithm to function properly.  Keep in mind this low level algorithm is not meant to be used in higher level algorithms directly.  
+This algorithm as will be shown can be used to create functional signed addition and subtraction algorithms.
+
+MARK,GAMMA
+
+For this algorithm a new variable is required to make the description simpler.  Recall from section 1.3.1 that a mp\_digit must be able to represent
+the range $0 \le x < 2\beta$.  It is allowable that a mp\_digit represent a larger range of values.  For this algorithm we will assume that
+the variable $\gamma$ represents the number of bits available in a mp\_digit (\textit{this implies $2^{\gamma} > \beta$}).
+
+\newpage\begin{figure}[!here]
+\begin{center}
+\begin{small}
+\begin{tabular}{l}
+\hline Algorithm \textbf{s\_mp\_sub}. \\
+\textbf{Input}.   Two mp\_ints $a$ and $b$ ($\vert a \vert \ge \vert b \vert$) \\
+\textbf{Output}.  The unsigned subtraction $c = \vert a \vert - \vert b \vert$. \\
+\hline \\
+1.  $min \leftarrow b.used$ \\
+2.  $max \leftarrow a.used$ \\
+3.  If $c.alloc < max$ then grow $c$ to hold at least $max$ digits.  (\textit{hint: use mp\_grow}) \\
+4.  If the reallocation failed return(\textit{MP\_MEM}). \\
+5.  $oldused \leftarrow c.used$ \\ 
+6.  $c.used \leftarrow max$ \\
+7.  $u \leftarrow 0$ \\
+8.  for $n$ from $0$ to $min - 1$ do \\
+\hspace{3mm}8.1  $c_n \leftarrow a_n - b_n - u$ \\
+\hspace{3mm}8.2  $u   \leftarrow c_n >> (\gamma - 1)$ \\
+\hspace{3mm}8.3  $c_n \leftarrow c_n \mbox{ (mod }\beta\mbox{)}$ \\
+9.  if $min < max$ then do \\
+\hspace{3mm}9.1  for $n$ from $min$ to $max - 1$ do \\
+\hspace{6mm}9.1.1  $c_n \leftarrow a_n - u$ \\
+\hspace{6mm}9.1.2  $u   \leftarrow c_n >> (\gamma - 1)$ \\
+\hspace{6mm}9.1.3  $c_n \leftarrow c_n \mbox{ (mod }\beta\mbox{)}$ \\
+10. if $oldused > max$ then do \\
+\hspace{3mm}10.1  for $n$ from $max$ to $oldused - 1$ do \\
+\hspace{6mm}10.1.1  $c_n \leftarrow 0$ \\
+11. Clamp excess digits of $c$.  (\textit{hint: use mp\_clamp}). \\
+12. Return(\textit{MP\_OKAY}). \\
+\hline
+\end{tabular}
+\end{small}
+\end{center}
+\caption{Algorithm s\_mp\_sub}
+\end{figure}
+
+\textbf{Algorithm s\_mp\_sub.}
+This algorithm performs the unsigned subtraction of two mp\_int variables under the restriction that the result must be positive.  That is when
+passing variables $a$ and $b$ the condition that $\vert a \vert \ge \vert b \vert$ must be met for the algorithm to function correctly.  This
+algorithm is loosely based on algorithm 14.9 \cite[pp. 595]{HAC} and is similar to algorithm S in \cite[pp. 267]{TAOCPV2} as well.  As was the case
+of the algorithm s\_mp\_add both other references lack discussion concerning various practical details such as when the inputs differ in magnitude.
+
+The initial sorting of the inputs is trivial in this algorithm since $a$ is guaranteed to have at least the same magnitude as $b$.  Steps 1 and 2 
+set the $min$ and $max$ variables.  Unlike the addition routine there is guaranteed to be no carry which means that the final result can be at 
+most $max$ digits in length as opposed to $max + 1$.  Similar to the addition algorithm the \textbf{used} count of $c$ is copied locally and 
+set to the maximal count for the operation.
+
+The subtraction loop that begins on step 8 is essentially the same as the addition loop of algorithm s\_mp\_add except single precision 
+subtraction is used instead.  Note the use of the $\gamma$ variable to extract the carry within the subtraction loops.  Under the assumption
+that two's complement single precision arithmetic is used this will successfully extract the carry.  
+
+For example, consider subtracting $0101_2$ from
+$0100_2$ where $\gamma = 4$.  The least significant bit will force a carry upwards to the third bit which will be set to zero after the borrow.  After
+the very first bit has been subtracted $4 - 1 \equiv 0011_2$ will remain.  When the third bit of $0101_2$ is subtracted from the result it will cause
+another carry.  In this case though the carry will be forced to propagate all the way to the most significant bit.  
+
+Recall that $\beta < 2^{\gamma}$.  This means that if a carry does occur it will propagate all the way to the most significant bit.  Therefore a single
+logical shift right by $\gamma - 1$ positions is sufficient to extract the carry.  This method of carry extraction may seem awkward but the reason for 
+it becomes apparent when the implementation is discussed.  
+
+If $b$ has a smaller magnitude than $a$ then step 9 will force the carry and copy operation to propagate through the larger input $a$ into $c$.  Step
+10 will ensure that any leading digits of $c$ above the $max$'th position are zeroed.
+
+EXAM,bn_s_mp_sub.c
+
+Lines @24,min@ and @25,max@ perform the initial hardcoded sorting.  In reality they are only aliases and are only used to make the source easier to 
+read.  Again the pointer alias optimization is used within this algorithm.  Lines @42,tmpa@, @43,tmpb@ and @44,tmpc@ initialize the aliases for 
+$a$, $b$ and $c$ respectively.
+
+The first subtraction loop occurs on lines @47,u = 0@ through @61,}@.  The theory behind the subtraction loop is exactly the same as that for
+the addition loop.  As remarked earlier there is an implementation reason for using the ``awkward'' method of extracting the carry 
+(\textit{see line @57, >>@}).  The traditional method for extracting the carry would be to shift by $lg(\beta)$ positions and logically AND 
+the least significant bit.  The AND operation is required because all of the bits above the $\lg(\beta)$'th bit will be set to one after a carry
+occurs from subtraction.  This carry extraction requires two relatively cheap operations to extract the carry.  The other method is to simply 
+shift the most significant bit to the least significant bit thus extracting the carry with a single cheap operation.  This optimization only works on
+two's complement machines which is a safe assumption to make.
+
+If $a$ has a higher magnitude than $b$ an additional loop (\textit{see lines @64,for@ through @73,}@}) is required to propagate the carry through
+$a$ and copy the result to $c$.  
+
+\subsection{High Level Addition}
+Now that both lower level addition and subtraction algorithms have been established an effective high level signed addition algorithm can be
+established.  This high level addition algorithm will be what other algorithms and developers will use to perform addition of mp\_int data 
+types.  
+
+Recall from section 5.2 that an mp\_int represents an integer with an unsigned mantissa (\textit{the array of digits}) and a \textbf{sign} 
+flag.  A high level addition is actually performed as a series of eight separate cases which can be optimized down to three unique cases.
+
+\newpage\begin{figure}[!here]
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{mp\_add}. \\
+\textbf{Input}.   Two mp\_ints $a$ and $b$  \\
+\textbf{Output}.  The signed addition $c = a + b$. \\
+\hline \\
+1.  if $a.sign = b.sign$ then do \\
+\hspace{3mm}1.1  $c.sign \leftarrow a.sign$  \\
+\hspace{3mm}1.2  $c \leftarrow \vert a \vert + \vert b \vert$ (\textit{hint: use s\_mp\_add})\\
+2.  else do \\
+\hspace{3mm}2.1  if $\vert a \vert < \vert b \vert$ then do (\textit{hint: use mp\_cmp\_mag})  \\
+\hspace{6mm}2.1.1  $c.sign \leftarrow b.sign$ \\
+\hspace{6mm}2.1.2  $c \leftarrow \vert b \vert - \vert a \vert$ (\textit{hint: use s\_mp\_sub}) \\
+\hspace{3mm}2.2  else do \\
+\hspace{6mm}2.2.1  $c.sign \leftarrow a.sign$ \\
+\hspace{6mm}2.2.2  $c \leftarrow \vert a \vert - \vert b \vert$ \\
+3.  If any of the lower level operations failed return(\textit{MP\_MEM}) \\
+4.  Return(\textit{MP\_OKAY}). \\
+\hline
+\end{tabular}
+\end{center}
+\caption{Algorithm mp\_add}
+\end{figure}
+
+\textbf{Algorithm mp\_add.}
+This algorithm performs the signed addition of two mp\_int variables.  There is no reference algorithm to draw upon from either \cite{TAOCPV2} or 
+\cite{HAC} since they both only provide unsigned operations.  The algorithm is fairly straightforward but restricted since subtraction can only 
+produce positive results.  Consider the following chart of possible inputs.
+
+\begin{figure}[!here]
+\begin{small}
+\begin{center}
+\begin{tabular}{|c|c|c|c|c|}
+\hline \textbf{Sign of $a$} & \textbf{Sign of $b$} & \textbf{$\vert a \vert > \vert b \vert $} & \textbf{Unsigned Operation} & \textbf{Result Sign Flag} \\
+\hline $+$ & $+$ & Yes & $c = a + b$ & $a.sign$ \\
+\hline $+$ & $+$ & No  & $c = a + b$ & $a.sign$ \\
+\hline $-$ & $-$ & Yes & $c = a + b$ & $a.sign$ \\
+\hline $-$ & $-$ & No  & $c = a + b$ & $a.sign$ \\
+\hline &&&&\\
+
+\hline $+$ & $-$ & No  & $c = b - a$ & $b.sign$ \\
+\hline $-$ & $+$ & No  & $c = b - a$ & $b.sign$ \\
+
+\hline &&&&\\
+
+\hline $+$ & $-$ & Yes & $c = a - b$ & $a.sign$ \\
+\hline $-$ & $+$ & Yes & $c = a - b$ & $a.sign$ \\
+
+\hline
+\end{tabular}
+\end{center}
+\end{small}
+\caption{Addition Guide Chart}
+\end{figure}
+
+The chart lists all of the eight possible input combinations and is sorted to show that only three specific cases need to be handled.  The 
+return code of the unsigned operations at step 1.2, 2.1.2 and 2.2.2 are forwarded to step 3 to check for errors.  This simplifies the description
+of the algorithm considerably and best follows how the implementation actually was achieved.
+
+Also note how the \textbf{sign} is set before the unsigned addition or subtraction is performed.  Recall from the descriptions of algorithms
+s\_mp\_add and s\_mp\_sub that the mp\_clamp function is used at the end to trim excess digits.  The mp\_clamp algorithm will set the \textbf{sign}
+to \textbf{MP\_ZPOS} when the \textbf{used} digit count reaches zero.  
+
+For example, consider performing $-a + a$ with algorithm mp\_add.  By the description of the algorithm the sign is set to \textbf{MP\_NEG} which would
+produce a result of $-0$.  However, since the sign is set first and then the unsigned subtraction is performed, the subsequent usage of algorithm mp\_clamp 
+within algorithm s\_mp\_sub will force $-0$ to become $0$.  
+
+EXAM,bn_mp_add.c
+
+The source code follows the algorithm fairly closely.  The most notable new source code addition is the usage of the $res$ integer variable which
+is used to pass result of the unsigned operations forward.  Unlike in the algorithm, the variable $res$ is merely returned as is without
+explicitly checking it and returning the constant \textbf{MP\_OKAY}.  The observation is this algorithm will succeed or fail only if the lower
+level functions do so.  Returning their return code is sufficient.
+
+\subsection{High Level Subtraction}
+The high level signed subtraction algorithm is essentially the same as the high level signed addition algorithm.  
+
+\begin{figure}[!here]
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{mp\_sub}. \\
+\textbf{Input}.   Two mp\_ints $a$ and $b$  \\
+\textbf{Output}.  The signed subtraction $c = a - b$. \\
+\hline \\
+1.  if $a.sign \ne b.sign$ then do \\
+\hspace{3mm}1.1  $c.sign \leftarrow a.sign$ \\
+\hspace{3mm}1.2  $c \leftarrow \vert a \vert + \vert b \vert$ (\textit{hint: use s\_mp\_add}) \\
+2.  else do \\
+\hspace{3mm}2.1  if $\vert a \vert \ge \vert b \vert$ then do (\textit{hint: use mp\_cmp\_mag}) \\
+\hspace{6mm}2.1.1  $c.sign \leftarrow a.sign$ \\
+\hspace{6mm}2.1.2  $c \leftarrow \vert a \vert  - \vert b \vert$ (\textit{hint: use s\_mp\_sub}) \\
+\hspace{3mm}2.2  else do \\
+\hspace{6mm}2.2.1  $c.sign \leftarrow  \left \lbrace \begin{array}{ll}
+                              MP\_ZPOS &  \mbox{if }a.sign = MP\_NEG \\
+                              MP\_NEG  &  \mbox{otherwise} \\
+                              \end{array} \right .$ \\
+\hspace{6mm}2.2.2  $c \leftarrow \vert b \vert  - \vert a \vert$ \\
+3.  If any of the lower level operations failed return(\textit{MP\_MEM}). \\
+4.  Return(\textit{MP\_OKAY}). \\
+\hline
+\end{tabular}
+\end{center}
+\caption{Algorithm mp\_sub}
+\end{figure}
+
+\textbf{Algorithm mp\_sub.}
+This algorithm performs the signed subtraction of two inputs.  Similar to algorithm mp\_add there is no reference in either \cite{TAOCPV2} or 
+\cite{HAC}.  Also this algorithm is restricted by algorithm s\_mp\_sub.  The following chart lists the eight possible inputs and
+the operations required.
+
+\newpage\begin{figure}[!here]
+\begin{small}
+\begin{center}
+\begin{tabular}{|c|c|c|c|c|}
+\hline \textbf{Sign of $a$} & \textbf{Sign of $b$} & \textbf{$\vert a \vert \ge \vert b \vert $} & \textbf{Unsigned Operation} & \textbf{Result Sign Flag} \\
+\hline $+$ & $-$ & Yes & $c = a + b$ & $a.sign$ \\
+\hline $+$ & $-$ & No  & $c = a + b$ & $a.sign$ \\
+\hline $-$ & $+$ & Yes & $c = a + b$ & $a.sign$ \\
+\hline $-$ & $+$ & No  & $c = a + b$ & $a.sign$ \\
+\hline &&&& \\
+\hline $+$ & $+$ & Yes & $c = a - b$ & $a.sign$ \\
+\hline $-$ & $-$ & Yes & $c = a - b$ & $a.sign$ \\
+\hline &&&& \\
+\hline $+$ & $+$ & No  & $c = b - a$ & $\mbox{opposite of }a.sign$ \\
+\hline $-$ & $-$ & No  & $c = b - a$ & $\mbox{opposite of }a.sign$ \\
+\hline
+\end{tabular}
+\end{center}
+\end{small}
+\caption{Subtraction Guide Chart}
+\end{figure}
+
+Similar to the case of algorithm mp\_add the \textbf{sign} is set first before the unsigned addition or subtraction.  That is to prevent the 
+algorithm from producing $-a - -a = -0$ as a result.  
+
+EXAM,bn_mp_sub.c
+
+Much like the implementation of algorithm mp\_add the variable $res$ is used to catch the return code of the unsigned addition or subtraction operations
+and forward it to the end of the function.  On line @38, != MP_LT@ the ``not equal to'' \textbf{MP\_LT} expression is used to emulate a 
+``greater than or equal to'' comparison.  
+
+\section{Bit and Digit Shifting}
+MARK,POLY
+It is quite common to think of a multiple precision integer as a polynomial in $x$, that is $y = f(\beta)$ where $f(x) = \sum_{i=0}^{n-1} a_i x^i$.  
+This notation arises within discussion of Montgomery and Diminished Radix Reduction as well as Karatsuba multiplication and squaring.  
+
+In order to facilitate operations on polynomials in $x$ as above a series of simple ``digit'' algorithms have to be established.  That is to shift
+the digits left or right as well as to shift individual bits of the digits left and right.  It is important to note that not all ``shift'' operations
+are on radix-$\beta$ digits.  
+
+\subsection{Multiplication by Two}
+
+In a binary system where the radix is a power of two multiplication by two not only arises often in other algorithms it is a fairly efficient 
+operation to perform.  A single precision logical shift left is sufficient to multiply a single digit by two.  
+
+\newpage\begin{figure}[!here]
+\begin{small}
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{mp\_mul\_2}. \\
+\textbf{Input}.   One mp\_int $a$ \\
+\textbf{Output}.  $b = 2a$. \\
+\hline \\
+1.  If $b.alloc < a.used + 1$ then grow $b$ to hold $a.used + 1$ digits.  (\textit{hint: use mp\_grow}) \\
+2.  If the reallocation failed return(\textit{MP\_MEM}). \\
+3.  $oldused \leftarrow b.used$ \\
+4.  $b.used \leftarrow a.used$ \\
+5.  $r \leftarrow 0$ \\
+6.  for $n$ from 0 to $a.used - 1$ do \\
+\hspace{3mm}6.1  $rr \leftarrow a_n >> (lg(\beta) - 1)$ \\
+\hspace{3mm}6.2  $b_n \leftarrow (a_n << 1) + r \mbox{ (mod }\beta\mbox{)}$ \\
+\hspace{3mm}6.3  $r \leftarrow rr$ \\
+7.  If $r \ne 0$ then do \\
+\hspace{3mm}7.1  $b_{a.used} \leftarrow 1$ \\
+\hspace{3mm}7.2  $b.used \leftarrow b.used + 1$ \\
+8.  If $b.used < oldused$ then do \\
+\hspace{3mm}8.1  for $n$ from $b.used$ to $oldused - 1$ do \\
+\hspace{6mm}8.1.1  $b_n \leftarrow 0$ \\
+9.  $b.sign \leftarrow a.sign$ \\
+10.  Return(\textit{MP\_OKAY}).\\
+\hline
+\end{tabular}
+\end{center}
+\end{small}
+\caption{Algorithm mp\_mul\_2}
+\end{figure}
+
+\textbf{Algorithm mp\_mul\_2.}
+This algorithm will quickly multiply a mp\_int by two provided $\beta$ is a power of two.  Neither \cite{TAOCPV2} nor \cite{HAC} describe such 
+an algorithm despite the fact it arises often in other algorithms.  The algorithm is setup much like the lower level algorithm s\_mp\_add since 
+it is for all intents and purposes equivalent to the operation $b = \vert a \vert + \vert a \vert$.  
+
+Steps 1 and 2 grow the destination as required to accommodate the maximum number of \textbf{used} digits in the result.  The initial \textbf{used} count
+is set to $a.used$ at step 4.  Only if there is a final carry will the \textbf{used} count require adjustment.
+
+Step 6 is an optimized implementation of the addition loop for this specific case.  That is since the two values being added together 
+are the same there is no need to perform two reads from the digits of $a$.  Step 6.1 performs a single precision shift on the current digit $a_n$ to
+obtain what will be the carry for the next iteration.  Step 6.2 calculates the $n$'th digit of the result as single precision shift of $a_n$ plus
+the previous carry.  Recall from ~SHIFTS~ that $a_n << 1$ is equivalent to $a_n \cdot 2$.  An iteration of the addition loop is finished with 
+forwarding the carry to the next iteration.
+
+Step 7 takes care of any final carry by setting the $a.used$'th digit of the result to one and augmenting the \textbf{used} count.  Step 8 clears
+any original leading digits of $b$.
+
+EXAM,bn_mp_mul_2.c
+
+This implementation is essentially an optimized implementation of s\_mp\_add for the case of doubling an input.  The only noteworthy difference
+is the use of the logical shift operator on line @52,<<@ to perform a single precision doubling.  
+
+\subsection{Division by Two}
+A division by two can just as easily be accomplished with a logical shift right as multiplication by two can be with a logical shift left.
+
+\newpage\begin{figure}[!here]
+\begin{small}
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{mp\_div\_2}. \\
+\textbf{Input}.   One mp\_int $a$ \\
+\textbf{Output}.  $b = a/2$. \\
+\hline \\
+1.  If $b.alloc < a.used$ then grow $b$ to hold $a.used$ digits.  (\textit{hint: use mp\_grow}) \\
+2.  If the reallocation failed return(\textit{MP\_MEM}). \\
+3.  $oldused \leftarrow b.used$ \\
+4.  $b.used \leftarrow a.used$ \\
+5.  $r \leftarrow 0$ \\
+6.  for $n$ from $b.used - 1$ to $0$ do \\
+\hspace{3mm}6.1  $rr \leftarrow a_n \mbox{ (mod }2\mbox{)}$\\
+\hspace{3mm}6.2  $b_n \leftarrow (a_n >> 1) + (r << (lg(\beta) - 1)) \mbox{ (mod }\beta\mbox{)}$ \\
+\hspace{3mm}6.3  $r \leftarrow rr$ \\
+7.  If $b.used < oldused$ then do \\
+\hspace{3mm}7.1  for $n$ from $b.used$ to $oldused - 1$ do \\
+\hspace{6mm}7.1.1  $b_n \leftarrow 0$ \\
+8.  $b.sign \leftarrow a.sign$ \\
+9.  Return(\textit{MP\_OKAY}).\\
+\hline
+\end{tabular}
+\end{center}
+\end{small}
+\caption{Algorithm mp\_div\_2}
+\end{figure}
+
+\textbf{Algorithm mp\_div\_2.}
+This algorithm will divide an mp\_int by two using logical shifts to the right.  Like mp\_mul\_2 it uses a modified low level addition
+core as the basis of the algorithm.  Unlike mp\_mul\_2 the shift operations work from the leading digit to the trailing digit.  The algorithm
+could be written to work from the trailing digit to the leading digit however, it would have to stop one short of $a.used - 1$ digits to prevent
+reading past the end of the array of digits.
+
+Essentially the loop at step 6 is similar to that of mp\_mul\_2 except the logical shifts go in the opposite direction and the carry is at the 
+least significant bit not the most significant bit.  
+
+EXAM,bn_mp_div_2.c
+
+\section{Polynomial Basis Operations}
+Recall from ~POLY~ that any integer can be represented as a polynomial in $x$ as $y = f(\beta)$.  Such a representation is also known as
+the polynomial basis \cite[pp. 48]{ROSE}. Given such a notation a multiplication or division by $x$ amounts to shifting whole digits a single 
+place.  The need for such operations arises in several other higher level algorithms such as Barrett and Montgomery reduction, integer
+division and Karatsuba multiplication.  
+
+Converting from an array of digits to polynomial basis is very simple.  Consider the integer $y \equiv (a_2, a_1, a_0)_{\beta}$ and recall that
+$y = \sum_{i=0}^{2} a_i \beta^i$.  Simply replace $\beta$ with $x$ and the expression is in polynomial basis.  For example, $f(x) = 8x + 9$ is the
+polynomial basis representation for $89$ using radix ten.  That is, $f(10) = 8(10) + 9 = 89$.  
+
+\subsection{Multiplication by $x$}
+
+Given a polynomial in $x$ such as $f(x) = a_n x^n + a_{n-1} x^{n-1} + ... + a_0$ multiplying by $x$ amounts to shifting the coefficients up one 
+degree.  In this case $f(x) \cdot x = a_n x^{n+1} + a_{n-1} x^n + ... + a_0 x$.  From a scalar basis point of view multiplying by $x$ is equivalent to
+multiplying by the integer $\beta$.  
+
+\newpage\begin{figure}[!here]
+\begin{small}
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{mp\_lshd}. \\
+\textbf{Input}.   One mp\_int $a$ and an integer $b$ \\
+\textbf{Output}.  $a \leftarrow a \cdot \beta^b$ (Multiply by $x^b$). \\
+\hline \\
+1.  If $b \le 0$ then return(\textit{MP\_OKAY}). \\
+2.  If $a.alloc < a.used + b$ then grow $a$ to at least $a.used + b$ digits.  (\textit{hint: use mp\_grow}). \\
+3.  If the reallocation failed return(\textit{MP\_MEM}). \\
+4.  $a.used \leftarrow a.used + b$ \\
+5.  $i \leftarrow a.used - 1$ \\
+6.  $j \leftarrow a.used - 1 - b$ \\
+7.  for $n$ from $a.used - 1$ to $b$ do \\
+\hspace{3mm}7.1  $a_{i} \leftarrow a_{j}$ \\
+\hspace{3mm}7.2  $i \leftarrow i - 1$ \\
+\hspace{3mm}7.3  $j \leftarrow j - 1$ \\
+8.  for $n$ from 0 to $b - 1$ do \\
+\hspace{3mm}8.1  $a_n \leftarrow 0$ \\
+9.  Return(\textit{MP\_OKAY}). \\
+\hline
+\end{tabular}
+\end{center}
+\end{small}
+\caption{Algorithm mp\_lshd}
+\end{figure}
+
+\textbf{Algorithm mp\_lshd.}
+This algorithm multiplies an mp\_int by the $b$'th power of $x$.  This is equivalent to multiplying by $\beta^b$.  The algorithm differs 
+from the other algorithms presented so far as it performs the operation in place instead of storing the result in a separate location.  The algorithm
+will return success immediately if $b \le 0$ since the rest of the algorithm is only valid when $b > 0$.  
+
+First the destination $a$ is grown as required to accommodate the result.  The counters $i$ and $j$ are used to form a \textit{sliding window} over
+the digits of $a$ of length $b$.  The head of the sliding window is at $i$ (\textit{the leading digit}) and the tail at $j$ (\textit{the trailing digit}).  
+The loop on step 7 copies the digit from the tail to the head.  In each iteration the window is moved down one digit.   The last loop on 
+step 8 sets the lower $b$ digits to zero.
+
+\newpage
+FIGU,sliding_window,Sliding Window Movement
+
+EXAM,bn_mp_lshd.c
+
+The if statement on line @24,if@ ensures that the $b$ variable is greater than zero.  The \textbf{used} count is incremented by $b$ before
+the copy loop begins.  This eliminates the need for an additional variable in the for loop.  The variable $tmpa$ on line @42,tmpa@ is an alias
+for the leading digit while $tmpaa$ on line @45,tmpaa@ is an alias for the trailing edge.  The aliases form a window of exactly $b$ digits
+over the input.  
+
+\subsection{Division by $x$}
+
+Division by powers of $x$ is easily achieved by shifting the digits right and removing any that will end up to the right of the zero'th digit.  
+
+\newpage\begin{figure}[!here]
+\begin{small}
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{mp\_rshd}. \\
+\textbf{Input}.   One mp\_int $a$ and an integer $b$ \\
+\textbf{Output}.  $a \leftarrow a / \beta^b$ (Divide by $x^b$). \\
+\hline \\
+1.  If $b \le 0$ then return. \\
+2.  If $a.used \le b$ then do \\
+\hspace{3mm}2.1  Zero $a$.  (\textit{hint: use mp\_zero}). \\
+\hspace{3mm}2.2  Return. \\
+3.  $i \leftarrow 0$ \\
+4.  $j \leftarrow b$ \\
+5.  for $n$ from 0 to $a.used - b - 1$ do \\
+\hspace{3mm}5.1  $a_i \leftarrow a_j$ \\
+\hspace{3mm}5.2  $i \leftarrow i + 1$ \\
+\hspace{3mm}5.3  $j \leftarrow j + 1$ \\
+6.  for $n$ from $a.used - b$ to $a.used - 1$ do \\
+\hspace{3mm}6.1  $a_n \leftarrow 0$ \\
+7.  Clamp excess digits.  (\textit{hint: use mp\_clamp}). \\
+8.  Return. \\
+\hline
+\end{tabular}
+\end{center}
+\end{small}
+\caption{Algorithm mp\_rshd}
+\end{figure}
+
+\textbf{Algorithm mp\_rshd.}
+This algorithm divides the input in place by the $b$'th power of $x$.  It is analogous to dividing by $\beta^b$ but much quicker since
+it does not require single precision division.  This algorithm does not actually return an error code as it cannot fail.  
+
+If the input $b$ is less than one the algorithm quickly returns without performing any work.  If the \textbf{used} count is less than or equal
+to the shift count $b$ then it will simply zero the input and return.
+
+After the trivial cases of inputs have been handled the sliding window is setup.  Much like the case of algorithm mp\_lshd a sliding window that
+is $b$ digits wide is used to copy the digits.  Unlike mp\_lshd the window slides in the opposite direction from the trailing to the leading digit.  
+Also the digits are copied from the leading to the trailing edge.
+
+Once the window copy is complete the upper digits must be zeroed.  Finally algorithm mp\_clamp is used to trim excess digits.
+
+EXAM,bn_mp_rshd.c
+
+The only noteworthy element of this routine is the lack of a return type.  This function cannot fail and as such it is more optimal to not
+return anything.
+
+\section{Powers of Two}
+
+Now that algorithms for moving single bits as well as whole digits exist algorithms for moving the ``in between'' distances are required.  For 
+example, to quickly multiply by $2^k$ for any $k$ without using a full multiplier algorithm would prove useful.  Instead of performing single
+shifts $k$ times to achieve a multiplication by $2^{\pm k}$ a mixture of whole digit shifting and partial digit shifting is employed.  
+
+\subsection{Multiplication by Power of Two}
+
+\newpage\begin{figure}[!here]
+\begin{small}
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{mp\_mul\_2d}. \\
+\textbf{Input}.   One mp\_int $a$ and an integer $b$ \\
+\textbf{Output}.  $c \leftarrow a \cdot 2^b$. \\
+\hline \\
+1.  $c \leftarrow a$.  (\textit{hint: use mp\_copy}) \\
+2.  If $c.alloc < c.used + \lfloor b / lg(\beta) \rfloor + 2$ then grow $c$ accordingly. \\
+3.  If the reallocation failed return(\textit{MP\_MEM}). \\
+4.  If $b \ge lg(\beta)$ then \\
+\hspace{3mm}4.1  $c \leftarrow c \cdot \beta^{\lfloor b / lg(\beta) \rfloor}$ (\textit{hint: use mp\_lshd}). \\
+\hspace{3mm}4.2  If step 4.1 failed return(\textit{MP\_MEM}). \\
+5.  $d \leftarrow b \mbox{ (mod }lg(\beta)\mbox{)}$ \\
+6.  If $d \ne 0$ then do \\
+\hspace{3mm}6.1  $mask \leftarrow 2^d$ \\
+\hspace{3mm}6.2  $r \leftarrow 0$ \\
+\hspace{3mm}6.3  for $n$ from $0$ to $c.used - 1$ do \\
+\hspace{6mm}6.3.1  $rr \leftarrow c_n >> (lg(\beta) - d) \mbox{ (mod }mask\mbox{)}$ \\
+\hspace{6mm}6.3.2  $c_n \leftarrow (c_n << d) + r \mbox{ (mod }\beta\mbox{)}$ \\
+\hspace{6mm}6.3.3  $r \leftarrow rr$ \\
+\hspace{3mm}6.4  If $r > 0$ then do \\
+\hspace{6mm}6.4.1  $c_{c.used} \leftarrow r$ \\
+\hspace{6mm}6.4.2  $c.used \leftarrow c.used + 1$ \\
+7.  Return(\textit{MP\_OKAY}). \\
+\hline
+\end{tabular}
+\end{center}
+\end{small}
+\caption{Algorithm mp\_mul\_2d}
+\end{figure}
+
+\textbf{Algorithm mp\_mul\_2d.}
+This algorithm multiplies $a$ by $2^b$ and stores the result in $c$.  The algorithm uses algorithm mp\_lshd and a derivative of algorithm mp\_mul\_2 to
+quickly compute the product.
+
+First the algorithm will multiply $a$ by $x^{\lfloor b / lg(\beta) \rfloor}$ which will ensure that the remainder multiplicand is less than 
+$\beta$.  For example, if $b = 37$ and $\beta = 2^{28}$ then this step will multiply by $x$ leaving a multiplication by $2^{37 - 28} = 2^{9}$ 
+left.
+
+The logarithm of the residue is calculated on step 5.  If it is non-zero a modified shift loop is used to calculate the remaining product.  
+Essentially the loop is a generic version of algorithm mp\_mul\_2 designed to handle any shift count in the range $1 \le d < lg(\beta)$.  The $mask$
+variable is used to extract the upper $d$ bits to form the carry for the next iteration.  
+
+This algorithm is loosely measured as a $O(2n)$ algorithm which means that if the input is $n$-digits that it takes $2n$ ``time'' to 
+complete.  It is possible to optimize this algorithm down to a $O(n)$ algorithm at a cost of making the algorithm slightly harder to follow.
+
+EXAM,bn_mp_mul_2d.c
+
+Notes to be revised when code is updated. -- Tom
+
+\subsection{Division by Power of Two}
+
+\newpage\begin{figure}[!here]
+\begin{small}
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{mp\_div\_2d}. \\
+\textbf{Input}.   One mp\_int $a$ and an integer $b$ \\
+\textbf{Output}.  $c \leftarrow \lfloor a / 2^b \rfloor, d \leftarrow a \mbox{ (mod }2^b\mbox{)}$. \\
+\hline \\
+1.  If $b \le 0$ then do \\
+\hspace{3mm}1.1  $c \leftarrow a$ (\textit{hint: use mp\_copy}) \\
+\hspace{3mm}1.2  $d \leftarrow 0$ (\textit{hint: use mp\_zero}) \\
+\hspace{3mm}1.3  Return(\textit{MP\_OKAY}). \\
+2.  $c \leftarrow a$ \\
+3.  $d \leftarrow a \mbox{ (mod }2^b\mbox{)}$ (\textit{hint: use mp\_mod\_2d}) \\
+4.  If $b \ge lg(\beta)$ then do \\
+\hspace{3mm}4.1  $c \leftarrow \lfloor c/\beta^{\lfloor b/lg(\beta) \rfloor} \rfloor$ (\textit{hint: use mp\_rshd}). \\
+5.  $k \leftarrow b \mbox{ (mod }lg(\beta)\mbox{)}$ \\
+6.  If $k \ne 0$ then do \\
+\hspace{3mm}6.1  $mask \leftarrow 2^k$ \\
+\hspace{3mm}6.2  $r \leftarrow 0$ \\
+\hspace{3mm}6.3  for $n$ from $c.used - 1$ to $0$ do \\
+\hspace{6mm}6.3.1  $rr \leftarrow c_n \mbox{ (mod }mask\mbox{)}$ \\
+\hspace{6mm}6.3.2  $c_n \leftarrow (c_n >> k) + (r << (lg(\beta) - k))$ \\
+\hspace{6mm}6.3.3  $r \leftarrow rr$ \\
+7.  Clamp excess digits of $c$.  (\textit{hint: use mp\_clamp}) \\
+8.  Return(\textit{MP\_OKAY}). \\
+\hline
+\end{tabular}
+\end{center}
+\end{small}
+\caption{Algorithm mp\_div\_2d}
+\end{figure}
+
+\textbf{Algorithm mp\_div\_2d.}
+This algorithm will divide an input $a$ by $2^b$ and produce the quotient and remainder.  The algorithm is designed much like algorithm 
+mp\_mul\_2d by first using whole digit shifts then single precision shifts.  This algorithm will also produce the remainder of the division
+by using algorithm mp\_mod\_2d.
+
+EXAM,bn_mp_div_2d.c
+
+The implementation of algorithm mp\_div\_2d is slightly different than the algorithm specifies.  The remainder $d$ may be optionally 
+ignored by passing \textbf{NULL} as the pointer to the mp\_int variable.    The temporary mp\_int variable $t$ is used to hold the 
+result of the remainder operation until the end.  This allows $d = a$ to be true without overwriting the input before it is no longer required.  
+
+The remainder of the source code is essentially the same as the source code for mp\_mul\_2d.  (-- Fix this paragraph up later, Tom).
+
+\subsection{Remainder of Division by Power of Two}
+
+The last algorithm in the series of polynomial basis power of two algorithms is calculating the remainder of division by $2^b$.  This
+algorithm benefits from the fact that in two's complement arithmetic $a \mbox{ (mod }2^b\mbox{)}$ is the same as $a$ AND $2^b - 1$.  
+
+\begin{figure}[!here]
+\begin{small}
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{mp\_mod\_2d}. \\
+\textbf{Input}.   One mp\_int $a$ and an integer $b$ \\
+\textbf{Output}.  $c \leftarrow a \mbox{ (mod }2^b\mbox{)}$. \\
+\hline \\
+1.  If $b \le 0$ then do \\
+\hspace{3mm}1.1  $c \leftarrow 0$ (\textit{hint: use mp\_zero}) \\
+\hspace{3mm}1.2  Return(\textit{MP\_OKAY}). \\
+2.  If $b > a.used \cdot lg(\beta)$ then do \\
+\hspace{3mm}2.1  $c \leftarrow a$ (\textit{hint: use mp\_copy}) \\
+\hspace{3mm}2.2  Return the result of step 2.1. \\
+3.  $c \leftarrow a$ \\
+4.  If step 3 failed return(\textit{MP\_MEM}). \\
+5.  for $n$ from $\lceil b / lg(\beta) \rceil$ to $c.used - 1$ do \\
+\hspace{3mm}5.1  $c_n \leftarrow 0$ \\
+6.  $k \leftarrow b \mbox{ (mod }lg(\beta)\mbox{)}$ \\
+7.  $c_{\lfloor b / lg(\beta) \rfloor} \leftarrow c_{\lfloor b / lg(\beta) \rfloor} \mbox{ (mod }2^{k}\mbox{)}$. \\
+8.  Return(\textit{MP\_OKAY}). \\
+\hline
+\end{tabular}
+\end{center}
+\end{small}
+\caption{Algorithm mp\_mod\_2d}
+\end{figure}
+
+\textbf{Algorithm mp\_mod\_2d.}
+This algorithm will quickly calculate the value of $a \mbox{ (mod }2^b\mbox{)}$.  First if $b$ is less than or equal to zero the 
+result is set to zero.  If $b$ is greater than the number of bits in $a$ then it simply copies $a$ to $c$ and returns.  Otherwise, $a$ 
+is copied to $c$, leading digits are removed and the remaining leading digit is trimmed to the exact bit count.
+
+EXAM,bn_mp_mod_2d.c
+
+-- Add comments later, Tom.
+
+\section*{Exercises}
+\begin{tabular}{cl}
+$\left [ 3 \right ] $ & Devise an algorithm that performs $a \cdot 2^b$ for generic values of $b$ \\
+                      & in $O(n)$ time. \\
+                      &\\
+$\left [ 3 \right ] $ & Devise an efficient algorithm to multiply by small low hamming  \\
+                      & weight values such as $3$, $5$ and $9$.  Extend it to handle all values \\
+                      & up to $64$ with a hamming weight less than three. \\
+                      &\\
+$\left [ 2 \right ] $ & Modify the preceding algorithm to handle values of the form \\
+                      & $2^k - 1$ as well. \\
+                      &\\
+$\left [ 3 \right ] $ & Using only algorithms mp\_mul\_2, mp\_div\_2 and mp\_add create an \\
+                      & algorithm to multiply two integers in roughly $O(2n^2)$ time for \\
+                      & any $n$-bit input.  Note that the time of addition is ignored in the \\
+                      & calculation.  \\
+                      & \\
+$\left [ 5 \right ] $ & Improve the previous algorithm to have a working time of at most \\
+                      & $O \left (2^{(k-1)}n + \left ({2n^2 \over k} \right ) \right )$ for an appropriate choice of $k$.  Again ignore \\
+                      & the cost of addition. \\
+                      & \\
+$\left [ 1 \right ] $ & There exists an improvement on the previous algorithm to \\
+                      & slightly reduce the number of additions required.  Modify the \\
+                      & previous algorithm to include this improvement. \\
+                      & \\
+$\left [ 2 \right ] $ & Devise a chart to find optimal values of $k$ for the previous problem \\
+                      & for $n = 64 \ldots 1024$ in steps of $64$. \\
+                      & \\
+$\left [ 2 \right ] $ & Using only algorithms mp\_abs and mp\_sub devise another method for \\
+                      & calculating the result of a signed comparison. \\
+                      &
+\end{tabular}
+
+\chapter{Multiplication and Squaring}
+\section{The Multipliers}
+For most number theoretic systems including public key cryptographic algorithms the set of algorithms collectively known as the
+``multipliers'' form the most important subset of algorithms of any multiple precision integer package.  The set of multipliers 
+include multiplication, squaring and modular reduction algorithms.  
+
+The importance of these algorithms is driven by the fact that most popular public key algorithms are based on modular 
+exponentiation.  That is performing $d \equiv a^b \mbox{ (mod }c\mbox{)}$ for some arbitrary choice of $a$, $b$, $c$ and $d$.  Roughly
+speaking a modular exponentiation will spend about 40\% of the time in modular reductions, 35\% of the time in squaring and 25\% of
+the time in multiplications.  Only a small trivial amount of time is spent on lower level algorithms such as mp\_clamp, mp\_init, etc...
+
+This chapter will discuss only two of the multiplier algorithms, multiplication and squaring.  As will be discussed shortly very efficient
+multiplier algorithms are not always straightforward and deserve a lot of attention.
+
+\section{Multiplication}
+\subsection{The Baseline Multiplication}
+\index{baseline multiplication}
+Computing the product of two integers in software can be achieved using a trivial adaptation of the standard $O(n^2)$ long-hand multiplication
+algorithm school children are taught.  The ``baseline multiplication'' algorithm is designed to act as the ``catch-all'' algorithm only called
+when the faster algorithms cannot be used.  This algorithm does not use any particularly interesting optimizations.
+
+The first algorithm to review is the unsigned multiplication algorithm from which a signed multiplication algorithm can be established.  One important 
+facet of this algorithm to note is that it has been modified to only produce a certain amount of output digits as resolution.  Recall that for
+a $n$ and $m$ digit input the product will be at most $n + m + 1$ digits.  Therefore, this algorithm can be reduced to a full multiplier by
+telling it to produce $n + m + 1$ digits.  
+
+Recall from ~GAMMA~ the definition of $\gamma$ as the number of bits in the type \textbf{mp\_digit}.  We shall now extend this variable set to 
+include $\alpha$ which shall represent the number of bits in the type \textbf{mp\_word}.  This implies that $2^{\alpha} > 2 \cdot \beta^2$.  The 
+constant $\delta = 2^{\alpha - 2lg(\beta)}$ will represent the maximal weight of any column in a product (\textit{see ~COMBA~ for more information}).
+
+\newpage\begin{figure}[!here]
+\begin{small}
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{s\_mp\_mul\_digs}. \\
+\textbf{Input}.   mp\_int $a$, mp\_int $b$ and an integer $digs$ \\
+\textbf{Output}.  $c \leftarrow \vert a \vert \cdot \vert b \vert \mbox{ (mod }\beta^{digs}\mbox{)}$. \\
+\hline \\
+1.  If min$(a.used, b.used) < \delta$ then do \\
+\hspace{3mm}1.1  Calculate $c = \vert a \vert \cdot \vert b \vert$ by the Comba method.  \\
+\hspace{3mm}1.2  Return the result of step 1.1 \\
+\\
+Allocate and initialize a temporary mp\_int. \\
+2.  Init $t$ to be of size $digs$ \\
+3.  If step 2 failed return(\textit{MP\_MEM}). \\
+4.  $t.used \leftarrow digs$ \\
+\\
+Compute the product. \\
+5.  for $ix$ from $0$ to $a.used - 1$ do \\
+\hspace{3mm}5.1  $u \leftarrow 0$ \\
+\hspace{3mm}5.2  $pb \leftarrow \mbox{min}(b.used, digs - ix)$ \\
+\hspace{3mm}5.3  If $pb < 1$ then goto step 6. \\
+\hspace{3mm}5.4  for $iy$ from $0$ to $pb - 1$ do \\
+\hspace{6mm}5.4.1  $\hat r \leftarrow t_{iy + ix} + a_{ix} \cdot b_{iy} + u$ \\
+\hspace{6mm}5.4.2  $t_{iy + ix} \leftarrow \hat r \mbox{ (mod }\beta\mbox{)}$ \\
+\hspace{6mm}5.4.3  $u \leftarrow \lfloor \hat r / \beta \rfloor$ \\
+\hspace{3mm}5.5  if $ix + iy < digs$ then do \\
+\hspace{6mm}5.5.1  $t_{ix + pb} \leftarrow u$ \\
+6.  Clamp excess digits of $t$. \\
+7.  Swap $c$ with $t$ \\
+8.  Clear $t$ \\
+9.  Return(\textit{MP\_OKAY}). \\
+\hline
+\end{tabular}
+\end{center}
+\end{small}
+\caption{Algorithm s\_mp\_mul\_digs}
+\end{figure}
+
+\textbf{Algorithm s\_mp\_mul\_digs.}
+This algorithm computes the unsigned product of two inputs $a$ and $b$ limited to an output precision of $digs$ digits.  While it may seem
+a bit awkward to modify the function from its simple $O(n^2)$ description the usefulness of partial multipliers will arise in a future 
+algorithm.  The algorithm is loosely based on algorithm 14.12 from \cite[pp. 595]{HAC} and is similar to Algorithm M \cite[pp. 268]{TAOCPV2}.  The
+algorithm differs from those cited references because it can produce a variable output precision regardless of the precision of the inputs.
+
+The first thing this algorithm checks for is whether a Comba multiplier can be used instead.   That is if the minimal digit count of either
+input is less than $\delta$ the Comba method is used.    After the Comba method is ruled out the baseline algorithm begins.  A 
+temporary mp\_int variable $t$ is used to hold the intermediate result of the product.  This allows the algorithm to be used to 
+compute products when either $a = c$ or $b = c$ without overwriting the inputs.  
+
+All of step 5 is the infamous $O(n^2)$ multiplication loop slightly modified to only produce up to $digs$ digits of output.  The $pb$ variable
+is given the count of digits to read from $b$ inside the nested loop.  If $pb < 1$ then no more output digits can be produced and the algorithm
+will exit the loop.  The best way to think of the loops is as a series of $pb \times 1$ multiplications.  That is, in each pass of the 
+innermost loop $a_{ix}$ is multiplied against $b$ and the result is added (\textit{with an appropriate shift}) to $t$.  
+
+For example, consider multiplying $576$ by $241$.  That is equivalent to computing $10^0(1)(576) + 10^1(4)(576) + 10^2(2)(576)$ which is best
+visualized as the following table.
+
+\begin{figure}[here]
+\begin{center}
+\begin{tabular}{|c|c|c|c|c|c|c|}
+\hline   &&          & 5 & 7 & 6 & \\
+\hline   $\times$&&  & 2 & 4 & 1 & \\
+\hline &&&&&&\\
+  &&          & 5 & 7 & 6 & $10^0(1)(576)$ \\
+  &2 &   3    & 0 & 4 & 0 & $10^1(4)(576)$ \\
+  1 & 1 & 5 & 2 & 0 & 0 &  $10^2(2)(576)$ \\
+\hline  
+\end{tabular}
+\end{center}
+\caption{Long-Hand Multiplication Diagram}
+\end{figure}
+
+Each row of the product is added to the result after being shifted to the left (\textit{multiplied by a power of the radix}) by the appropriate 
+count.  That is in pass $ix$ of the inner loop the product is added starting at the $ix$'th digit of the result.
+
+Step 5.4.1 introduces the hat symbol (\textit{e.g. $\hat x$}) which represents a double precision variable.  The multiplication on that step
+is assumed to be a double wide output single precision multiplication.  That is, two single precision variables are multiplied to produce a
+double precision result.  The step is somewhat optimized from a long-hand multiplication algorithm because the carry from the addition in step
+5.4.1 is forwarded through the nested loop.  If the carry was ignored it would overflow the single precision digit $t_{ix+iy}$ and the result
+would be lost.  
+
+At step 5.5 the nested loop is finished and any carry that was left over should be forwarded.  That is provided $ix + iy < digs$ otherwise the
+carry is ignored since it will not be part of the result anyways.  
+
+EXAM,bn_s_mp_mul_digs.c
+
+Lines @31,if@ to @35,}@ determine if the Comba method can be used first.  The conditions for using the Comba routine are that min$(a.used, b.used) < \delta$ and
+the number of digits of output is less than \textbf{MP\_WARRAY}.  This new constant is used to control the stack usage in the Comba routines.  By
+default it is set to $\delta$ but can be reduced when memory is at a premium.
+
+Of particular importance is the calculation of the $ix+iy$'th column on lines @64,mp_word@, @65,mp_word@ and @66,mp_word@.  Note how all of the
+variables are cast to the type \textbf{mp\_word}.  That is to ensure that double precision operations are used instead of single precision.  The
+multiplication on line @65,) * (@ is a bit of a GCC optimization.  On the outset it looks like the compiler will have to use a double precision
+multiplication to produce the result required.  Such an operation would be horribly slow on most processors and drag this to a crawl.  However,
+GCC is smart enough to realize that double wide output single precision multipliers can be used.  For example, the instruction ``MUL'' on the
+x86 processor can multiply two 32-bit values and produce a 64-bit result.  
+
+\subsection{Faster Multiplication by the ``Comba'' Method}
+MARK,COMBA
+
+One of the huge drawbacks of the ``baseline'' algorithms is that at the $O(n^2)$ level the carry must be computed and propagated upwards.  This
+makes the nested loop very sequential and hard to unroll and implement in parallel.  The ``Comba'' method is named after the little known 
+(\textit{in cryptographic venues}) Paul G. Comba, who in \cite{COMBA} presented a method of implementing fast multipliers that do not require nested 
+carry fixup operations.
+
+At the heart of the algorithm is once again the long-hand algorithm for multiplication.  Except in this case a slight twist is placed on how
+the columns of the result are produced.  In the standard long-hand algorithm rows of products are produced then added together to form the 
+final result.  In the baseline algorithm the columns are added together to get the result instantaneously.  
+
+In the Comba algorithm however, the columns of the result are produced entirely independently of each other.  That is at the $O(n^2)$ level a 
+simple multiplication and addition step is performed.  Or more succinctly that 
+
+\begin{equation}
+x_n = \sum_{i+j = n} a_ib_j
+\end{equation}
+
+Where $x_n$ is the $n'th$ column of the output vector.  To see how this works consider once again multiplying $576$ by $241$.  
+
+\begin{figure}[here]
+\begin{small}
+\begin{center}
+\begin{tabular}{|c|c|c|c|c|c|}
+  \hline &          & 5 & 7 & 6 & First Input\\
+  \hline $\times$ & & 2 & 4 & 1 & Second Input\\
+\hline            &                        & $1 \cdot 5 = 5$   & $1 \cdot 7 = 7$   & $1 \cdot 6 = 6$ & First pass \\
+                  &  $4 \cdot 5 = 20$      & $4 \cdot 7+5=33$  & $4 \cdot 6+7=31$  & 6               & Second pass \\
+   $2 \cdot 5 = 10$ &  $2 \cdot 7 + 20 = 34$ & $2 \cdot 6+33=45$ & 31                & 6             & Third pass \\
+\hline 10 & 34 & 45 & 31 & 6 & Final Result \\   
+\hline   
+\end{tabular}
+\end{center}
+\end{small}
+\caption{Comba Multiplication Diagram}
+\end{figure}
+
+At this point the vector $x = \left < 10, 34, 45, 31, 6 \right >$ is the result of the first step of the Comba multiplier.  
+Now the columns must be fixed by propagating the carry upwards.  The following trivial algorithm will accomplish this.
+
+\begin{enumerate}
+    \item for $n$ from 0 to $k - 1$ do
+    \item \hspace{3mm} $x_{n+1} \leftarrow x_{n+1} + \lfloor x_{n}/\beta \rfloor$ 
+    \item \hspace{3mm} $x_{n} \leftarrow x_{n} \mbox{ (mod }\beta\mbox{)}$
+\end{enumerate}
+
+With that algorithm and $k = 5$ and $\beta = 10$ the following vector is produced $y = \left < 1, 3, 8, 8, 1, 6 \right >$.  In this case 
+$241 \cdot 576$ is in fact $138816$ and the procedure succeeded.  If the algorithm is correct and as will be demonstrated shortly more
+efficient than the baseline algorithm why not simply always use this algorithm?
+
+\subsubsection{Column Weight.}
+At the nested $O(n^2)$ level the Comba method adds the product of two single precision variables to each column of the output 
+independently.  A serious obstacle is if the carry is lost due to lack of precision before the algorithm has a chance to fix
+the carries.  For example, in the multiplication of two three-digit numbers the third column of output will be the sum of
+three single precision multiplications.  If the precision of the accumulator for the output digits is less than $3 \cdot (\beta - 1)^2$ then
+an overflow can occur and the carry information will be lost.  For any $m$ and $n$ digit input the maximal weight of any column is 
+min$(m, n)$ which is fairly obvious.
+
+The maximal number of terms in any column of a product is known as the ``column weight'' and strictly governs when the algorithm can be used.  Recall
+from earlier that a double precision type has $\alpha$ bits of resolution and a single precision digit has $lg(\beta)$ bits of precision.  Given these
+two quantities we may not violate the following
+
+\begin{equation}
+k \cdot \left (\beta - 1 \right )^2 < 2^{\alpha}
+\end{equation}
+
+Which reduces to 
+
+\begin{equation}
+k \cdot \left ( \beta^2 - 2\beta + 1 \right ) < 2^{\alpha}
+\end{equation}
+
+Let $\rho = lg(\beta)$ represent the number of bits in a single precision digit.  By further re-arrangement of the equation the final solution is
+found.
+
+\begin{equation}
+k \cdot \left (2^{2\rho} - 2^{\rho + 1} + 1 \right ) < 2^{\alpha}
+\end{equation}
+
+The defaults for LibTomMath are $\beta = 2^{28}, \alpha = 64$ which simplifies to $72057593501057025 \cdot k < 2^{64}$ which when divided out
+results in $k < 257$.  This implies that the smallest input may not have more than $256$ digits if the Comba method is to be used in
+this configuration.  This is quite satisfactory for most applications since $256$ digits would allow for numbers in the range of $2^{7168}$ 
+which is much larger than the typical $2^{100}$ to $2^{4000}$ range most public key cryptographic algorithms use.  
+
+\newpage\begin{figure}[!here]
+\begin{small}
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{fast\_s\_mp\_mul\_digs}. \\
+\textbf{Input}.   mp\_int $a$, mp\_int $b$ and an integer $digs$ \\
+\textbf{Output}.  $c \leftarrow \vert a \vert \cdot \vert b \vert \mbox{ (mod }\beta^{digs}\mbox{)}$. \\
+\hline \\
+Place an array of \textbf{MP\_WARRAY} double precision digits named $\hat W$ on the stack. \\
+1.  If $c.alloc < digs$ then grow $c$ to $digs$ digits. (\textit{hint: use mp\_grow}) \\
+2.  If step 1 failed return(\textit{MP\_MEM}).\\
+\\
+Zero the temporary array $\hat W$. \\
+3.  for $n$ from $0$ to $digs - 1$ do \\
+\hspace{3mm}3.1  $\hat W_n \leftarrow 0$ \\
+\\
+Compute the columns. \\
+4.  for $ix$ from $0$ to $a.used - 1$ do \\
+\hspace{3mm}4.1  $pb \leftarrow \mbox{min}(b.used, digs - ix)$ \\
+\hspace{3mm}4.2  If $pb < 1$ then goto step 5. \\
+\hspace{3mm}4.3  for $iy$ from $0$ to $pb - 1$ do \\
+\hspace{6mm}4.3.1  $\hat W_{ix+iy} \leftarrow \hat W_{ix+iy} + a_{ix}b_{iy}$ \\
+\\
+Propagate the carries upwards. \\
+5.  $oldused \leftarrow c.used$ \\
+6.  $c.used \leftarrow digs$ \\
+7.  If $digs > 1$ then do \\
+\hspace{3mm}7.1.  for $ix$ from $1$ to $digs - 1$ do \\
+\hspace{6mm}7.1.1  $\hat W_{ix} \leftarrow \hat W_{ix} + \lfloor \hat W_{ix-1} / \beta \rfloor$ \\
+\hspace{6mm}7.1.2  $c_{ix - 1} \leftarrow \hat W_{ix - 1} \mbox{ (mod }\beta\mbox{)}$ \\
+8.  else do \\
+\hspace{3mm}8.1  $ix \leftarrow 0$ \\
+9.  $c_{ix} \leftarrow \hat W_{ix} \mbox{ (mod }\beta\mbox{)}$ \\
+\\
+Zero excess digits. \\
+10.  If $digs < oldused$ then do \\
+\hspace{3mm}10.1  for $n$ from $digs$ to $oldused - 1$ do \\
+\hspace{6mm}10.1.1  $c_n \leftarrow 0$ \\
+11.  Clamp excessive digits of $c$.  (\textit{hint: use mp\_clamp}) \\
+12.  Return(\textit{MP\_OKAY}). \\
+\hline
+\end{tabular}
+\end{center}
+\end{small}
+\caption{Algorithm fast\_s\_mp\_mul\_digs}
+\end{figure}
+
+\textbf{Algorithm fast\_s\_mp\_mul\_digs.}
+This algorithm performs the unsigned multiplication of $a$ and $b$ using the Comba method limited to $digs$ digits of precision.  The algorithm
+essentially performs the same calculation as algorithm s\_mp\_mul\_digs but much faster.
+
+The array $\hat W$ is meant to be on the stack when the algorithm is used.  The size of the array does not change which is ideal.  Note also that 
+unlike algorithm s\_mp\_mul\_digs no temporary mp\_int is required since the result is calculated in place in $\hat W$.  
+
+The $O(n^2)$ loop on step four is where the Comba method starts to show through.  First there is no carry variable in the loop.  Second the
+double precision multiply and add step does not have a carry fixup of any sort.  In fact the nested loop is very simple and can be implemented
+in parallel.  
+
+What makes the Comba method so attractive is that the carry propagation only takes place outside the $O(n^2)$ nested loop.  For example, if the 
+cost in terms of time of a multiply and add is $p$ and the cost of a carry propagation is $q$ then a baseline multiplication would require 
+$O \left ((p + q)n^2 \right )$ time to multiply two $n$-digit numbers.  The Comba method only requires $pn^2 + qn$ time, however, in practice 
+the speed increase is actually much more.  With $O(n)$ space the algorithm can be reduced to $O(pn + qn)$ time by implementing the $n$ multiply
+and add operations in the nested loop in parallel.  
+
+The carry propagation loop on step 7 is fairly straightforward.  It could have been written phased the other direction, that is, to assign
+to $c_{ix}$ instead of $c_{ix-1}$ in each iteration.  However, it would still require precaution to make sure that $\hat W_{ix+1}$ is not beyond
+the \textbf{MP\_WARRAY} words set aside.  
+
+EXAM,bn_fast_s_mp_mul_digs.c
+
+The memset on line @47,memset@ clears the initial $\hat W$ array to zero in a single step. Like the slower baseline multiplication
+implementation a series of aliases (\textit{lines @67, tmpx@, @70, tmpy@ and @75,_W@}) are used to simplify the inner $O(n^2)$ loop.  
+In this case a new alias $\_\hat W$ has been added which refers to the double precision columns offset by $ix$ in each pass.  
+
+The inner loop on line @84,mp_word@ is where the algorithm will spend the majority of the time.  Which is why it has been stripped to the 
+bones of any extra baggage\footnote{Hence the pointer aliases.}.  On x86 processors the multiply and add amounts to at the very least five
+instructions (\textit{two loads, two additions, one multiply}) while on the ARMv4 processors it amounts to only three (\textit{one load, one store,
+one multiply-add}).   On both the x86 and ARMv4 processors GCC v3.2 does a very good job at unrolling the loop and scheduling it so there 
+are very few dependency stalls.
+
+In theory the difference between the baseline and comba algorithms is a mere $O(qn)$ time difference.  However, in the $O(n^2)$ nested loop of the
+baseline method there are dependency stalls as the algorithm must wait for the multiplier to finish before propagating the carry to the next 
+digit.  As a result fewer of the often multiple execution units\footnote{The AMD Athlon has three execution units and the Intel P4 has four.} can
+be simultaneously used.  
+
+\subsection{Multiplication at New Bounds by Karatsuba Method}
+So far two methods of multiplication have been presented.  Both of the algorithms require asymptotically $O(n^2)$ time to multiply two $n$-digit 
+numbers together.  While the Comba method is much faster than the baseline algorithm it still requires far too much time to multiply 
+large inputs together.  In fact it was not until \cite{KARA} in 1962 that a faster algorithm had been proposed at all.
+
+The idea behind Karatsuba's method is that an input can be represented in polynomial basis as two halves then multiplied.  For example, if 
+$f(x) = ax + b$ and $g(x) = cx + d$ then the product of the two polynomials $h(x) = f(x)g(x)$ will allow $h(\beta) = (f(\beta))(g(\beta))$.  
+
+So how does this help?  First expand the product $h(x)$.
+
+\begin{center}
+\begin{tabular}{rcl}
+$h(x)$ & $=$ & $f(x)g(x)$ \\
+       & $=$ & $(ax + b)(cx + d)$ \\
+       & $=$ & $acx^2 + adx + bcx + bd$ \\
+\end{tabular}
+\end{center}
+
+The next equation is a bit of genius on the part of Karatsuba.  He proved that the previous equation is equivalent to 
+
+\begin{equation}
+h(x) = acx^2 + (ac + bd - (a - b)(c - d))x + bd
+\end{equation}
+
+Essentially the proof lies in some fairly light algebraic number theory (\textit{see \cite{KARAP} for details}) that is not important for
+the discussion.  At first glance it appears that the Karatsuba method is actually harder than the straight $O(n^2)$ approach.  
+However, further investigation will prove otherwise.  
+
+The first important observation is that both $f(x)$ and $g(x)$ are the polynomial basis representation of two-digit numbers.  This means that 
+$\left < a, b, c, d \right >$ are single digit values.  Using either the baseline or straight polynomial multiplication the old method requires
+$O \left (4(n/2)^2 \right ) = O(n^2)$ single precision multiplications.  Looking closer at Karatsuba's equation there are only three unique multiplications 
+required which are $ac$, $bd$ and $(a - b)(c - d)$.  As a result only $O \left (3 \cdot (n/2)^2 \right ) = O \left ( {3 \over 4}n^2 \right )$ 
+multiplications are required.  
+
+So far the algorithm has been discussed from the point of view of ``two-digit'' numbers.  However, there is no reason why two digits implies a range of 
+$\beta^2$.  It could just as easily represent a range of $\left (\beta^k \right)^2$ as well.  For example, the polynomial 
+$f(x) = a_3x^3 + a_2x^2 + a_1x + a_0$ could also be written as $f'(x) = a'_1x + a'_0$ where $f(\beta) = f'(\beta^2)$.  Fortunately representing an
+integer which is already in an array of radix-$\beta$ digits in polynomial basis in terms of a power of $\beta$ is very simple.  
+
+\subsubsection{Recursion}
+The Karatsuba multiplication algorithm can be applied to practically any size of input.  Therefore, it is possible that the Karatsuba method itself
+be used for the three multiplications required.  For example, when multiplying two four-digit numbers there will be three multiplications of two-digit
+numbers.  In this case the smaller multiplication requires $p(n) = {3 \over 4}n^2$ time to complete while the larger multiplication requires
+$q(n) = 3 \cdot p(n/2)$ multiplications.  
+
+By expanding $q(n)$ the following equation is achieved. 
+
+\begin{center}
+\begin{tabular}{rcl}
+$q(n)$ & $=$ & $3 \cdot p(n/2)$ \\
+       & $=$ & $3 \cdot (3 \cdot ((n/2)/2)^2)$ \\
+       & $=$ & $9 \cdot (n/4)^2$ \\
+       & $=$ & ${9 \over 16}n^2$ \\
+\end{tabular}
+\end{center}
+
+The generic expression for the multiplicand is simply $\left ( {3 \over 4} \right )^k$ for $k \ge 1$ recursions.  The maximal number of recursions
+is approximately $lg(n)$.  Putting this all in terms of a base $n$ logarithm the asymptotic running time can be deduced.
+
+\begin{center}
+\begin{tabular}{rcl}
+$lg_n \left ( \left ( {3 \over 4} \right )^{lg_2 n} \cdot n^2 \right )$ & $=$ & $lg_2 n \cdot lg_n \left ( { 3 \over 4 } \right ) + 2$ \\
+                                                                        & $=$ & $\left ( {log n \over log 2} \right ) \cdot \left ( {log \left ( {3 \over 4} \right ) \over log n } \right ) + 2$ \\
+                                                                        & $=$ & ${ log 3 - log 2^2 + 2 \cdot log 2} \over log 2$ \\
+                                                                        & $=$ & $log 3 \over log 2$ \\
+\end{tabular}
+\end{center}
+
+Which leads to a running time of $O \left ( n^{lg(3)} \right )$ which is approximately $O(n^{1.584})$.  This can lead to 
+impressive savings with fairly moderate sized numbers.  For example, when multiplying two 128-digit numbers the Karatsuba 
+method saves $14,197$ (\textit{or $86\%$ of the total}) single precision multiplications.  
+
+The immediate question becomes why not simply use Karatsuba multiplication all the time and forget about the baseline and Comba algorithms? 
+
+\subsubsection{Overhead}
+While the Karatsuba method saves on the number of single precision multiplications required this savings is not entirely free.  The product
+of three half size products must be stored somewhere as well as four additions and two subtractions performed.  These operations incur sufficient
+overhead that often for fairly trivial sized inputs the Karatsuba method is slower.
+
+\index{cutoff point}
+The \textit{cutoff point} for Karatsuba multiplication is the point at which the Karatsuba multiplication and baseline (\textit{or Comba}) meet.  
+For the purposes of this discussion call this value $x$.  For any input with $n$ digits such that $n < x$ Karatsuba multiplication will be slower 
+and for $n > x$ it will be faster.  Often the break between the two algorithms is not so clean cut in reality.  The cleaner the cut the more 
+efficient multiplication will be which is why tuning the multiplication is a very important process.  For example, a properly tuned Karatsuba 
+multiplication algorithm can multiply two $4,096$ bit numbers up to five times faster on an Athlon processor compared to the standard baseline
+algorithm.  
+
+The exact placement of the value of $x$ depends on several key factors: the cost of allocating storage for the temporary variables, the cost of 
+performing the additions and most importantly the cost of performing a single precision multiplication.  With a processor where single precision 
+multiplication is fast\footnote{The AMD Athlon for instance has a six cycle multiplier compared to the Intel P4 which has a 15 cycle multiplier.} the 
+cutoff point will move upwards.  Similarly with a slower processor the cutoff point will move downwards.  
+
+\newpage\begin{figure}[!here]
+\begin{small}
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{mp\_karatsuba\_mul}. \\
+\textbf{Input}.   mp\_int $a$ and mp\_int $b$ \\
+\textbf{Output}.  $c \leftarrow \vert a \vert \cdot \vert b \vert$ \\
+\hline \\
+1.  $B \leftarrow \mbox{min}(a.used, b.used)/2$ \\
+2.  Init the following mp\_int variables: $x0$, $x1$, $y0$, $y1$, $t1$, $x0y0$, $x1y1$.\\
+3.  If step 2 failed then return(\textit{MP\_MEM}). \\
+\\
+Split the input.  e.g. $a = x1 \cdot \beta^B + x0$ \\
+4.  $x0 \leftarrow a \mbox{ (mod }\beta^B\mbox{)}$ (\textit{hint: use mp\_mod\_2d}) \\
+5.  $y0 \leftarrow b \mbox{ (mod }\beta^B\mbox{)}$ \\
+6.  $x1 \leftarrow \lfloor a / \beta^B \rfloor$ (\textit{hint: use mp\_rshd}) \\
+7.  $y1 \leftarrow \lfloor b / \beta^B \rfloor$ \\
+\\
+Calculate the three products. \\
+8.  $x0y0 \leftarrow x0 \cdot y0$ (\textit{hint: use mp\_mul}) \\
+9.  $x1y1 \leftarrow x1 \cdot y1$ \\
+10.  $t1 \leftarrow x1 - x0$ (\textit{hint: use mp\_sub}) \\
+11.  $x0 \leftarrow y1 - y0$ \\
+12.  $t1 \leftarrow t1 \cdot x0$ \\
+\\
+Calculate the middle term. \\
+13.  $x0 \leftarrow x0y0 + x1y1$ \\
+14.  $t1 \leftarrow x0 - t1$ \\
+\\
+Calculate the final product. \\
+15.  $t1 \leftarrow t1 \cdot \beta^B$ (\textit{hint: use mp\_lshd}) \\
+16.  $x1y1 \leftarrow x1y1 \cdot \beta^{2B}$ \\
+17.  $t1 \leftarrow x0y0 + t1$ \\
+18.  $c \leftarrow t1 + x1y1$ \\
+19.  Clear all of the temporary variables. \\
+20.  Return(\textit{MP\_OKAY}).\\
+\hline 
+\end{tabular}
+\end{center}
+\end{small}
+\caption{Algorithm mp\_karatsuba\_mul}
+\end{figure}
+
+\textbf{Algorithm mp\_karatsuba\_mul.}
+
+
+\section{Squaring}
+\subsection{The Baseline Squaring Algorithm}
+\subsection{Faster Squaring by the ``Comba'' Method}
+\subsection{Karatsuba Squaring}
+\section{Tuning Algorithms}
+\subsection{How to Tune Karatsuba Algorithms}
+
+\chapter{Modular Reductions}
+\section{Basics of Modular Reduction}
+\section{The Barrett Reduction}
+\section{The Montgomery Reduction}
+\subsection{Faster ``Comba'' Montgomery Reduction}
+\subsection{Example Montgomery Algorithms}
+\section{The Diminished Radix Algorithm}
+\section{Algorithm Comparison}
+
+\chapter{Exponentiation}
+\section{Single Digit Exponentiation}
+\section{Modular Exponentiation}
+\subsection{General Case}
+\subsection{Odd or Diminished Radix Moduli}
+\section{Quick Power of Two}
+
+\chapter{Higher Level Algorithms}
+\section{Integer Division with Remainder}
+\section{Single Digit Helpers}
+\subsection{Single Digit Addition}
+\subsection{Single Digit Subtraction}
+\subsection{Single Digit Multiplication}
+\subsection{Single Digit Division}
+\subsection{Single Digit Modulo}
+\subsection{Single Digit Root Extraction}
+\section{Random Number Generation}
+\section{Formatted Output}
+\subsection{Getting The Output Size}
+\subsection{Generating Radix-n Output}
+\subsection{Reading Radix-n Input}
+\section{Unformatted Output}
+\subsection{Getting The Output Size}
+\subsection{Generating Output}
+\subsection{Reading Input}
+
+\chapter{Number Theoretic Algorithms}
+\section{Greatest Common Divisor}
+\section{Least Common Multiple}
+\section{Jacobi Symbol Computation}
+\section{Modular Inverse}
+\subsection{General Case}
+\subsection{Odd Moduli}
+\section{Primality Tests}
+\subsection{Trial Division}
+\subsection{The Fermat Test}
+\subsection{The Miller-Rabin Test}
+\subsection{Primality Test in a Bottle}
+\subsection{The Next Prime}
+\section{Root Extraction}
+
+\backmatter
+\appendix
+\begin{thebibliography}{ABCDEF}
+\bibitem[1]{TAOCPV2}
+Donald Knuth, \textit{The Art of Computer Programming}, Third Edition, Volume Two, Seminumerical Algorithms, Addison-Wesley, 1998
+
+\bibitem[2]{HAC}
+A. Menezes, P. van Oorschot, S. Vanstone, \textit{Handbook of Applied Cryptography}, CRC Press, 1996
+
+\bibitem[3]{ROSE}
+Michael Rosing, \textit{Implementing Elliptic Curve Cryptography}, Manning Publications, 1999
+
+\bibitem[4]{COMBA}
+Paul G. Comba, \textit{Exponentiation Cryptosystems on the IBM PC}. IBM Systems Journal 29(4): 526-538 (1990)
+
+\bibitem[5]{KARA}
+A. Karatsuba, Doklady Akad. Nauk SSSR 145 (1962), pp.293-294
+
+\bibitem[6]{KARAP}
+Andre Weimerskirch and Christof Paar, \textit{Generalizations of the Karatsuba Algorithm for Polynomial Multiplication}, Submitted to Designs, Codes and Cryptography, March 2002
+
+\end{thebibliography}
+
+\input{tommath.ind}
+
+\chapter{Appendix}
+\subsection*{Appendix A -- Source Listing of tommath.h}
+
+The following is the source listing of the header file ``tommath.h'' for the LibTomMath project.  It contains many of 
+the definitions used throughout the code such as \textbf{mp\_int}, \textbf{MP\_PREC} and so on.  The header is 
+presented here for completeness.
+
+LIST,tommath.h
+
+\end{document}
\ No newline at end of file
diff --git a/tommath.tex b/tommath.tex
new file mode 100644
index 0000000..ae4cb61
--- /dev/null
+++ b/tommath.tex
@@ -0,0 +1,4195 @@
+\documentclass[b5paper]{book}
+\usepackage{makeidx}
+\usepackage{amssymb}
+\usepackage{color}
+\usepackage{alltt}
+\usepackage{graphicx}
+\usepackage{layout}
+\def\union{\cup}
+\def\intersect{\cap}
+\def\getsrandom{\stackrel{\rm R}{\gets}}
+\def\cross{\times}
+\def\cat{\hspace{0.5em} \| \hspace{0.5em}}
+\def\catn{$\|$}
+\def\divides{\hspace{0.3em} | \hspace{0.3em}}
+\def\nequiv{\not\equiv}
+\def\approx{\raisebox{0.2ex}{\mbox{\small $\sim$}}}
+\def\lcm{{\rm lcm}}
+\def\gcd{{\rm gcd}}
+\def\log{{\rm log}}
+\def\ord{{\rm ord}}
+\def\abs{{\mathit abs}}
+\def\rep{{\mathit rep}}
+\def\mod{{\mathit\ mod\ }}
+\renewcommand{\pmod}[1]{\ ({\rm mod\ }{#1})}
+\newcommand{\floor}[1]{\left\lfloor{#1}\right\rfloor}
+\newcommand{\ceil}[1]{\left\lceil{#1}\right\rceil}
+\def\Or{{\rm\ or\ }}
+\def\And{{\rm\ and\ }}
+\def\iff{\hspace{1em}\Longleftrightarrow\hspace{1em}}
+\def\implies{\Rightarrow}
+\def\undefined{{\rm ``undefined"}}
+\def\Proof{\vspace{1ex}\noindent {\bf Proof:}\hspace{1em}}
+\let\oldphi\phi
+\def\phi{\varphi}
+\def\Pr{{\rm Pr}}
+\newcommand{\str}[1]{{\mathbf{#1}}}
+\def\F{{\mathbb F}}
+\def\N{{\mathbb N}}
+\def\Z{{\mathbb Z}}
+\def\R{{\mathbb R}}
+\def\C{{\mathbb C}}
+\def\Q{{\mathbb Q}}
+\definecolor{DGray}{gray}{0.5}
+\newcommand{\url}[1]{\mbox{$<${#1}$>$}}
+\newcommand{\emailaddr}[1]{\mbox{$<${#1}$>$}}
+\def\twiddle{\raisebox{0.3ex}{\mbox{\tiny $\sim$}}}
+\def\gap{\vspace{0.5ex}}
+\makeindex
+\begin{document}
+\frontmatter
+\pagestyle{empty}
+\title{Multiple-Precision Integer Arithmetic, \\ A Case Study Involving the LibTomMath Project \\ - DRAFT - }
+\author{\mbox{
+%\begin{small}
+\begin{tabular}{c}
+Tom St Denis \\
+Algonquin College \\
+\\
+Mads Rasmussen \\
+Open Communications Security \\
+\\
+Gregory Rose \\
+Qualcomm \\
+\end{tabular}
+%\end{small}
+}
+}
+\maketitle
+This text in its entirety is copyrighted \copyright{}2003 by Tom St Denis.  It may not be redistributed 
+electronically or otherwise without the sole permission of the author.  The text is freely redistributable as long as
+it is packaged along with the LibTomMath project in a non-commercial project.  Contact the
+author for other redistribution rights.
+
+This text corresponds to the v0.17 release of the LibTomMath project.
+
+\begin{alltt}
+Tom St Denis
+111 Banning Rd
+Ottawa, Ontario
+K2L 1C3
+Canada
+
+Phone: 1-613-836-3160
+Email: tomstdenis@iahu.ca
+\end{alltt}
+
+This text is formatted to the international B5 paper size of 176mm wide by 250mm tall using the \LaTeX{} 
+{\em book} macro package and the Perl {\em booker} package.
+
+\tableofcontents
+\listoffigures
+\chapter*{Preface}
+Blah.
+
+\mainmatter
+\pagestyle{headings}
+\chapter{Introduction}
+\section{Multiple Precision Arithmetic}
+\subsection{The Need for Multiple Precision Arithmetic}
+The most prevalent use for multiple precision arithmetic (\textit{often referred to as bignum math}) is within public
+key cryptography.   Algorithms such as RSA, Diffie-Hellman and Elliptic Curve Cryptography require large integers in order to 
+resist known cryptanalytic attacks.  Typical modern programming languages such as C and Java only provide small 
+single-precision data types which are incapable of precisely representing integers which are often hundreds of bits long.
+
+For example, consider multiplying $1,234,567$ by $9,876,543$ in C with an ``unsigned long'' data type.  With an 
+x86 machine the result is $4,136,875,833$ while the true result is $12,193,254,061,881$.  The original inputs 
+were approximately $21$ and $24$ bits respectively.  If the C language cannot multiply two relatively small values 
+together precisely how does anyone expect it to multiply two values which are considerably larger?
+
+Most advancements in fast multiple precision arithmetic stem from the desire for faster cryptographic primitives.  However, cryptography
+is not the only field of study that can benefit from fast large integer routines.  Another auxiliary use for multiple precision integers is 
+high precision floating point data types.  The basic IEEE standard floating point type is made up of an integer mantissa $q$ and an exponent $e$.  
+Numbers are given in the form $n = q \cdot b^e$ where $b = 2$ is convention.  Since IEEE is meant to be implemented in 
+hardware the precision of the mantissa is often fairly small (\textit{roughly 23 bits}).  Since the mantissa is merely an 
+integer a large multiple precision integer could be used.  In effect very high precision floating point arithmetic 
+could be performed.  This would be useful where scientific applications must minimize the total output error over long simulations.  
+
+\subsection{Multiple Precision Arithmetic}
+\index{multiple precision}
+Multiple precision arithmetic attempts to solve the shortcomings of single precision data types such as those from
+the C and Java programming languages.  In essence multiple precision arithmetic is a set of operations that can be 
+performed on members of an algebraic group whose precision is not fixed.  The algorithms when implemented to be multiple
+precision can allow a developer to work with any practical precision required.
+
+Typically the arithmetic is performed over the ring of integers denoted by a $\Z$ and referred to casually as ``bignum'' 
+routines.  However, it is possible to have rings of polynomials as well typically denoted by $\Z/p\Z \left [ X \right ]$ 
+which could have variable precision (\textit{or degree}).  This text will discuss implementation of the former, however,
+implementing polynomial basis routines should be relatively easy after reading this text.
+
+\subsection{Benefits of Multiple Precision Arithmetic}
+\index{precision} \index{accuracy}
+Precision is defined loosely as the proximity to the real value a given representation is.  Accuracy is defined as the 
+reproducibility of the result.  For example, the calculation $1/3 = 0.25$ is imprecise but can be accurate provided 
+it is reproducible.
+
+The benefit of multiple precision representations over single precision representations is that 
+often no precision is lost while representing the result of an operation which requires excess precision.  For example, 
+the multiplication of two $n$-bit integers requires at least $2n$ bits to represent the result.  A multiple precision 
+system would augment the precision of the destination to accommodate the result while a single precision system would
+truncate excess bits to maintain a fixed level of precision.
+
+Multiple precision representations allow for the precision to be very high (\textit{if not exacting}) but at a cost of
+modest computer resources.  The only reasonable case where a multiple precision system will lose precision is when
+emulating a floating point data type.  However, with multiple precision integer arithmetic no precision is lost.
+
+\subsection{Basis of Operations}
+At the heart of all multiple precision integer operations are the ``long-hand'' algorithms we all learnt as children 
+in grade school.  For example, to multiply $1,234$ by $981$ the student is not taught to memorize the times table for 
+$1,234$ instead they are taught how to long-multiply.  That is to multiply each column using simple single digit 
+multiplications and add the resulting products by column.  The representation that most are familiar with is known as 
+decimal or formally as radix-10. A radix-$n$ representation simply means there are $n$ possible values per digit.  
+For example, binary would be a radix-2 representation.
+
+In essence computer based multiple precision arithmetic is very much the same.  The most notable difference is the usage
+of a binary friendly radix.  That is to use a radix of the form $2^k$ where $k$ is typically the size of a machine 
+register.  Also occasionally more optimal algorithms are used to perform certain operations such as multiplication and 
+squaring instead of traditional long-hand algorithms.
+
+\section{Purpose of This Text}
+The purpose of this text is to instruct the reader regarding how to implement multiple precision algorithms.  That is 
+to not only explain the core theoretical algorithms but also the various ``house keeping'' tasks that are neglected by
+authors of other texts on the subject.  Texts such as Knuth's ``The Art of Computer Programming, vol 2.'' and the 
+Handbook of Applied Cryptography (\textit{HAC}) give considerably detailed explanations of the theoretical aspects of 
+the algorithms and very little regarding the practical aspects.  
+
+That is how an algorithm is explained and how it is actually implemented are two very different 
+realities.  For example, algorithm 14.7 on page 594 of HAC lists a relatively simple algorithm for performing multiple 
+precision integer addition.  However, what the description lacks is any discussion concerning the fact that the two 
+integer inputs may be of differing magnitudes.  Similarly the division routine (\textit{Algorithm 14.20, pp. 598}) 
+does not discuss how to handle sign or handle the dividend's decreasing magnitude in the main loop (\textit{Step \#3}).
+
+As well as the numerous practical oversights both of the texts do not discuss several key optimal algorithms required 
+such as ``Comba'' and Karatsuba multipliers and fast modular inversion.  These optimal algorithms are considerably
+vital to achieve any form of useful performance in non-trivial applications.  
+
+To solve this problem the focus of this text is on the practical aspects of implementing the algorithms that 
+constitute a multiple precision integer package with light cursory discussions on the theoretical aspects.  As a case 
+study the ``LibTomMath''\footnote{Available freely at http://math.libtomcrypt.org} package is used to demonstrate 
+algorithms with implementations that have been field tested and work very well.
+
+\section{Discussion and Notation}
+\subsection{Notation}
+A multiple precision integer of $n$-digits shall be denoted as $x = (x_{n-1} ... x_1 x_0)_{ \beta }$ to be the 
+multiple precision notation for the integer $x \equiv \sum_{i=0}^{n-1} x_i\beta^i$.  The elements of the array $x$ are
+said to be the radix $\beta$ digits of the integer.  For example, $x = (15,0,7)_{\beta}$ would represent the 
+integer $15\cdot\beta^2 + 0\cdot\beta^1 + 7\cdot\beta^0$.  
+
+A ``mp\_int'' shall refer to a composite structure which contains the digits of the integer as well as auxiliary data
+required to manipulate the data.  These additional members are discussed in chapter three.  For the purposes of this text
+a ``multiple precision integer'' and a ``mp\_int'' are synonymous.
+
+\index{single-precision} \index{double-precision} \index{mp\_digit} \index{mp\_word}
+For the purposes of this text a single-precision variable must be able to represent integers in the range $0 \le x < 2 \beta$ while
+a double-precision variable must be able to represent integers in the range $0 \le x < 2 \beta^2$.  Within the source code that will be
+presented the data type \textbf{mp\_digit} will represent a single-precision type while \textbf{mp\_word} will represent a 
+double-precision type.  In several algorithms (\textit{notably the Comba routines}) temporary results 
+will be stored in double-precision arrays.  For the purposes of this text $x_j$ will refer to the 
+$j$'th digit of a single-precision array and $\hat x_j$ will refer to the $j$'th digit of a double-precision
+array.
+
+\subsection{Work Effort}
+\index{big-O}
+To measure the efficiency of various algorithms a modified big-O notation is used.  In this system all 
+single precision operations are considered to have the same cost\footnote{Except where explicitly noted.}.  
+That is a single precision addition, multiplication and division are assumed to take the same time to 
+complete.  While this is generally not true in practice it will simplify the discussions considerably.
+
+Some algorithms have slight advantages over others which is why some constants will not be removed in 
+the notation.  For example, a normal multiplication requires $O(n^2)$ work while a squaring requires 
+$O({{n^2 + n}\over 2})$ work.  In standard big-O notation these would be said to be equivalent.  However, in the 
+context of this text the magnitude of the inputs will not approach an infinite size.  This means the conventional limit 
+notation wisdom does not apply to the cancellation of constants.
+
+Throughout the discussions various ``work levels'' will be discussed.  These levels are the $O(1)$,
+$O(n)$, $O(n^2)$, ..., $O(n^k)$ work efforts.  For example, operations at the $O(n^k)$ ``level'' are said to be
+executed more frequently than operations at the $O(n^m)$ ``level'' when $k > m$.  Obviously most optimizations will pay
+off the most at the higher levels since they represent the bulk of the effort required.  
+
+\section{Exercises}
+Within the more advanced chapters a section will be set aside to give the reader some challenging exercises.  These exercises are not 
+designed to be prize winning problems yet instead to be thought provoking.  Wherever possible the problems are forward minded stating 
+problems that will be answered in subsequent chapters.  The reader is encouraged to finish the exercises as they appear to get a 
+better understanding of the subject material.  
+
+Similar to the exercises of \cite{TAOCPV2} as explained on pp.\textit{ix} these exercises are given a scoring system.  However, unlike 
+\cite{TAOCPV2} the problems do not get nearly as hard as often.  The scoring of these exercises ranges from one (\textit{the easiest}) to
+five (\textit{the hardest}).  The following table summarizes the scoring.
+
+\vspace{5mm}
+\begin{tabular}{cl}
+$\left [ 1 \right ]$ & An easy problem that should only take the reader a matter of \\
+                     & minutes to solve.  Usually does not involve much computer time. \\
+                     & \\
+$\left [ 2 \right ]$ & An easy problem that involves a marginal amount of computer \\
+                     & time usage.  Usually requires a program to be written to \\
+                     & solve the problem. \\
+                     & \\
+$\left [ 3 \right ]$ & A moderately hard problem that requires a non-trivial amount \\
+                     & of work.  Usually involves trivial research and development of \\
+                     & new theory from the perspective of a student. \\
+                     & \\
+$\left [ 4 \right ]$ & A moderately hard problem that involves a non-trivial amount \\
+                     & of work and research.  The solution to which will demonstrate \\
+                     & a higher mastery of the subject matter. \\
+                     & \\
+$\left [ 5 \right ]$ & A hard problem that involves concepts that are non-trivial.  \\
+                     & Solutions to these problems will demonstrate a complete mastery \\
+                     & of the given subject. \\
+                     & \\
+\end{tabular}
+
+Essentially problems at the first level are meant to be simple questions that the reader can answer quickly without programming a solution or
+devising new theory.  These problems are quick tests to see if the material is understood.  Problems at the second level are also
+designed to be easy but will require a program or algorithm to be implemented to arrive at the answer.  
+
+Problems at the third level are meant to be a bit more difficult.  Often the answer is fairly obvious but arriving at an exacting solution
+requires some thought and skill.  These problems will almost always involve devising a new algorithm or implementing a variation of
+another algorithm.
+
+Problems at the fourth level are meant to be even more difficult as well as involve some research.  The reader will most likely not know
+the answer right away nor will this text provide the exact details of the answer (\textit{or at least not until a subsequent chapter}).  Problems
+at the fifth level are meant to be the hardest problems relative to all the other problems in the chapter.  People who can correctly 
+answer fifth level problems have a mastery of the subject matter at hand.
+
+Often problems will be tied together.  The purpose of this is to start a chain of thought that will be discussed in future chapters.  The reader
+is encouraged to answer the follow-up problems and try to draw the relevance of problems.
+
+\chapter{Introduction to LibTomMath}
+
+\section{What is the LibTomMath?}
+LibTomMath is a free and open source multiple precision number theoretic library written in portable ISO C
+source code.  By portable it is meant that the library does not contain any code that is platform dependent or otherwise
+problematic to use on any given platform.  The library has been successfully tested under numerous operating systems 
+including Solaris, MacOS, Windows, Linux, PalmOS and on standalone hardware such as the Gameboy Advance.  The 
+library is designed to contain enough functionality to be able to develop number theoretic applications such as public 
+key cryptosystems.
+
+\section{Goals of the LibTomMath}
+
+Even though the library is written entirely in portable ISO C considerable care has been taken to 
+optimize the algorithm implementations within the library.  Specifically the code has been written to work well with
+the GNU C Compiler (\textit{GCC}) on both x86 and ARMv4 processors.  Wherever possible optimal 
+algorithms (\textit{such as Karatsuba multiplication, sliding window exponentiation and Montgomery reduction.}) have 
+been provided to make the library as efficient as possible.  Even with the optimal and sometimes specialized 
+algorithms that have been included the API has been kept as simple as possible.  Often generic place holder routines 
+will make use of specialized algorithms automatically without the developer's attention.  One such example
+is the generic multiplication algorithm \textbf{mp\_mul()} which will automatically use Karatsuba multiplication if the 
+inputs are of a specific size.
+
+Making LibTomMath as efficient as possible is not the only goal of the LibTomMath project.  Ideally the library should 
+be source compatible with another popular library which makes it more attractive for developers to use.  In this case the
+MPI library was used as an API template for all the basic functions.
+
+The project is also meant to act as a learning tool for students.  The logic being that no easy to follow ``bignum'' 
+library exists which can be used to teach computer science students how to perform fast and reliable multiple precision 
+arithmetic.  To this end the source code has been given quite a few comments and algorithm discussion points.  Often 
+where applicable routines have more comments than lines of code.
+
+\section{Choice of LibTomMath}
+LibTomMath was chosen as the case study of this text not only because the author of both projects is one and the same but
+for more worthy reasons.  Other libraries such as GMP, MPI, LIP and OpenSSL have multiple precision 
+integer arithmetic routines but would not be ideal for this text for numerous reasons as will be explained in the 
+following sub-sections.
+
+\subsection{Code Base}
+The LibTomMath code base is all portable ISO C source code.  This means that there are no platform dependent conditional
+segments of code littered throughout the source.  This clean and uncluttered approach to the library means that a
+developer can more readily ascertain the true intent of a given section of source code without trying to keep track of
+what conditional code will be used.
+
+The code base of LibTomMath is also exceptionally well organized.  Each function is in its own separate source code file 
+which allows the reader to find a given function very fast.  When compiled with GCC for the x86 processor the entire 
+library is a mere 87,760 bytes (\textit{$116,182$ bytes for ARMv4 processors}).  This includes every single function 
+LibTomMath provides from basic arithmetic to various number theoretic functions such as modular exponentiation, various 
+reduction algorithms and Jacobi symbol computation.  
+
+By comparison MPI which has fewer number theoretic functions than LibTomMath compiled with the same conditions is 
+45,429 bytes (\textit{$54,536$ for ARMv4}).  GMP which has a rather large collection of functions with the default 
+configuration on an x86 Athlon is 2,950,688 bytes.  Note that while LibTomMath has fewer functions than GMP it has
+been used as the sole basis for several public key cryptosystems without having to seek additional outside functions
+to supplement the library.
+
+\subsection{API Simplicity}
+LibTomMath is designed after the MPI library and shares the API design.  Quite often programs that use MPI will build 
+with LibTomMath without change. The function names are relatively straight forward as to what they perform.  Almost all of the 
+functions share the same parameter passing convention; the few minor exceptions, as will be discussed, exist for good 
+reasons.  The learning curve is fairly shallow with the API provided which is an extremely valuable benefit for the 
+student and developer alike.  
+
+The LIP library is an example of a library with an API that is awkward to work with.  LIP uses function names that are often ``compressed'' to 
+illegible short hand.  LibTomMath does not share this fault.
+
+\subsection{Optimizations}
+While LibTomMath is certainly not the fastest library (\textit{GMP often beats LibTomMath by a factor of two}) it does
+feature a set of optimal algorithms for tasks ranging from modular reduction to squaring.  GMP and LIP also feature
+such optimizations while MPI only uses baseline algorithms with no optimizations.
+
+LibTomMath is almost always an order of magnitude faster than the MPI library at computationally expensive tasks such as modular
+exponentiation.  In the grand scheme of ``bignum'' libraries LibTomMath is faster than the average library and usually  
+slower than the best libraries such as GMP and OpenSSL by a small factor.
+
+\subsection{Portability and Stability}
+LibTomMath will build ``out of the box'' on any platform equipped with a modern version of the GNU C Compiler 
+(\textit{GCC}).  This means that without changes the library will build without configuration or setting up any 
+variables.  LIP and MPI will build ``out of the box'' as well but have numerous known bugs.  Most notably the author of 
+MPI is not working on his library anymore.  
+
+GMP requires a configuration script to run and will not build out of the box.   GMP and LibTomMath are still in active
+development and are very stable across a variety of platforms.
+
+\subsection{Choice}
+LibTomMath is a relatively compact, well documented, highly optimized and portable library which seems only natural for
+the case study of this text.  Various source files from the LibTomMath project will be included within the text.  However, the 
+reader is encouraged to download their own copy of the library to actually be able to work with the library.  
+
+\chapter{Getting Started}
+\section{Library Basics}
+To get the ``ball rolling'' so to speak a primitive data type and a series of primitive algorithms must be established.  First a data
+type that will hold the information required to maintain a multiple precision integer must be designed.  With this basic data type a series
+of low level algorithms for initializing, clearing, growing and clamping integers can be developed to form the basis of the entire
+package of algorithms.
+
+\section{The mp\_int structure}
+First the data type for storing multiple precision integers must be designed.  This data type must be able to hold information to 
+maintain an array of digits, how many are actually used in the representation and the sign.  The ISO C standard does not provide for 
+any such data type but it does provide for making composite data types known as structures.  The following is the structure definition 
+used within LibTomMath.
+
+\index{mp\_int}
+\begin{verbatim}
+typedef struct  {
+    int used, alloc, sign;
+    mp_digit *dp;
+} mp_int;
+\end{verbatim}
+
+The \textbf{used} parameter denotes how many digits of the array \textbf{dp} are actually being used.  The array 
+\textbf{dp} holds the digits that represent the integer desired.  The \textbf{alloc} parameter denotes how 
+many digits are available in the array to use by functions before it has to increase in size.  When the \textbf{used} count 
+of a result would exceed the \textbf{alloc} count all LibTomMath routines will automatically increase the size of the 
+array to accommodate the precision of the result.  The \textbf{sign} parameter denotes the sign as either zero/positive 
+(\textbf{MP\_ZPOS}) or negative (\textbf{MP\_NEG}).  
+
+\section{Argument Passing}
+A convention of argument passing must be adopted early on in the development of any library.  Making the function prototypes
+consistent will help eliminate many headaches in the future as the library grows to significant complexity.  In LibTomMath the multiple precision 
+integer functions accept parameters from left to right as pointers to mp\_int structures.  That means that the source operands are 
+placed on the left and the destination on the right.   Consider the following examples.
+
+\begin{verbatim}
+   mp_mul(&a, &b, &c);   /* c = a * b */
+   mp_add(&a, &b, &a);   /* a = a + b */
+   mp_sqr(&a, &b);       /* b = a * a */
+\end{verbatim}
+
+The left to right order is a fairly natural way to implement the functions since it lets the developer read aloud the
+functions and make sense of them.  For example, the first function would read ``multiply a and b and store in c''.
+
+Certain libraries (\textit{LIP by Lenstra for instance}) accept parameters the other way around.  That is the destination
+on the left and arguments on the right.  In truth it is entirely a matter of preference.  
+
+Another very useful design consideration is whether to allow argument sources to also be a destination.  For example, the
+second example (\textit{mp\_add}) adds $a$ to $b$ and stores in $a$.  This is an important feature to implement since it
+allows the higher up functions to cut down on the number of variables.  However, to implement this feature specific
+care has to be given to ensure the destination is not written before the source is fully read.
+
+\section{Return Values}
+A well implemented library, no matter what its purpose, should trap as many runtime errors as possible and return them to the 
+caller.  By catching runtime errors a library can be guaranteed to prevent undefined behaviour within reason.  In a multiple precision 
+library the only errors that are bound to occur are related to inappropriate inputs (\textit{division by zero for instance}) or 
+memory allocation errors.
+
+In LibTomMath any function that can cause a runtime error will return an error as an \textbf{int} data type with one of the 
+following values.
+
+\index{MP\_OKAY} \index{MP\_VAL} \index{MP\_MEM}
+\begin{center}
+\begin{tabular}{|l|l|}
+\hline \textbf{Value} & \textbf{Meaning} \\
+\hline \textbf{MP\_OKAY} & The function was successful \\
+\hline \textbf{MP\_VAL}  & One of the input value(s) was invalid \\
+\hline \textbf{MP\_MEM}  & The function ran out of heap memory \\
+\hline
+\end{tabular}
+\end{center}
+
+When an error is detected within a function it should free any memory it allocated and return as soon as possible.  The goal
+is to leave the system in the same state the system was when the function was called.  Error checking with this style of API is fairly simple.
+
+\begin{verbatim}
+   int err;
+   if ((err = mp_add(&a, &b, &c)) != MP_OKAY) {
+      printf("Error: %d\n", err);
+      exit(EXIT_FAILURE);
+   }
+\end{verbatim}
+
+The GMP library uses C style \textit{signals} to flag errors which is of questionable use.  Not all errors are fatal 
+and it is not ideal to force developers to have signal handlers for such cases.
+
+\section{Initialization and Clearing}
+The logical starting point when actually writing multiple precision integer functions is the initialization and 
+clearing of the integers.  These two functions will be used by far the most throughout the algorithms whenever 
+temporary integers are required.
+
+Given the basic mp\_int structure an initialization routine must first allocate memory to hold the digits of
+the integer.  Often it is optimal to allocate a sufficiently large pre-set number of digits even considering
+the initial integer will represent zero.  If only a single digit were allocated quite a few re-allocations
+would occur for the majority of inputs.  There exists a tradeoff between how many default digits to allocate
+and how many re-allocations are tolerable.  
+
+If the memory for the digits has been successfully allocated then the rest of the members of the structure must
+be initialized.  Since the initial state is to represent a zero integer the digits allocated must all be zeroed.  The
+\textbf{used} count is set to zero and the \textbf{sign} is set to \textbf{MP\_ZPOS}.
+
+\subsection{Initializing an mp\_int}
+To initialize an mp\_int the mp\_init algorithm shall be used.  The purpose of this algorithm is to allocate 
+the memory required and initialize the integer to a default representation of zero.
+
+\begin{figure}[here]
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{mp\_init}. \\
+\textbf{Input}.   An mp\_int $a$ \\
+\textbf{Output}.  Allocate memory for the digits and set to a zero state. \\
+\hline \\
+1.  Allocate memory for \textbf{MP\_PREC} digits. \\
+2.  If the allocation failed then return(\textit{MP\_MEM}) \\
+3.  for $n$ from $0$ to $MP\_PREC - 1$ do  \\
+\hspace{3mm}3.1  $a_n \leftarrow 0$\\
+4.  $a.sign \leftarrow MP\_ZPOS$\\
+5.  $a.used \leftarrow 0$\\
+6.  $a.alloc \leftarrow MP\_PREC$\\
+7.  Return(\textit{MP\_OKAY})\\
+\hline
+\end{tabular}
+\end{center}
+\caption{Algorithm mp\_init}
+\end{figure}
+
+\textbf{Algorithm mp\_init.}
+The \textbf{MP\_PREC} variable is a simple constant used to dictate minimal precision of allocated integers.  It is ideally at least equal to $32$ but 
+can be any reasonable power of two.  Step one and two allocate the memory and account for it.  If the allocation fails the algorithm returns
+immediately to signal the failure.  Step three will ensure that all the digits are in the default state of zero.  Finally steps 
+four through six set the default settings of the \textbf{sign}, \textbf{used} and \textbf{alloc} members of the mp\_int structure.
+
+\index{bn\_mp\_init.c}
+\vspace{+3mm}\begin{small}
+\hspace{-5.1mm}{\bf File}: bn\_mp\_init.c
+\vspace{-3mm}
+\begin{alltt}
+016   
+017   /* init a new bigint */
+018   int
+019   mp_init (mp_int * a)
+020   \{
+021     /* allocate ram required and clear it */
+022     a->dp = OPT_CAST calloc (sizeof (mp_digit), MP_PREC);
+023     if (a->dp == NULL) \{
+024       return MP_MEM;
+025     \}
+026   
+027     /* set the used to zero, allocated digit to the default precision
+028      * and sign to positive */
+029     a->used  = 0;
+030     a->alloc = MP_PREC;
+031     a->sign  = MP_ZPOS;
+032   
+033     return MP_OKAY;
+034   \}
+\end{alltt}
+\end{small}
+
+The \textbf{OPT\_CAST} type cast on line 22 is designed to allow C++ compilers to build the code out of
+the box.  Microsoft C V5.00 is known to cause problems without the cast.  Also note that if the memory
+allocation fails the other members of the mp\_int will be in an undefined state.  The code from 
+line 29 to line 31 sets the default state for a mp\_int which is zero, positive and no used digits.
+
+\subsection{Clearing an mp\_int}
+When an mp\_int is no longer required the memory allocated for it can be cleared from the heap with 
+the mp\_clear algorithm.
+
+\begin{figure}[here]
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{mp\_clear}. \\
+\textbf{Input}.   An mp\_int $a$ \\
+\textbf{Output}.  The memory for $a$ is cleared. \\
+\hline \\
+1.  If $a$ has been previously freed then return(\textit{MP\_OKAY}). \\
+2.  Free the digits of $a$ and mark $a$ as freed. \\
+3.  $a.used \leftarrow 0$ \\
+4.  $a.alloc \leftarrow 0$ \\
+5.  Return(\textit{MP\_OKAY}). \\
+\hline
+\end{tabular}
+\end{center}
+\caption{Algorithm mp\_clear}
+\end{figure}
+
+\textbf{Algorithm mp\_clear.}
+In steps one and two the memory for the digits is only freed if it had not been previously released.  
+This is more of a concern for the implementation since it is used to prevent ``double-free'' errors.  It also helps catch
+code errors where mp\_ints are used after being cleared.  Similarly steps three and four set the 
+\textbf{used} and \textbf{alloc} to known values which would be easy to spot during debugging.  For example, if an mp\_int is expected
+to be non-zero and its \textbf{used} member observed to be zero (\textit{due to being cleared}) then an obvious bug in the code has been
+spotted.
+
+\index{bn\_mp\_clear.c}
+\vspace{+3mm}\begin{small}
+\hspace{-5.1mm}{\bf File}: bn\_mp\_clear.c
+\vspace{-3mm}
+\begin{alltt}
+016   
+017   /* clear one (frees)  */
+018   void
+019   mp_clear (mp_int * a)
+020   \{
+021     if (a->dp != NULL) \{
+022   
+023       /* first zero the digits */
+024       memset (a->dp, 0, sizeof (mp_digit) * a->used);
+025   
+026       /* free ram */
+027       free (a->dp);
+028   
+029       /* reset members to make debugging easier */
+030       a->dp = NULL;
+031       a->alloc = a->used = 0;
+032     \}
+033   \}
+\end{alltt}
+\end{small}
+
+The \textbf{if} statement on line 21 prevents the heap from being corrupted if a user double-frees an 
+mp\_int.  For example, a trivial case of this bug would be as follows.
+
+\begin{verbatim}
+mp_int a;
+mp_init(&a);
+mp_clear(&a);
+mp_clear(&a);
+\end{verbatim}
+
+Without that check the code would try to free the memory allocated for the digits twice which will cause most standard C
+libraries to cause a fault.  Also by setting the pointer to \textbf{NULL} it helps debug code that may inadvertently 
+free the mp\_int before it is truly not needed.  The allocated digits are set to zero before being freed on line 24.  
+This is ideal for cryptographic situations where the mp\_int is a secret parameter.
+
+The following snippet is an example of using both the init and clear functions.  
+
+\begin{small}
+\begin{verbatim}
+#include <tommath.h>
+#include <stdio.h>
+#include <stdlib.h>
+int main(void)
+{
+   mp_int num;
+   int err;
+   
+   /* init the bignum */
+   if ((err = mp_init(&num)) != MP_OKAY) {
+      printf("Error: %d\n", err);
+      return EXIT_FAILURE;
+   }
+   
+   /* do work with it ... */
+   
+   /* clear up */
+   mp_clear(&num);
+   
+   return EXIT_SUCCESS;
+}
+\end{verbatim}
+\end{small}
+
+\section{Other Initialization Routines}
+
+It is often helpful to have specialized initialization algorithms to simplify the design of other algorithms.  For example, an 
+initialization followed by a copy is a common operation when temporary copies of integers are required.  It is quite
+beneficial to have a series of simple helper functions available.
+
+\subsection{Initializing Variable Sized mp\_int Structures}
+Occasionally the number of digits required will be known in advance of an initialization.  In these
+cases the mp\_init\_size algorithm can be of use.  The purpose of this algorithm is similar to mp\_init except that 
+it will allocate \textit{at least} a specified number of digits.  This is ideal to prevent re-allocations when the 
+input size is known.
+
+\newpage\begin{figure}[here]
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{mp\_init\_size}. \\
+\textbf{Input}.   An mp\_int $a$ and the requested number of digits $b$\\
+\textbf{Output}.  $a$ is initialized to hold at least $b$ digits. \\
+\hline \\
+1.  $u \leftarrow b\mbox{ (mod }MP\_PREC\mbox{)}$ \\
+2.  $v \leftarrow b + 2 \cdot MP\_PREC - u$ \\
+3.  Allocate $v$ digits. \\
+4.  If the allocation failed then return(\textit{MP\_MEM}). \\
+5.  for $n$ from $0$ to $v - 1$ do \\
+\hspace{3mm}5.1  $a_n \leftarrow 0$ \\
+6.  $a.sign \leftarrow MP\_ZPOS$\\
+7.  $a.used \leftarrow 0$\\
+8.  $a.alloc \leftarrow v$\\
+9.  Return(\textit{MP\_OKAY})\\
+\hline
+\end{tabular}
+\end{center}
+\caption{Algorithm mp\_init\_size}
+\end{figure}
+
+\textbf{Algorithm mp\_init\_size.}
+The value of $v$ is calculated to be at least the requested amount of digits $b$ plus additional padding.  The padding is calculated
+to be at least \textbf{MP\_PREC} digits plus enough digits to make the digit count a multiple of \textbf{MP\_PREC}.  This padding is used to 
+prevent trivial allocations from becoming a bottleneck in the rest of the algorithms that depend on this.
+
+\index{bn\_mp\_init\_size.c}
+\vspace{+3mm}\begin{small}
+\hspace{-5.1mm}{\bf File}: bn\_mp\_init\_size.c
+\vspace{-3mm}
+\begin{alltt}
+016   
+017   /* init a mp_init and grow it to a given size */
+018   int
+019   mp_init_size (mp_int * a, int size)
+020   \{
+021   
+022     /* pad size so there are always extra digits */
+023     size += (MP_PREC * 2) - (size & (MP_PREC - 1));    
+024     
+025     /* alloc mem */
+026     a->dp = OPT_CAST calloc (sizeof (mp_digit), size);
+027     if (a->dp == NULL) \{
+028       return MP_MEM;
+029     \}
+030     a->used = 0;
+031     a->alloc = size;
+032     a->sign = MP_ZPOS;
+033   
+034     return MP_OKAY;
+035   \}
+\end{alltt}
+\end{small}
+
+Line 23 will ensure that the number of digits actually allocated is padded up to the next multiple of 
+\textbf{MP\_PREC} plus an additional \textbf{MP\_PREC}.  This ensures that the number of allocated digits is 
+always greater than the amount requested.  As a result it prevents many trivial memory allocations.  The value of 
+\textbf{MP\_PREC} is defined in ``tommath.h'' and must be a power of two.
+
+\subsection{Creating a Clone}
+Another common sequence of operations is to make a local temporary copy of an argument.  To initialize then copy a mp\_int will be known as 
+creating a clone.  This is useful within functions that need to modify an integer argument but do not wish to actually modify the original copy.  
+The mp\_init\_copy algorithm will perform this very task.
+
+\begin{figure}[here]
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{mp\_init\_copy}. \\
+\textbf{Input}.   An mp\_int $a$ and $b$\\
+\textbf{Output}.  $a$ is initialized to be a copy of $b$. \\
+\hline \\
+1.  Init $a$.  (\textit{hint: use mp\_init}) \\
+2.  If the init of $a$ was unsuccessful return(\textit{MP\_MEM}) \\
+3.  Copy $b$ to $a$.  (\textit{hint: use mp\_copy}) \\
+4.  Return the status of the copy operation. \\
+\hline
+\end{tabular}
+\end{center}
+\caption{Algorithm mp\_init\_copy}
+\end{figure}
+
+\textbf{Algorithm mp\_init\_copy.}
+This algorithm will initialize a mp\_int variable and copy another previously initialized mp\_int variable into it.  The algorithm will
+detect when the initialization fails and returns the error to the calling algorithm.  As such this algorithm will perform two operations
+in one step.  
+
+\index{bn\_mp\_init\_copy.c}
+\vspace{+3mm}\begin{small}
+\hspace{-5.1mm}{\bf File}: bn\_mp\_init\_copy.c
+\vspace{-3mm}
+\begin{alltt}
+016   
+017   /* creates "a" then copies b into it */
+018   int
+019   mp_init_copy (mp_int * a, mp_int * b)
+020   \{
+021     int     res;
+022   
+023     if ((res = mp_init (a)) != MP_OKAY) \{
+024       return res;
+025     \}
+026     return mp_copy (b, a);
+027   \}
+\end{alltt}
+\end{small}
+
+This will initialize \textbf{a} and make it a verbatim copy of the contents of \textbf{b}.  Note that 
+\textbf{a} will have its own memory allocated which means that \textbf{b} may be cleared after the call
+and \textbf{a} will be left intact.  
+
+\subsection{Multiple Integer Initializations}
+Occasionally a function will require a series of mp\_int data types to be made available.  The mp\_init\_multi algorithm
+is provided to simplify such cases.  The purpose of this algorithm is to initialize a variable length array of mp\_int 
+structures at once.  As a result algorithms that require multiple integers only have to use 
+one algorithm to initialize all the mp\_int variables.
+
+\begin{figure}[here]
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{mp\_init\_multi}. \\
+\textbf{Input}.   Variable length array of mp\_int variables of length $k$. \\
+\textbf{Output}.  The array is initialized such that each mp\_int is ready to use. \\
+\hline \\
+1.  for $n$ from 0 to $k - 1$ do \\
+\hspace{+3mm}1.1.  Initialize the $n$'th mp\_int (\textit{hint: use mp\_init}) \\
+\hspace{+3mm}1.2.  If initialization failed then do \\
+\hspace{+6mm}1.2.1.  for $j$ from $0$ to $n - 1$ do \\
+\hspace{+9mm}1.2.1.1.  Free the $j$'th mp\_int (\textit{hint: use mp\_clear}) \\
+\hspace{+6mm}1.2.2.   Return(\textit{MP\_MEM}) \\
+2.  Return(\textit{MP\_OKAY}) \\
+\hline
+\end{tabular}
+\end{center}
+\caption{Algorithm mp\_init\_multi}
+\end{figure}
+
+\textbf{Algorithm mp\_init\_multi.}
+The algorithm will initialize the array of mp\_int variables one at a time.  As soon as a runtime error is detected (\textit{step 1.2}) all of
+the previously initialized variables are cleared.  The goal is an ``all or nothing'' initialization which allows for quick recovery from runtime 
+errors.
+
+\subsection{Multiple Integer Clearing}
+Similarly to clear a variable length list of mp\_int structures the mp\_clear\_multi algorithm will be used.
+
+\index{bn\_mp\_multi.c}
+\vspace{+3mm}\begin{small}
+\hspace{-5.1mm}{\bf File}: bn\_mp\_multi.c
+\vspace{-3mm}
+\begin{alltt}
+016   #include <stdarg.h>
+017   
+018   int mp_init_multi(mp_int *mp, ...) 
+019   \{
+020       mp_err res = MP_OKAY;      /* Assume ok until proven otherwise */
+021       int n = 0;                 /* Number of ok inits */
+022       mp_int* cur_arg = mp;
+023       va_list args;
+024   
+025       va_start(args, mp);        /* init args to next argument from caller */
+026       while (cur_arg != NULL) \{
+027           if (mp_init(cur_arg) != MP_OKAY) \{
+028               /* Oops - error! Back-track and mp_clear what we already
+029                  succeeded in init-ing, then return error.
+030               */
+031               va_list clean_args;
+032               
+033               /* end the current list */
+034               va_end(args);
+035               
+036               /* now start cleaning up */            
+037               cur_arg = mp;
+038               va_start(clean_args, mp);
+039               while (n--) \{
+040                   mp_clear(cur_arg);
+041                   cur_arg = va_arg(clean_args, mp_int*);
+042               \}
+043               va_end(clean_args);
+044               res = MP_MEM;
+045               break;
+046           \}
+047           n++;
+048           cur_arg = va_arg(args, mp_int*);
+049       \}
+050       va_end(args);
+051       return res;                /* Assumed ok, if error flagged above. */
+052   \}
+053   
+054   void mp_clear_multi(mp_int *mp, ...) 
+055   \{
+056       mp_int* next_mp = mp;
+057       va_list args;
+058       va_start(args, mp);
+059       while (next_mp != NULL) \{
+060           mp_clear(next_mp);
+061           next_mp = va_arg(args, mp_int*);
+062       \}
+063       va_end(args);
+064   \}
+\end{alltt}
+\end{small}
+
+Consider the following snippet which demonstrates how to use both routines.
+\begin{small}
+\begin{verbatim}
+#include <tommath.h>
+#include <stdio.h>
+#include <stdlib.h>
+int main(void)
+{
+   mp_int num1, num2, num3;
+   int err;
+   
+   if ((err = mp_init_multi(&num1, &num2, &num3, NULL)) != MP_OKAY) {
+      printf("Error: %d\n", err);
+      return EXIT_FAILURE;
+   }
+   
+   /* at this point num1/num2/num3 are ready */
+   
+   /* free them */
+   mp_clear_multi(&num1, &num2, &num3, NULL);
+   
+   return EXIT_SUCCESS;
+}
+\end{verbatim}
+\end{small}
+
+\section{Maintenance}
+A small useful collection of mp\_int maintenance functions will also prove useful.  
+
+\subsection{Augmenting Integer Precision}
+When storing a value in an mp\_int sufficient digits must be available to accommodate the entire value without
+loss of precision.  Quite often the size of the array given by the \textbf{alloc} member is large enough to simply
+increase the \textbf{used} digit count.  However, when the size of the array is too small it must be re-sized 
+appropriately to accommodate the result.  The mp\_grow algorithm will provide this functionality.
+
+\begin{figure}[here]
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{mp\_grow}. \\
+\textbf{Input}.   An mp\_int $a$ and an integer $b$. \\
+\textbf{Output}.  $a$ is expanded to accommodate $b$ digits. \\
+\hline \\
+1.  if $a.alloc \ge b$ then return(\textit{MP\_OKAY}) \\
+2.  $u \leftarrow b\mbox{ (mod }MP\_PREC\mbox{)}$ \\
+3.  $v \leftarrow b + 2 \cdot MP\_PREC - u$ \\
+4.  Re-Allocate the array of digits $a$ to size $v$ \\
+5.  If the allocation failed then return(\textit{MP\_MEM}). \\
+6.  for $n$ from $a.alloc$ to $v - 1$ do  \\
+\hspace{+3mm}6.1  $a_n \leftarrow 0$ \\
+7.  $a.alloc \leftarrow v$ \\
+8.  Return(\textit{MP\_OKAY}) \\
+\hline
+\end{tabular}
+\end{center}
+\caption{Algorithm mp\_grow}
+\end{figure}
+
+\textbf{Algorithm mp\_grow.}
+Step one will prevent a re-allocation from being performed if it was not required.  This is useful to prevent mp\_ints
+from growing excessively in code that erroneously calls mp\_grow.  Similar to mp\_init\_size the requested digit count
+is padded to provide more digits than requested.  
+
+In step four it is assumed that the reallocation leaves the lower $a.alloc$ digits intact.  Much akin to how the 
+\textit{realloc} function from the standard C library works.  Since the newly allocated digits are assumed to contain
+undefined values they are also initially zeroed.
+
+\index{bn\_mp\_grow.c}
+\vspace{+3mm}\begin{small}
+\hspace{-5.1mm}{\bf File}: bn\_mp\_grow.c
+\vspace{-3mm}
+\begin{alltt}
+016   
+017   /* grow as required */
+018   int
+019   mp_grow (mp_int * a, int size)
+020   \{
+021     int     i;
+022   
+023     /* if the alloc size is smaller alloc more ram */
+024     if (a->alloc < size) \{
+025       /* ensure there are always at least MP_PREC digits extra on top */
+026       size += (MP_PREC * 2) - (size & (MP_PREC - 1));     
+027   
+028       a->dp = OPT_CAST realloc (a->dp, sizeof (mp_digit) * size);
+029       if (a->dp == NULL) \{
+030         return MP_MEM;
+031       \}
+032   
+033       /* zero excess digits */
+034       i        = a->alloc;
+035       a->alloc = size;
+036       for (; i < a->alloc; i++) \{
+037         a->dp[i] = 0;
+038       \}
+039     \}
+040     return MP_OKAY;
+041   \}
+\end{alltt}
+\end{small}
+
+The first step is to see if we actually need to perform a re-allocation at all.  This is tested for on line 
+24.  Similar to mp\_init\_size the same code on line 26 was used to resize the 
+digits requested.  A simple for loop from line 34 to line 38 will zero all digits that were above the 
+old \textbf{alloc} limit to make sure the integer is in a known state.
+
+\subsection{Clamping Excess Digits}
+When a function anticipates a result will be $n$ digits it is simpler to assume this is true within the body of 
+the function.  For example, a multiplication of a $i$ digit number by a $j$ digit produces a result of at most 
+$i + j + 1$ digits.  It is entirely possible that the result is $i + j$ though, with no final carry into the last 
+position.  However, suppose the destination had to be first expanded (\textit{via mp\_grow}) to accommodate $i + j$
+digits then further expanded to accommodate the final carry.  That would be a considerable waste of time since heap
+operations are relatively slow.
+
+The ideal solution is to always assume the result is $i + j + 1$ and fix up the \textbf{used} count after the function
+terminates.  This way a single heap operation (\textit{at most}) is required.  However, if the result was not checked
+there would be an excess high order zero digit.  
+
+For example, suppose the product of two integers was $x_n = (0x_{n-1}x_{n-2}...x_0)_{\beta}$.  The leading zero digit 
+will not contribute to the precision of the result.  In fact, through subsequent operations more leading zero digits would
+accumulate to the point the size of the integer would be prohibitive.  As a result even though the precision is very 
+low the representation is excessively large.  
+
+The mp\_clamp algorithm is designed to solve this very problem.  It will trim leading zeros by decrementing the 
+\textbf{used} count until a non-zero leading digit is found.  Also in this system, zero is considered to be a positive 
+number which means that if the \textbf{used} count is decremented to zero the sign must be set to \textbf{MP\_ZPOS}.
+
+\begin{figure}[here]
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{mp\_clamp}. \\
+\textbf{Input}.   An mp\_int $a$ \\
+\textbf{Output}.  Any excess leading zero digits of $a$ are removed \\
+\hline \\
+1.  while $a.used > 0$ and $a_{a.used - 1} = 0$ do \\
+\hspace{+3mm}1.1  $a.used \leftarrow a.used - 1$ \\
+2.  if $a.used = 0$ then do \\
+\hspace{+3mm}2.1  $a.sign \leftarrow MP\_ZPOS$ \\
+\hline \\
+\end{tabular}
+\end{center}
+\caption{Algorithm mp\_clamp}
+\end{figure}
+
+\textbf{Algorithm mp\_clamp.}
+As can be expected this algorithm is very simple.  The loop on step one is intended to iterate only once or twice at
+the most.  For example, for cases where there is not a carry to fill the last position.  Step two fixes the sign for 
+when all of the digits are zero to ensure that the mp\_int is valid at all times.
+
+\index{bn\_mp\_clamp.c}
+\vspace{+3mm}\begin{small}
+\hspace{-5.1mm}{\bf File}: bn\_mp\_clamp.c
+\vspace{-3mm}
+\begin{alltt}
+016   
+017   /* trim unused digits 
+018    *
+019    * This is used to ensure that leading zero digits are
+020    * trimed and the leading "used" digit will be non-zero
+021    * Typically very fast.  Also fixes the sign if there
+022    * are no more leading digits
+023    */
+024   void
+025   mp_clamp (mp_int * a)
+026   \{
+027     while (a->used > 0 && a->dp[a->used - 1] == 0) \{
+028       --(a->used);
+029     \}
+030     if (a->used == 0) \{
+031       a->sign = MP_ZPOS;
+032     \}
+033   \}
+\end{alltt}
+\end{small}
+
+Note on line 27 how the test for the \textbf{used} count is made on the left of the \&\& operator.  In the C programming
+language the terms to \&\& are evaluated left to right with a boolean short-circuit if any condition fails.  This is 
+important since if the \textbf{used} is zero the test on the right would fetch below the array.  That is obviously 
+undesirable.  The parentheses on line 28 are used to make sure the \textbf{used} count is decremented and not
+the pointer ``a''.  
+
+\section*{Exercises}
+\begin{tabular}{cl}
+$\left [ 1 \right ]$ & Discuss the relevance of the \textbf{used} member of the mp\_int structure. \\
+                     & \\
+$\left [ 1 \right ]$ & Discuss the consequences of not using padding when performing allocations.  \\
+                     & \\
+$\left [ 2 \right ]$ & Estimate an ideal value for \textbf{MP\_PREC} when performing 1024-bit RSA \\
+                     & encryption when $\beta = 2^{28}$.  \\
+                     & \\
+$\left [ 1 \right ]$ & Discuss the relevance of the algorithm mp\_clamp.  What does it prevent? \\
+                     & \\
+$\left [ 1 \right ]$ & Give an example of when the algorithm  mp\_init\_copy might be useful. \\
+                     & \\
+\end{tabular}
+
+
+\chapter{Basic Operations}
+\section{Copying an Integer}
+After the various house-keeping routines are in place, simple algorithms can be designed to take advantage of them.  Being able
+to make a verbatim copy of an integer is a very useful function to have.  To copy an integer the mp\_copy algorithm will be used.
+
+\newpage\begin{figure}[here]
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{mp\_copy}. \\
+\textbf{Input}.  An mp\_int $a$ and $b$. \\
+\textbf{Output}.  Store a copy of $a$ in $b$. \\
+\hline \\
+1.  Check if $a$ and $b$ point to the same location in memory. \\
+2.  If true then return(\textit{MP\_OKAY}). \\
+3.  If $b.alloc < a.used$ then grow $b$ to $a.used$ digits.  (\textit{hint: use mp\_grow}) \\
+4.  If failed to grow then return(\textit{MP\_MEM}). \\
+5.  for $n$ from 0 to $a.used - 1$ do \\
+\hspace{3mm}5.1  $b_{n} \leftarrow a_{n}$ \\
+6.  if $a.used < b.used$ then \\ 
+\hspace{3mm}6.1.  for $n$ from $a.used$ to $b.used - 1$ do \\
+\hspace{6mm}6.1.1  $b_{n} \leftarrow 0$ \\
+7.  $b.used \leftarrow a.used$ \\
+8.  $b.sign \leftarrow a.sign$ \\
+9.  return(\textit{MP\_OKAY}) \\
+\hline
+\end{tabular}
+\end{center}
+\caption{Algorithm mp\_copy}
+\end{figure}
+
+\textbf{Algorithm mp\_copy.}
+Step 1 and 2 make sure that the two mp\_ints are unique.  This allows the user to call the copy function with
+potentially the same input and not waste time.  Step 3 and 4 ensure that the destination is large enough to
+hold a copy of the input $a$.  Note that the \textbf{used} member of $b$ may be smaller than the \textbf{used}
+member of $a$ but a memory re-allocation is only required if the \textbf{alloc} member of $b$ is smaller.  This
+prevents trivial memory reallocations.
+
+Step 5 copies the digits from $a$ to $b$ while step 6 ensures that if initially $\vert b \vert > \vert a \vert$,
+the leading digits of $b$ will be zeroed.  Finally steps 7 and 8 copy the \textbf{used} and \textbf{sign} members over 
+which completes the copy operation.
+
+\index{bn\_mp\_copy.c}
+\vspace{+3mm}\begin{small}
+\hspace{-5.1mm}{\bf File}: bn\_mp\_copy.c
+\vspace{-3mm}
+\begin{alltt}
+016   
+017   /* copy, b = a */
+018   int
+019   mp_copy (mp_int * a, mp_int * b)
+020   \{
+021     int     res, n;
+022   
+023     /* if dst == src do nothing */
+024     if (a == b || a->dp == b->dp) \{
+025       return MP_OKAY;
+026     \}
+027   
+028     /* grow dest */
+029     if ((res = mp_grow (b, a->used)) != MP_OKAY) \{
+030       return res;
+031     \}
+032   
+033     /* zero b and copy the parameters over */
+034     \{
+035       register mp_digit *tmpa, *tmpb;
+036   
+037       /* pointer aliases */
+038       tmpa = a->dp;
+039       tmpb = b->dp;
+040   
+041       /* copy all the digits */
+042       for (n = 0; n < a->used; n++) \{
+043         *tmpb++ = *tmpa++;
+044       \}
+045   
+046       /* clear high digits */
+047       for (; n < b->used; n++) \{
+048         *tmpb++ = 0;
+049       \}
+050     \}
+051     b->used = a->used;
+052     b->sign = a->sign;
+053     return MP_OKAY;
+054   \}
+\end{alltt}
+\end{small}
+
+Source lines 23-31 do the initial house keeping.  That is to see if the input is unique and if so to 
+make sure there is enough room.  If not enough space is available it returns the error and leaves the destination variable
+intact.
+
+The inner loop of the copy operation is contained between lines 34 and 50.  Many LibTomMath routines are designed with this source code style
+in mind, making aliases to shorten lengthy pointers (\textit{see line 38 and 39}) for rapid use.  Also the
+use of nested braces creates a simple way to denote various portions of code that reside on various work levels.  Here, the copy loop is at the 
+$O(n)$ level.  
+
+\section{Zeroing an Integer}
+Resetting an mp\_int to the default state is a common step in many algorithms.  The mp\_zero algorithm will be the algorithm used to
+perform this task.
+
+\begin{figure}[here]
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{mp\_zero}. \\
+\textbf{Input}.   An mp\_int $a$ \\
+\textbf{Output}.  Zero the contents of $a$ \\
+\hline \\
+1.  $a.used \leftarrow 0$ \\
+2.  $a.sign \leftarrow$ MP\_ZPOS \\
+3.  for $n$ from 0 to $a.alloc - 1$ do \\
+\hspace{3mm}3.1  $a_n \leftarrow 0$ \\
+\hline
+\end{tabular}
+\end{center}
+\caption{Algorithm mp\_zero}
+\end{figure}
+
+\textbf{Algorithm mp\_zero.}
+This algorithm simply resets a mp\_int to the default state.  
+
+\index{bn\_mp\_zero.c}
+\vspace{+3mm}\begin{small}
+\hspace{-5.1mm}{\bf File}: bn\_mp\_zero.c
+\vspace{-3mm}
+\begin{alltt}
+016   
+017   /* set to zero */
+018   void
+019   mp_zero (mp_int * a)
+020   \{
+021     a->sign = MP_ZPOS;
+022     a->used = 0;
+023     memset (a->dp, 0, sizeof (mp_digit) * a->alloc);
+024   \}
+\end{alltt}
+\end{small}
+
+After the function is completed, all of the digits are zeroed, the \textbf{used} count is zeroed and the 
+\textbf{sign} variable is set to \textbf{MP\_ZPOS}.
+
+\section{Sign Manipulation}
+\subsection{Absolute Value}
+With the mp\_int representation of an integer, calculating the absolute value is trivial.  The mp\_abs algorithm will compute
+the absolute value of an mp\_int.
+
+\begin{figure}[here]
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{mp\_abs}. \\
+\textbf{Input}.   An mp\_int $a$ \\
+\textbf{Output}.  Computes $b = \vert a \vert$ \\
+\hline \\
+1.  Copy $a$ to $b$.  (\textit{hint: use mp\_copy}) \\
+2.  If the copy failed return(\textit{MP\_MEM}). \\
+3.  $b.sign \leftarrow MP\_ZPOS$ \\
+4.  Return(\textit{MP\_OKAY}) \\
+\hline
+\end{tabular}
+\end{center}
+\caption{Algorithm mp\_abs}
+\end{figure}
+
+\textbf{Algorithm mp\_abs.}
+This algorithm computes the absolute value of an mp\_int input.  As can be expected the algorithm is very trivial.
+
+\index{bn\_mp\_abs.c}
+\vspace{+3mm}\begin{small}
+\hspace{-5.1mm}{\bf File}: bn\_mp\_abs.c
+\vspace{-3mm}
+\begin{alltt}
+016   
+017   /* b = |a| 
+018    *
+019    * Simple function copies the input and fixes the sign to positive
+020    */
+021   int
+022   mp_abs (mp_int * a, mp_int * b)
+023   \{
+024     int     res;
+025     if ((res = mp_copy (a, b)) != MP_OKAY) \{
+026       return res;
+027     \}
+028     b->sign = MP_ZPOS;
+029     return MP_OKAY;
+030   \}
+\end{alltt}
+\end{small}
+
+\subsection{Integer Negation}
+With the mp\_int representation of an integer, calculating the negation is also trivial.  The mp\_neg algorithm will compute
+the negative of an mp\_int input.
+
+\newpage\begin{figure}[here]
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{mp\_neg}. \\
+\textbf{Input}.   An mp\_int $a$ \\
+\textbf{Output}.  Computes $b = -a$ \\
+\hline \\
+1.  Copy $a$ to $b$.  (\textit{hint: use mp\_copy}) \\
+2.  If the copy failed return(\textit{MP\_MEM}). \\
+3.  If $a.sign = MP\_ZPOS$ then do \\
+\hspace{3mm}3.1  $b.sign = MP\_NEG$. \\
+4.  else do \\
+\hspace{3mm}4.1  $b.sign = MP\_ZPOS$. \\
+5.  Return(\textit{MP\_OKAY}) \\
+\hline
+\end{tabular}
+\end{center}
+\caption{Algorithm mp\_neg}
+\end{figure}
+
+\textbf{Algorithm mp\_neg.}
+This algorithm computes the negation of an input.  
+
+\index{bn\_mp\_neg.c}
+\vspace{+3mm}\begin{small}
+\hspace{-5.1mm}{\bf File}: bn\_mp\_neg.c
+\vspace{-3mm}
+\begin{alltt}
+016   
+017   /* b = -a */
+018   int
+019   mp_neg (mp_int * a, mp_int * b)
+020   \{
+021     int     res;
+022     if ((res = mp_copy (a, b)) != MP_OKAY) \{
+023       return res;
+024     \}
+025     b->sign = (a->sign == MP_ZPOS) ? MP_NEG : MP_ZPOS;
+026     return MP_OKAY;
+027   \}
+\end{alltt}
+\end{small}
+
+\section{Small Constants}
+\subsection{Setting Small Constants}
+Often a mp\_int must be set to a relatively small value such as $1$ or $2$.  For these cases the mp\_set algorithm is useful.
+
+\newpage\begin{figure}
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{mp\_set}. \\
+\textbf{Input}.   An mp\_int $a$ and a digit $b$ \\
+\textbf{Output}.  Make $a$ equivalent to $b$ \\
+\hline \\
+1.  Zero $a$ (\textit{hint: use mp\_zero}). \\
+2.  $a_0 \leftarrow b \mbox{ (mod }\beta\mbox{)}$ \\
+3.  $a.used \leftarrow  \left \lbrace \begin{array}{ll}
+                              1 &  \mbox{if }a_0 > 0 \\
+                              0 &  \mbox{if }a_0 = 0 
+                              \end{array} \right .$ \\
+\hline                              
+\end{tabular}
+\end{center}
+\caption{Algorithm mp\_set}
+\end{figure}
+
+\textbf{Algorithm mp\_set.}
+This algorithm sets a mp\_int to a small single digit value.  Step number 1 ensures that the integer is reset to the default state.  The
+single digit is set (\textit{modulo $\beta$}) and the \textbf{used} count is adjusted accordingly.
+
+\index{bn\_mp\_set.c}
+\vspace{+3mm}\begin{small}
+\hspace{-5.1mm}{\bf File}: bn\_mp\_set.c
+\vspace{-3mm}
+\begin{alltt}
+016   
+017   /* set to a digit */
+018   void
+019   mp_set (mp_int * a, mp_digit b)
+020   \{
+021     mp_zero (a);
+022     a->dp[0] = b & MP_MASK;
+023     a->used = (a->dp[0] != 0) ? 1 : 0;
+024   \}
+\end{alltt}
+\end{small}
+
+Line 21 calls mp\_zero() to clear the mp\_int and reset the sign.  Line 22 actually copies digit 
+into the least significant location.  Note the usage of a new constant \textbf{MP\_MASK}.  This constant is used to quickly
+reduce an integer modulo $\beta$.  Since $\beta = 2^k$ it suffices to perform a binary AND with $MP\_MASK = 2^k - 1$ to perform
+the reduction.  Finally line 23 will set the \textbf{used} member with respect to the digit actually set. This function 
+will always make the integer positive.
+
+One important limitation of this function is that it will only set one digit.  The size of a digit is not fixed, meaning source that uses 
+this function should take that into account.  The define \textbf{DIGIT\_BIT} in ``tommath.h'' 
+defines how many bits per digit are available.  Generally at least seven bits are guaranteed to be available per 
+digit.  This means that trivially small constants can be set using this function.
+
+\subsection{Setting Large Constants}
+To overcome the limitations of the mp\_set algorithm the mp\_set\_int algorithm is provided.  It accepts a ``long''
+data type as input and will always treat it as a 32-bit integer.
+
+\begin{figure}[here]
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{mp\_set\_int}. \\
+\textbf{Input}.   An mp\_int $a$ and a ``long'' integer $b$ \\
+\textbf{Output}.  Make $a$ equivalent to $b$ \\
+\hline \\
+1.  Zero $a$ (\textit{hint: use mp\_zero}) \\
+2.  for $n$ from 0 to 7 do \\
+\hspace{3mm}2.1  $a \leftarrow a \cdot 16$ (\textit{hint: use mp\_mul2d}) \\
+\hspace{3mm}2.2  $u \leftarrow \lfloor b / 2^{4(7 - n)} \rfloor \mbox{ (mod }16\mbox{)}$\\
+\hspace{3mm}2.3  $a_0 \leftarrow a_0 + u$ \\
+\hspace{3mm}2.4  $a.used \leftarrow a.used + \lfloor 32 / lg(\beta) \rfloor + 1$ \\
+3.  Clamp excess used digits (\textit{hint: use mp\_clamp}) \\
+\hline
+\end{tabular}
+\end{center}
+\caption{Algorithm mp\_set\_int}
+\end{figure}
+
+\textbf{Algorithm mp\_set\_int.}
+The algorithm performs eight iterations of a simple loop where in each iteration four bits from the source are added to the 
+mp\_int.  Step 2.1 will multiply the current result by sixteen making room for four more bits.  In step 2.2 the
+next four bits from the source are extracted.  The four bits are added to the mp\_int and the \textbf{used} digit count is 
+incremented.  The \textbf{used} digit counter is incremented since if any of the leading digits were zero the mp\_int would have
+zero digits used and the newly added four bits would be ignored.
+
+Excess zero digits are trimmed in steps 2.1 and 3 by using higher level algorithms mp\_mul2d and mp\_clamp.
+
+\index{bn\_mp\_set\_int.c}
+\vspace{+3mm}\begin{small}
+\hspace{-5.1mm}{\bf File}: bn\_mp\_set\_int.c
+\vspace{-3mm}
+\begin{alltt}
+016   
+017   /* set a 32-bit const */
+018   int
+019   mp_set_int (mp_int * a, unsigned int b)
+020   \{
+021     int     x, res;
+022   
+023     mp_zero (a);
+024     /* set four bits at a time */
+025     for (x = 0; x < 8; x++) \{
+026       /* shift the number up four bits */
+027       if ((res = mp_mul_2d (a, 4, a)) != MP_OKAY) \{
+028         return res;
+029       \}
+030   
+031       /* OR in the top four bits of the source */
+032       a->dp[0] |= (b >> 28) & 15;
+033   
+034       /* shift the source up to the next four bits */
+035       b <<= 4;
+036   
+037       /* ensure that digits are not clamped off */
+038       a->used += 32 / DIGIT_BIT + 2;
+039     \}
+040     mp_clamp (a);
+041     return MP_OKAY;
+042   \}
+\end{alltt}
+\end{small}
+
+This function sets four bits of the number at a time to handle all practical \textbf{DIGIT\_BIT} sizes.  The weird
+addition on line 38 ensures that the newly added in bits are added to the number of digits.  While it may not 
+seem obvious as to why the digit counter does not grow exceedingly large it is because of the shift on line 27 
+as well as the  call to mp\_clamp() on line 40.  Both functions will clamp excess leading digits which keeps 
+the number of used digits low.
+
+\section{Comparisons}
+\subsection{Unsigned Comparisons}
+Comparing a multiple precision integer is performed with the exact same algorithm used to compare two decimal numbers.  For example,
+to compare $1,234$ to $1,264$ the digits are extracted by their positions.  That is we compare $1 \cdot 10^3 + 2 \cdot 10^2 + 3 \cdot 10^1 + 4 \cdot 10^0$
+to $1 \cdot 10^3 + 2 \cdot 10^2 + 6 \cdot 10^1 + 4 \cdot 10^0$ by comparing single digits at a time starting with the highest magnitude 
+positions.  If any leading digit of one integer is greater than a digit in the same position of another integer then obviously it must be greater.  
+
+The first comparison routine that will be developed is the unsigned magnitude compare which will perform a comparison based on the digits of two
+mp\_int variables alone.  It will ignore the sign of the two inputs.  Such a function is useful when an absolute comparison is required or if the 
+signs are known to agree in advance.
+
+To facilitate working with the results of the comparison functions three constants are required.  
+
+\begin{figure}[here]
+\begin{center}
+\begin{tabular}{|r|l|}
+\hline \textbf{Constant} & \textbf{Meaning} \\
+\hline \textbf{MP\_GT} & Greater Than \\
+\hline \textbf{MP\_EQ} & Equal To \\
+\hline \textbf{MP\_LT} & Less Than \\
+\hline
+\end{tabular}
+\end{center}
+\caption{Comparison Return Codes}
+\end{figure}
+
+\begin{figure}[here]
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{mp\_cmp\_mag}. \\
+\textbf{Input}.   Two mp\_ints $a$ and $b$.  \\
+\textbf{Output}.  Unsigned comparison results ($a$ to the left of $b$). \\
+\hline \\
+1.  If $a.used > b.used$ then return(\textit{MP\_GT}) \\
+2.  If $a.used < b.used$ then return(\textit{MP\_LT}) \\
+3.  for n from $a.used - 1$ to 0 do \\
+\hspace{+3mm}3.1  if $a_n > b_n$ then return(\textit{MP\_GT}) \\
+\hspace{+3mm}3.2  if $a_n < b_n$ then return(\textit{MP\_LT}) \\
+4.  Return(\textit{MP\_EQ}) \\
+\hline
+\end{tabular}
+\end{center}
+\caption{Algorithm mp\_cmp\_mag}
+\end{figure}
+
+\textbf{Algorithm mp\_cmp\_mag.}
+By saying ``$a$ to the left of $b$'' it is meant that the comparison is with respect to $a$, that is if $a$ is greater than $b$ it will return
+\textbf{MP\_GT} and similar with respect to when $a = b$ and $a < b$.  The first two steps compare the number of digits used in both $a$ and $b$.  
+Obviously if the digit counts differ there would be an imaginary zero digit in the smaller number where the leading digit of the larger number is.  
+If both have the same number of digits then the actual digits themselves must be compared starting at the leading digit.  
+
+By step three both inputs must have the same number of digits so it's safe to start from either $a.used - 1$ or $b.used - 1$ and count down to
+the zero'th digit.  If after all of the digits have been compared and no difference found the algorithm simply returns \textbf{MP\_EQ}.
+
+\index{bn\_mp\_cmp\_mag.c}
+\vspace{+3mm}\begin{small}
+\hspace{-5.1mm}{\bf File}: bn\_mp\_cmp\_mag.c
+\vspace{-3mm}
+\begin{alltt}
+016   
+017   /* compare maginitude of two ints (unsigned) */
+018   int
+019   mp_cmp_mag (mp_int * a, mp_int * b)
+020   \{
+021     int     n;
+022   
+023     /* compare based on # of non-zero digits */
+024     if (a->used > b->used) \{
+025       return MP_GT;
+026     \} 
+027     
+028     if (a->used < b->used) \{
+029       return MP_LT;
+030     \}
+031   
+032     /* compare based on digits  */
+033     for (n = a->used - 1; n >= 0; n--) \{
+034       if (a->dp[n] > b->dp[n]) \{
+035         return MP_GT;
+036       \} 
+037       
+038       if (a->dp[n] < b->dp[n]) \{
+039         return MP_LT;
+040       \}
+041     \}
+042     return MP_EQ;
+043   \}
+\end{alltt}
+\end{small}
+
+The two if statements on lines 24 and 28 compare the number of digits in the two inputs.  These two are performed before all of the digits
+are compared since it is a very cheap test to perform and can potentially save considerable time.  The implementation given is also not valid 
+without those two statements.  $b.alloc$ may be smaller than $a.used$, meaning that undefined values will be read from $b$ past the end of the 
+array of digits.
+
+\subsection{Signed Comparisons}
+Comparing with sign considerations is also fairly critical in several routines (\textit{division for example}).  Based on an unsigned magnitude 
+comparison a trivial signed comparison algorithm can be written.
+
+\newpage\begin{figure}[here]
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{mp\_cmp}. \\
+\textbf{Input}.   Two mp\_ints $a$ and $b$ \\
+\textbf{Output}.  Signed Comparison Results ($a$ to the left of $b$) \\
+\hline \\
+1.  if $a.sign = MP\_NEG$ and $b.sign = MP\_ZPOS$ then return(\textit{MP\_LT}) \\
+2.  if $a.sign = MP\_ZPOS$ and $b.sign = MP\_NEG$ then return(\textit{MP\_GT}) \\
+3.  if $a.sign = MP\_NEG$ then \\
+\hspace{+3mm}3.1  Return the unsigned comparison of $b$ and $a$ (\textit{hint: use mp\_cmp\_mag}) \\
+4.  Otherwise \\
+\hspace{+3mm}4.1  Return the unsigned comparison of $a$ and $b$ \\
+\hline
+\end{tabular}
+\end{center}
+\caption{Algorithm mp\_cmp}
+\end{figure}
+
+\textbf{Algorithm mp\_cmp.}
+The first two steps compare the signs of the two inputs.  If the signs do not agree then it can return right away with the appropriate 
+comparison code.  When the signs are equal the digits of the inputs must be compared to determine the correct result.  In step 
+three the unsigned comparison flips the order of the arguments since they are both negative.  For instance, if $-a > -b$ then 
+$\vert a \vert < \vert b \vert$.  Step number four will compare the two when they are both positive.
+
+\index{bn\_mp\_cmp.c}
+\vspace{+3mm}\begin{small}
+\hspace{-5.1mm}{\bf File}: bn\_mp\_cmp.c
+\vspace{-3mm}
+\begin{alltt}
+016   
+017   /* compare two ints (signed)*/
+018   int
+019   mp_cmp (mp_int * a, mp_int * b)
+020   \{
+021     /* compare based on sign */
+022     if (a->sign == MP_NEG && b->sign == MP_ZPOS) \{
+023       return MP_LT;
+024     \} 
+025     
+026     if (a->sign == MP_ZPOS && b->sign == MP_NEG) \{
+027       return MP_GT;
+028     \}
+029     
+030     /* compare digits */
+031     if (a->sign == MP_NEG) \{
+032        /* if negative compare opposite direction */
+033        return mp_cmp_mag(b, a);
+034     \} else \{
+035        return mp_cmp_mag(a, b);
+036     \}
+037   \}
+\end{alltt}
+\end{small}
+
+The two if statements on lines 22 and 26 perform the initial sign comparison.  If the signs are not equal then whichever
+has the positive sign is larger.   At line 31, the inputs are compared based on magnitudes.  If the signs were both negative then 
+the unsigned comparison is performed in the opposite direction (\textit{line 33}).  Otherwise, the signs are assumed to 
+be both positive and a forward direction unsigned comparison is performed.
+
+\section*{Exercises}
+\begin{tabular}{cl}
+$\left [ 2 \right ]$ & Modify algorithm mp\_set\_int to accept as input a variable length array of bits. \\
+                     & \\
+$\left [ 3 \right ]$ & Give the probability that algorithm mp\_cmp\_mag will have to compare $k$ digits  \\
+                     & of two random digits (of equal magnitude) before a difference is found. \\
+                     & \\
+$\left [ 1 \right ]$ & Suggest a simple method to speed up the implementation of mp\_cmp\_mag based  \\
+                     & on the observations made in the previous problem. \\
+                     &
+\end{tabular}
+
+\chapter{Basic Arithmetic}
+\section{Building Blocks}
+At this point algorithms for initialization, de-initialization, zeroing, copying, comparing and setting small constants have been 
+established.  The next logical set of algorithms to develop are the addition, subtraction and digit movement algorithms.  These 
+algorithms make use of the lower level algorithms and are the crucial building block for the multipliers.  It is very important that these 
+algorithms are highly optimized.  On their own they are simple $O(n)$ algorithms but they can be called from higher level algorithms 
+which easily places them at $O(n^2)$ or even $O(n^3)$ work levels.  
+
+All nine algorithms within this chapter make use of the logical bit shift operations denoted by $<<$ and $>>$ for left and right 
+logical shifts respectively.  A logical shift is analogous to sliding the decimal point of radix-10 representations.  For example, the real 
+number $0.9345$ is equivalent to $93.45\%$ which is found by sliding the decimal point two places to the right (\textit{multiplying by $10^2$}).  
+Mathematically a logical shift is equivalent to a division or multiplication by a power of two.  
+For example, $a << k = a \cdot 2^k$ while $a >> k = \lfloor a/2^k \rfloor$.
+
+One significant difference between a logical shift and the way decimals are shifted is that digits below the zero'th position are removed
+from the number.  For example, consider $1101_2 >> 1$; using decimal notation this would produce $110.1_2$.  However, with a logical shift the 
+result is $110_2$.  
+
+\section{Addition and Subtraction}
+In normal fixed precision arithmetic negative numbers are easily represented by subtraction from the modulus.  For example, with 32-bit integers
+$a - b\mbox{ (mod }2^{32}\mbox{)}$ is the same as $a + (2^{32} - b) \mbox{ (mod }2^{32}\mbox{)}$  since $2^{32} \equiv 0 \mbox{ (mod }2^{32}\mbox{)}$.  
+As a result subtraction can be performed with a trivial series of logical operations and an addition.
+
+However, in multiple precision arithmetic negative numbers are not represented in the same way.  Instead a sign flag is used to keep track of the
+sign of the integer.  As a result signed addition and subtraction are actually implemented as conditional usage of lower level addition or 
+subtraction algorithms with the sign fixed up appropriately.
+
+The lower level algorithms will add or subtract integers without regard to the sign flag.  That is they will add or subtract the magnitude of
+the integers respectively.
+
+\subsection{Low Level Addition}
+An unsigned addition of multiple precision integers is performed with the same long-hand algorithm used to add decimal numbers.  That is to add the 
+trailing digits first and propagate the resulting carry upwards.  Since this is a lower level algorithm the name will have a ``s\_'' prefix.  
+Historically that convention stems from the MPI library where ``s\_'' stood for static functions that were hidden from the developer entirely.
+
+\newpage
+\begin{figure}[!here]
+\begin{center}
+\begin{small}
+\begin{tabular}{l}
+\hline Algorithm \textbf{s\_mp\_add}. \\
+\textbf{Input}.   Two mp\_ints $a$ and $b$ \\
+\textbf{Output}.  The unsigned addition $c = \vert a \vert + \vert b \vert$. \\
+\hline \\
+1.  if $a.used > b.used$ then \\
+\hspace{+3mm}1.1  $min \leftarrow b.used$ \\
+\hspace{+3mm}1.2  $max \leftarrow a.used$ \\
+\hspace{+3mm}1.3  $x   \leftarrow a$ \\
+2.  else  \\
+\hspace{+3mm}2.1  $min \leftarrow a.used$ \\
+\hspace{+3mm}2.2  $max \leftarrow b.used$ \\
+\hspace{+3mm}2.3  $x   \leftarrow b$ \\
+3.  If $c.alloc < max + 1$ then grow $c$ to hold at least $max + 1$ digits (\textit{hint: use mp\_grow}) \\
+4.  If failed to grow $c$ return(\textit{MP\_MEM}) \\
+5.  $oldused \leftarrow c.used$ \\
+6.  $c.used \leftarrow max + 1$ \\
+7.  $u \leftarrow 0$ \\
+8.  for $n$ from $0$ to $min - 1$ do \\
+\hspace{+3mm}8.1  $c_n \leftarrow a_n + b_n + u$ \\
+\hspace{+3mm}8.2  $u \leftarrow c_n >> lg(\beta)$ \\
+\hspace{+3mm}8.3  $c_n \leftarrow c_n \mbox{ (mod }\beta\mbox{)}$ \\
+9.  if $min \ne max$ then do \\
+\hspace{+3mm}9.1  for $n$ from $min$ to $max - 1$ do \\
+\hspace{+6mm}9.1.1  $c_n \leftarrow x_n + u$ \\
+\hspace{+6mm}9.1.2  $u \leftarrow c_n >> lg(\beta)$ \\
+\hspace{+6mm}9.1.3  $c_n \leftarrow c_n \mbox{ (mod }\beta\mbox{)}$ \\
+10.  $c_{max} \leftarrow u$ \\
+11.  if $oldused > max$ then \\
+\hspace{+3mm}11.1  for $n$ from $max + 1$ to $oldused - 1$ do \\
+\hspace{+6mm}11.1.1  $c_n \leftarrow 0$ \\
+12.  Clamp excess digits in $c$.  (\textit{hint: use mp\_clamp}) \\
+13.  Return(\textit{MP\_OKAY}) \\
+\hline
+\end{tabular}
+\end{small}
+\end{center}
+\caption{Algorithm s\_mp\_add}
+\end{figure}
+
+\textbf{Algorithm s\_mp\_add.}
+This algorithm is loosely based on algorithm 14.7 of \cite[pp. 594]{HAC} but has been extended to allow the inputs to have different magnitudes.  
+Coincidentally the description of algorithm A in \cite[pp. 266]{TAOCPV2} shares the same flaw as that from \cite{HAC}.  Even the MIX pseudo 
+machine code presented  \cite[pp. 266-267]{TAOCPV2} is incapable of handling inputs which are of different magnitudes.
+
+Steps 1 and 2 will sort the two inputs based on their \textbf{used} digit count.  This allows the inputs to have varying magnitudes which not 
+only makes it more efficient than the trivial algorithm presented in the other references but more flexible.  The variable $min$ is given the lowest 
+digit count while $max$ is given the highest digit count.  If both inputs have the same \textbf{used} digit count both $min$ and $max$ are 
+set to the same.  The variable $x$ is an \textit{alias} for the largest input and not meant to be a copy of it.  After the inputs are sorted steps 
+3 and 4 will ensure that the destination $c$ can accommodate the result.  The old \textbf{used} count from $c$ is copied to $oldused$ and the 
+new count is set to $max + 1$.  
+
+At step 7 the carry variable $u$ is set to zero and the first leg of the addition loop can begin.  The first step of the loop (\textit{8.1}) adds
+digits from the two inputs together along with the carry variable $u$.  The following step extracts the carry bit by shifting the result of the
+preceding step right $lg(\beta)$ positions.  The shift to extract the carry is similar to how carry extraction works with decimal addition.
+
+Consider adding $77$ to $65$, the first addition of the first column is $7 + 5$ which produces the result $12$.  The trailing digit of the result
+is $2 \equiv 12 \mbox{ (mod }10\mbox{)}$ and the carry is found by dividing (\textit{and ignoring the remainder}) $12$ by the radix or in this case $10$.  The
+division and multiplication of $10$ is simply a logical shift right or left respectively of the digits.  In other words the carry can be extracted
+by shifting one digit to the right.
+
+Note that $lg()$ is simply the base two logarithm such that $lg(2^k) = k$.  This implies that $lg(\beta)$ is the number of bits in a radix-$\beta$ 
+digit.  Therefore, a logical shift right of the single digit by $lg(\beta)$ will extract the carry.  The final step of the  loop reduces the digit 
+modulo the radix $\beta$ to ensure it is in range.
+
+After step 8 the smallest input (\textit{or both if they are the same magnitude}) has been exhausted.  Step 9 decides whether
+the inputs were of equal magnitude.  If not then another loop similar to that in step 8 must be executed.  The loop at step
+number 9.1 differs from the previous loop since it only adds the mp\_int $x$ along with the carry.  
+
+Step 10 finishes the addition phase by copying the final carry to the highest location in the result $c_{max}$.  Step 11 ensures that 
+leading digits that were originally present in $c$ are cleared.  Finally excess leading digits are clamped and the algorithm returns success.
+
+\index{bn\_s\_mp\_add.c}
+\vspace{+3mm}\begin{small}
+\hspace{-5.1mm}{\bf File}: bn\_s\_mp\_add.c
+\vspace{-3mm}
+\begin{alltt}
+016   
+017   /* low level addition, based on HAC pp.594, Algorithm 14.7 */
+018   int
+019   s_mp_add (mp_int * a, mp_int * b, mp_int * c)
+020   \{
+021     mp_int *x;
+022     int     olduse, res, min, max;
+023   
+024     /* find sizes, we let |a| <= |b| which means we have to sort
+025      * them.  "x" will point to the input with the most digits
+026      */
+027     if (a->used > b->used) \{
+028       min = b->used;
+029       max = a->used;
+030       x = a;
+031     \} else \{
+032       min = a->used;
+033       max = b->used;
+034       x = b;
+035     \}
+036   
+037     /* init result */
+038     if (c->alloc < max + 1) \{
+039       if ((res = mp_grow (c, max + 1)) != MP_OKAY) \{
+040         return res;
+041       \}
+042     \}
+043   
+044     /* get old used digit count and set new one */
+045     olduse = c->used;
+046     c->used = max + 1;
+047   
+048     /* set the carry to zero */
+049     \{
+050       register mp_digit u, *tmpa, *tmpb, *tmpc;
+051       register int i;
+052   
+053       /* alias for digit pointers */
+054   
+055       /* first input */
+056       tmpa = a->dp;
+057   
+058       /* second input */
+059       tmpb = b->dp;
+060   
+061       /* destination */
+062       tmpc = c->dp;
+063   
+064       /* zero the carry */
+065       u = 0;
+066       for (i = 0; i < min; i++) \{
+067         /* Compute the sum at one digit, T[i] = A[i] + B[i] + U */
+068         *tmpc = *tmpa++ + *tmpb++ + u;
+069   
+070         /* U = carry bit of T[i] */
+071         u = *tmpc >> ((mp_digit)DIGIT_BIT);
+072   
+073         /* take away carry bit from T[i] */
+074         *tmpc++ &= MP_MASK;
+075       \}
+076   
+077       /* now copy higher words if any, that is in A+B 
+078        * if A or B has more digits add those in 
+079        */
+080       if (min != max) \{
+081         for (; i < max; i++) \{
+082           /* T[i] = X[i] + U */
+083           *tmpc = x->dp[i] + u;
+084   
+085           /* U = carry bit of T[i] */
+086           u = *tmpc >> ((mp_digit)DIGIT_BIT);
+087   
+088           /* take away carry bit from T[i] */
+089           *tmpc++ &= MP_MASK;
+090         \}
+091       \}
+092   
+093       /* add carry */
+094       *tmpc++ = u;
+095   
+096       /* clear digits above oldused */
+097       for (i = c->used; i < olduse; i++) \{
+098         *tmpc++ = 0;
+099       \}
+100     \}
+101   
+102     mp_clamp (c);
+103     return MP_OKAY;
+104   \}
+\end{alltt}
+\end{small}
+
+Lines 27 to 35 perform the initial sorting of the inputs and determine the $min$ and $max$ variables.  Note that $x$ is a pointer to a 
+mp\_int assigned to the largest input, in effect it is a local alias.  Lines 37 to 42 ensure that the destination is grown to 
+accommodate the result of the addition. 
+
+Similar to the implementation of mp\_copy this function uses the braced code and local aliases coding style.  The three aliases on 
+lines 56, 59 and 62 are for the two inputs and destination respectively.  These aliases are used to ensure the
+compiler does not have to dereference $a$, $b$ or $c$ (respectively) to access the digits of the respective mp\_int.
+
+The initial carry $u$ is cleared on line 65, note that $u$ is of type mp\_digit which ensures type compatibility within the 
+implementation.  The initial addition loop begins on line 66 and ends on line 75.  Similarly the conditional addition loop
+begins on line 81 and ends on line 90.  The addition is finished with the final carry being stored in $tmpc$ on line 94.  
+Note the ``++'' operator on the same line.  After line 94 $tmpc$ will point to the $c.used$'th digit of the mp\_int $c$.  This is useful
+for the next loop on lines 97 to 99 which set any old upper digits to zero.
+
+\subsection{Low Level Subtraction}
+The low level unsigned subtraction algorithm is very similar to the low level unsigned addition algorithm.  The principal difference is that the
+unsigned subtraction algorithm requires the result to be positive.  That is when computing $a - b$ the condition $\vert a \vert \ge \vert b\vert$ must 
+be met for this algorithm to function properly.  Keep in mind this low level algorithm is not meant to be used in higher level algorithms directly.  
+This algorithm as will be shown can be used to create functional signed addition and subtraction algorithms.
+
+
+For this algorithm a new variable is required to make the description simpler.  Recall from section 1.3.1 that a mp\_digit must be able to represent
+the range $0 \le x < 2\beta$.  It is allowable that a mp\_digit represent a larger range of values.  For this algorithm we will assume that
+the variable $\gamma$ represents the number of bits available in a mp\_digit (\textit{this implies $2^{\gamma} > \beta$}).
+
+\newpage\begin{figure}[!here]
+\begin{center}
+\begin{small}
+\begin{tabular}{l}
+\hline Algorithm \textbf{s\_mp\_sub}. \\
+\textbf{Input}.   Two mp\_ints $a$ and $b$ ($\vert a \vert \ge \vert b \vert$) \\
+\textbf{Output}.  The unsigned subtraction $c = \vert a \vert - \vert b \vert$. \\
+\hline \\
+1.  $min \leftarrow b.used$ \\
+2.  $max \leftarrow a.used$ \\
+3.  If $c.alloc < max$ then grow $c$ to hold at least $max$ digits.  (\textit{hint: use mp\_grow}) \\
+4.  If the reallocation failed return(\textit{MP\_MEM}). \\
+5.  $oldused \leftarrow c.used$ \\ 
+6.  $c.used \leftarrow max$ \\
+7.  $u \leftarrow 0$ \\
+8.  for $n$ from $0$ to $min - 1$ do \\
+\hspace{3mm}8.1  $c_n \leftarrow a_n - b_n - u$ \\
+\hspace{3mm}8.2  $u   \leftarrow c_n >> (\gamma - 1)$ \\
+\hspace{3mm}8.3  $c_n \leftarrow c_n \mbox{ (mod }\beta\mbox{)}$ \\
+9.  if $min < max$ then do \\
+\hspace{3mm}9.1  for $n$ from $min$ to $max - 1$ do \\
+\hspace{6mm}9.1.1  $c_n \leftarrow a_n - u$ \\
+\hspace{6mm}9.1.2  $u   \leftarrow c_n >> (\gamma - 1)$ \\
+\hspace{6mm}9.1.3  $c_n \leftarrow c_n \mbox{ (mod }\beta\mbox{)}$ \\
+10. if $oldused > max$ then do \\
+\hspace{3mm}10.1  for $n$ from $max$ to $oldused - 1$ do \\
+\hspace{6mm}10.1.1  $c_n \leftarrow 0$ \\
+11. Clamp excess digits of $c$.  (\textit{hint: use mp\_clamp}). \\
+12. Return(\textit{MP\_OKAY}). \\
+\hline
+\end{tabular}
+\end{small}
+\end{center}
+\caption{Algorithm s\_mp\_sub}
+\end{figure}
+
+\textbf{Algorithm s\_mp\_sub.}
+This algorithm performs the unsigned subtraction of two mp\_int variables under the restriction that the result must be positive.  That is when
+passing variables $a$ and $b$ the condition that $\vert a \vert \ge \vert b \vert$ must be met for the algorithm to function correctly.  This
+algorithm is loosely based on algorithm 14.9 \cite[pp. 595]{HAC} and is similar to algorithm S in \cite[pp. 267]{TAOCPV2} as well.  As was the case
+of the algorithm s\_mp\_add both other references lack discussion concerning various practical details such as when the inputs differ in magnitude.
+
+The initial sorting of the inputs is trivial in this algorithm since $a$ is guaranteed to have at least the same magnitude as $b$.  Steps 1 and 2 
+set the $min$ and $max$ variables.  Unlike the addition routine there is guaranteed to be no carry which means that the final result can be at 
+most $max$ digits in length as opposed to $max + 1$.  Similar to the addition algorithm the \textbf{used} count of $c$ is copied locally and 
+set to the maximal count for the operation.
+
+The subtraction loop that begins on step 8 is essentially the same as the addition loop of algorithm s\_mp\_add except single precision 
+subtraction is used instead.  Note the use of the $\gamma$ variable to extract the carry within the subtraction loops.  Under the assumption
+that two's complement single precision arithmetic is used this will successfully extract the carry.  
+
+For example, consider subtracting $0101_2$ from
+$0100_2$ where $\gamma = 4$.  The least significant bit will force a carry upwards to the third bit which will be set to zero after the borrow.  After
+the very first bit has been subtracted $4 - 1 \equiv 0011_2$ will remain.  When the third bit of $0101_2$ is subtracted from the result it will cause
+another carry.  In this case though the carry will be forced to propagate all the way to the most significant bit.  
+
+Recall that $\beta < 2^{\gamma}$.  This means that if a carry does occur it will propagate all the way to the most significant bit.  Therefore a single
+logical shift right by $\gamma - 1$ positions is sufficient to extract the carry.  This method of carry extraction may seem awkward but the reason for 
+it becomes apparent when the implementation is discussed.  
+
+If $b$ has a smaller magnitude than $a$ then step 9 will force the carry and copy operation to propagate through the larger input $a$ into $c$.  Step
+10 will ensure that any leading digits of $c$ above the $max$'th position are zeroed.
+
+\index{bn\_s\_mp\_sub.c}
+\vspace{+3mm}\begin{small}
+\hspace{-5.1mm}{\bf File}: bn\_s\_mp\_sub.c
+\vspace{-3mm}
+\begin{alltt}
+016   
+017   /* low level subtraction (assumes |a| > |b|), HAC pp.595 Algorithm 14.9 */
+018   int
+019   s_mp_sub (mp_int * a, mp_int * b, mp_int * c)
+020   \{
+021     int     olduse, res, min, max;
+022   
+023     /* find sizes */
+024     min = b->used;
+025     max = a->used;
+026   
+027     /* init result */
+028     if (c->alloc < max) \{
+029       if ((res = mp_grow (c, max)) != MP_OKAY) \{
+030         return res;
+031       \}
+032     \}
+033     olduse = c->used;
+034     c->used = max;
+035   
+036     /* sub digits from lower part */
+037     \{
+038       register mp_digit u, *tmpa, *tmpb, *tmpc;
+039       register int i;
+040   
+041       /* alias for digit pointers */
+042       tmpa = a->dp;
+043       tmpb = b->dp;
+044       tmpc = c->dp;
+045   
+046       /* set carry to zero */
+047       u = 0;
+048       for (i = 0; i < min; i++) \{
+049         /* T[i] = A[i] - B[i] - U */
+050         *tmpc = *tmpa++ - *tmpb++ - u;
+051   
+052         /* U = carry bit of T[i]
+053          * Note this saves performing an AND operation since
+054          * if a carry does occur it will propagate all the way to the
+055          * MSB.  As a result a single shift is required to get the carry
+056          */
+057         u = *tmpc >> ((mp_digit)(CHAR_BIT * sizeof (mp_digit) - 1));
+058   
+059         /* Clear carry from T[i] */
+060         *tmpc++ &= MP_MASK;
+061       \}
+062   
+063       /* now copy higher words if any, e.g. if A has more digits than B  */
+064       for (; i < max; i++) \{
+065         /* T[i] = A[i] - U */
+066         *tmpc = *tmpa++ - u;
+067   
+068         /* U = carry bit of T[i] */
+069         u = *tmpc >> ((mp_digit)(CHAR_BIT * sizeof (mp_digit) - 1));
+070   
+071         /* Clear carry from T[i] */
+072         *tmpc++ &= MP_MASK;
+073       \}
+074   
+075       /* clear digits above used (since we may not have grown result above) */
+      
+076       for (i = c->used; i < olduse; i++) \{
+077         *tmpc++ = 0;
+078       \}
+079     \}
+080   
+081     mp_clamp (c);
+082     return MP_OKAY;
+083   \}
+\end{alltt}
+\end{small}
+
+Lines 24 and 25 perform the initial hardcoded sorting.  In reality they are only aliases and are only used to make the source easier to 
+read.  Again the pointer alias optimization is used within this algorithm.  Lines 42, 43 and 44 initialize the aliases for 
+$a$, $b$ and $c$ respectively.
+
+The first subtraction loop occurs on lines 47 through 61.  The theory behind the subtraction loop is exactly the same as that for
+the addition loop.  As remarked earlier there is an implementation reason for using the ``awkward'' method of extracting the carry 
+(\textit{see line 57}).  The traditional method for extracting the carry would be to shift by $lg(\beta)$ positions and logically AND 
+the least significant bit.  The AND operation is required because all of the bits above the $\lg(\beta)$'th bit will be set to one after a carry
+occurs from subtraction.  This carry extraction requires two relatively cheap operations to extract the carry.  The other method is to simply 
+shift the most significant bit to the least significant bit thus extracting the carry with a single cheap operation.  This optimization only works on
+two's complement machines which is a safe assumption to make.
+
+If $a$ has a higher magnitude than $b$ an additional loop (\textit{see lines 64 through 73}) is required to propagate the carry through
+$a$ and copy the result to $c$.  
+
+\subsection{High Level Addition}
+Now that both lower level addition and subtraction algorithms have been established an effective high level signed addition algorithm can be
+established.  This high level addition algorithm will be what other algorithms and developers will use to perform addition of mp\_int data 
+types.  
+
+Recall from section 5.2 that an mp\_int represents an integer with an unsigned mantissa (\textit{the array of digits}) and a \textbf{sign} 
+flag.  A high level addition is actually performed as a series of eight separate cases which can be optimized down to three unique cases.
+
+\newpage\begin{figure}[!here]
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{mp\_add}. \\
+\textbf{Input}.   Two mp\_ints $a$ and $b$  \\
+\textbf{Output}.  The signed addition $c = a + b$. \\
+\hline \\
+1.  if $a.sign = b.sign$ then do \\
+\hspace{3mm}1.1  $c.sign \leftarrow a.sign$  \\
+\hspace{3mm}1.2  $c \leftarrow \vert a \vert + \vert b \vert$ (\textit{hint: use s\_mp\_add})\\
+2.  else do \\
+\hspace{3mm}2.1  if $\vert a \vert < \vert b \vert$ then do (\textit{hint: use mp\_cmp\_mag})  \\
+\hspace{6mm}2.1.1  $c.sign \leftarrow b.sign$ \\
+\hspace{6mm}2.1.2  $c \leftarrow \vert b \vert - \vert a \vert$ (\textit{hint: use s\_mp\_sub}) \\
+\hspace{3mm}2.2  else do \\
+\hspace{6mm}2.2.1  $c.sign \leftarrow a.sign$ \\
+\hspace{6mm}2.2.2  $c \leftarrow \vert a \vert - \vert b \vert$ \\
+3.  If any of the lower level operations failed return(\textit{MP\_MEM}) \\
+4.  Return(\textit{MP\_OKAY}). \\
+\hline
+\end{tabular}
+\end{center}
+\caption{Algorithm mp\_add}
+\end{figure}
+
+\textbf{Algorithm mp\_add.}
+This algorithm performs the signed addition of two mp\_int variables.  There is no reference algorithm to draw upon from either \cite{TAOCPV2} or 
+\cite{HAC} since they both only provide unsigned operations.  The algorithm is fairly straightforward but restricted since subtraction can only 
+produce positive results.  Consider the following chart of possible inputs.
+
+\begin{figure}[!here]
+\begin{small}
+\begin{center}
+\begin{tabular}{|c|c|c|c|c|}
+\hline \textbf{Sign of $a$} & \textbf{Sign of $b$} & \textbf{$\vert a \vert \ge \vert b \vert $} & \textbf{Unsigned Operation} & \textbf{Result Sign Flag} \\
+\hline $+$ & $+$ & Yes & $c = a + b$ & $a.sign$ \\
+\hline $+$ & $+$ & No  & $c = a + b$ & $a.sign$ \\
+\hline $-$ & $-$ & Yes & $c = a + b$ & $a.sign$ \\
+\hline $-$ & $-$ & No  & $c = a + b$ & $a.sign$ \\
+\hline &&&&\\
+
+\hline $+$ & $-$ & No  & $c = b - a$ & $b.sign$ \\
+\hline $-$ & $+$ & No  & $c = b - a$ & $b.sign$ \\
+
+\hline &&&&\\
+
+\hline $+$ & $-$ & Yes & $c = a - b$ & $a.sign$ \\
+\hline $-$ & $+$ & Yes & $c = a - b$ & $a.sign$ \\
+
+\hline
+\end{tabular}
+\end{center}
+\end{small}
+\caption{Addition Guide Chart}
+\end{figure}
+
+The chart lists all of the eight possible input combinations and is sorted to show that only three specific cases need to be handled.  The 
+return codes of the unsigned operations at steps 1.2, 2.1.2 and 2.2.2 are forwarded to step 3 to check for errors.  This simplifies the description
+of the algorithm considerably and best follows how the implementation actually was achieved.
+
+Also note how the \textbf{sign} is set before the unsigned addition or subtraction is performed.  Recall from the descriptions of algorithms
+s\_mp\_add and s\_mp\_sub that the mp\_clamp function is used at the end to trim excess digits.  The mp\_clamp algorithm will set the \textbf{sign}
+to \textbf{MP\_ZPOS} when the \textbf{used} digit count reaches zero.  
+
+For example, consider performing $-a + a$ with algorithm mp\_add.  By the description of the algorithm the sign is set to \textbf{MP\_NEG} which would
+produce a result of $-0$.  However, since the sign is set first and then the unsigned subtraction is performed, the subsequent usage of algorithm mp\_clamp 
+within algorithm s\_mp\_sub will force $-0$ to become $0$.  
+
+\index{bn\_mp\_add.c}
+\vspace{+3mm}\begin{small}
+\hspace{-5.1mm}{\bf File}: bn\_mp\_add.c
+\vspace{-3mm}
+\begin{alltt}
+016   
+017   /* high level addition (handles signs) */
+018   int
+019   mp_add (mp_int * a, mp_int * b, mp_int * c)
+020   \{
+021     int     sa, sb, res;
+022   
+023     /* get sign of both inputs */
+024     sa = a->sign;
+025     sb = b->sign;
+026   
+027     /* handle two cases, not four */
+028     if (sa == sb) \{
+029       /* both positive or both negative */
+030       /* add their magnitudes, copy the sign */
+031       c->sign = sa;
+032       res = s_mp_add (a, b, c);
+033     \} else \{
+034       /* one positive, the other negative */
+035       /* subtract the one with the greater magnitude from */
+036       /* the one of the lesser magnitude.  The result gets */
+037       /* the sign of the one with the greater magnitude. */
+038       if (mp_cmp_mag (a, b) == MP_LT) \{
+039         c->sign = sb;
+040         res = s_mp_sub (b, a, c);
+041       \} else \{
+042         c->sign = sa;
+043         res = s_mp_sub (a, b, c);
+044       \}
+045     \}
+046     return res;
+047   \}
+048   
+\end{alltt}
+\end{small}
+
+The source code follows the algorithm fairly closely.  The most notable new source code addition is the usage of the $res$ integer variable which
+is used to pass the result of the unsigned operations forward.  Unlike in the algorithm, the variable $res$ is merely returned as is without
+explicitly checking it and returning the constant \textbf{MP\_OKAY}.  The observation is that this algorithm will succeed or fail only if the lower
+level functions do so.  Returning their return code is sufficient.
+
+\subsection{High Level Subtraction}
+The high level signed subtraction algorithm is essentially the same as the high level signed addition algorithm.  
+
+\begin{figure}[!here]
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{mp\_sub}. \\
+\textbf{Input}.   Two mp\_ints $a$ and $b$  \\
+\textbf{Output}.  The signed subtraction $c = a - b$. \\
+\hline \\
+1.  if $a.sign \ne b.sign$ then do \\
+\hspace{3mm}1.1  $c.sign \leftarrow a.sign$ \\
+\hspace{3mm}1.2  $c \leftarrow \vert a \vert + \vert b \vert$ (\textit{hint: use s\_mp\_add}) \\
+2.  else do \\
+\hspace{3mm}2.1  if $\vert a \vert \ge \vert b \vert$ then do (\textit{hint: use mp\_cmp\_mag}) \\
+\hspace{6mm}2.1.1  $c.sign \leftarrow a.sign$ \\
+\hspace{6mm}2.1.2  $c \leftarrow \vert a \vert  - \vert b \vert$ (\textit{hint: use s\_mp\_sub}) \\
+\hspace{3mm}2.2  else do \\
+\hspace{6mm}2.2.1  $c.sign \leftarrow  \left \lbrace \begin{array}{ll}
+                              MP\_ZPOS &  \mbox{if }a.sign = MP\_NEG \\
+                              MP\_NEG  &  \mbox{otherwise} \\
+                              \end{array} \right .$ \\
+\hspace{6mm}2.2.2  $c \leftarrow \vert b \vert  - \vert a \vert$ \\
+3.  If any of the lower level operations failed return(\textit{MP\_MEM}). \\
+4.  Return(\textit{MP\_OKAY}). \\
+\hline
+\end{tabular}
+\end{center}
+\caption{Algorithm mp\_sub}
+\end{figure}
+
+\textbf{Algorithm mp\_sub.}
+This algorithm performs the signed subtraction of two inputs.  Similar to algorithm mp\_add there is no reference in either \cite{TAOCPV2} or 
+\cite{HAC}.  Also this algorithm is restricted by algorithm s\_mp\_sub.  The following chart lists the eight possible inputs and
+the operations required.
+
+\newpage\begin{figure}[!here]
+\begin{small}
+\begin{center}
+\begin{tabular}{|c|c|c|c|c|}
+\hline \textbf{Sign of $a$} & \textbf{Sign of $b$} & \textbf{$\vert a \vert \ge \vert b \vert $} & \textbf{Unsigned Operation} & \textbf{Result Sign Flag} \\
+\hline $+$ & $-$ & Yes & $c = a + b$ & $a.sign$ \\
+\hline $+$ & $-$ & No  & $c = a + b$ & $a.sign$ \\
+\hline $-$ & $+$ & Yes & $c = a + b$ & $a.sign$ \\
+\hline $-$ & $+$ & No  & $c = a + b$ & $a.sign$ \\
+\hline &&&& \\
+\hline $+$ & $+$ & Yes & $c = a - b$ & $a.sign$ \\
+\hline $-$ & $-$ & Yes & $c = a - b$ & $a.sign$ \\
+\hline &&&& \\
+\hline $+$ & $+$ & No  & $c = b - a$ & $\mbox{opposite of }a.sign$ \\
+\hline $-$ & $-$ & No  & $c = b - a$ & $\mbox{opposite of }a.sign$ \\
+\hline
+\end{tabular}
+\end{center}
+\end{small}
+\caption{Subtraction Guide Chart}
+\end{figure}
+
+Similar to the case of algorithm mp\_add the \textbf{sign} is set first before the unsigned addition or subtraction.  That is to prevent the 
+algorithm from producing $-a - -a = -0$ as a result.  
+
+\index{bn\_mp\_sub.c}
+\vspace{+3mm}\begin{small}
+\hspace{-5.1mm}{\bf File}: bn\_mp\_sub.c
+\vspace{-3mm}
+\begin{alltt}
+016   
+017   /* high level subtraction (handles signs) */
+018   int
+019   mp_sub (mp_int * a, mp_int * b, mp_int * c)
+020   \{
+021     int     sa, sb, res;
+022   
+023     sa = a->sign;
+024     sb = b->sign;
+025   
+026     if (sa != sb) \{
+027       /* subtract a negative from a positive, OR */
+028       /* subtract a positive from a negative. */
+029       /* In either case, ADD their magnitudes, */
+030       /* and use the sign of the first number. */
+031       c->sign = sa;
+032       res = s_mp_add (a, b, c);
+033     \} else \{
+034       /* subtract a positive from a positive, OR */
+035       /* subtract a negative from a negative. */
+036       /* First, take the difference between their */
+037       /* magnitudes, then... */
+038       if (mp_cmp_mag (a, b) != MP_LT) \{
+039         /* Copy the sign from the first */
+040         c->sign = sa;
+041         /* The first has a larger or equal magnitude */
+042         res = s_mp_sub (a, b, c);
+043       \} else \{
+044         /* The result has the *opposite* sign from */
+045         /* the first number. */
+046         c->sign = (sa == MP_ZPOS) ? MP_NEG : MP_ZPOS;
+047         /* The second has a larger magnitude */
+048         res = s_mp_sub (b, a, c);
+049       \}
+050     \}
+051     return res;
+052   \}
+053   
+\end{alltt}
+\end{small}
+
+Much like the implementation of algorithm mp\_add the variable $res$ is used to catch the return code of the unsigned addition or subtraction operations
+and forward it to the end of the function.  On line 38 the ``not equal to'' \textbf{MP\_LT} expression is used to emulate a 
+``greater than or equal to'' comparison.  
+
+\section{Bit and Digit Shifting}
+It is quite common to think of a multiple precision integer as a polynomial in $x$, that is $y = f(\beta)$ where $f(x) = \sum_{i=0}^{n-1} a_i x^i$.  
+This notation arises within discussion of Montgomery and Diminished Radix Reduction as well as Karatsuba multiplication and squaring.  
+
+In order to facilitate operations on polynomials in $x$ as above a series of simple ``digit'' algorithms have to be established.  That is to shift
+the digits left or right as well as to shift individual bits of the digits left and right.  It is important to note that not all ``shift'' operations
+are on radix-$\beta$ digits.  
+
+\subsection{Multiplication by Two}
+
+In a binary system where the radix is a power of two, multiplication by two not only arises often in other algorithms, it is also a fairly efficient 
+operation to perform.  A single precision logical shift left is sufficient to multiply a single digit by two.  
+
+\newpage\begin{figure}[!here]
+\begin{small}
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{mp\_mul\_2}. \\
+\textbf{Input}.   One mp\_int $a$ \\
+\textbf{Output}.  $b = 2a$. \\
+\hline \\
+1.  If $b.alloc < a.used + 1$ then grow $b$ to hold $a.used + 1$ digits.  (\textit{hint: use mp\_grow}) \\
+2.  If the reallocation failed return(\textit{MP\_MEM}). \\
+3.  $oldused \leftarrow b.used$ \\
+4.  $b.used \leftarrow a.used$ \\
+5.  $r \leftarrow 0$ \\
+6.  for $n$ from 0 to $a.used - 1$ do \\
+\hspace{3mm}6.1  $rr \leftarrow a_n >> (lg(\beta) - 1)$ \\
+\hspace{3mm}6.2  $b_n \leftarrow (a_n << 1) + r \mbox{ (mod }\beta\mbox{)}$ \\
+\hspace{3mm}6.3  $r \leftarrow rr$ \\
+7.  If $r \ne 0$ then do \\
+\hspace{3mm}7.1  $b_{a.used} \leftarrow 1$ \\
+\hspace{3mm}7.2  $b.used \leftarrow b.used + 1$ \\
+8.  If $b.used < oldused$ then do \\
+\hspace{3mm}8.1  for $n$ from $b.used$ to $oldused - 1$ do \\
+\hspace{6mm}8.1.1  $b_n \leftarrow 0$ \\
+9.  $b.sign \leftarrow a.sign$ \\
+10.  Return(\textit{MP\_OKAY}).\\
+\hline
+\end{tabular}
+\end{center}
+\end{small}
+\caption{Algorithm mp\_mul\_2}
+\end{figure}
+
+\textbf{Algorithm mp\_mul\_2.}
+This algorithm will quickly multiply a mp\_int by two provided $\beta$ is a power of two.  Neither \cite{TAOCPV2} nor \cite{HAC} describe such 
+an algorithm despite the fact it arises often in other algorithms.  The algorithm is set up much like the lower level algorithm s\_mp\_add since 
+it is for all intents and purposes equivalent to the operation $b = \vert a \vert + \vert a \vert$.  
+
+Steps 1 and 2 grow the destination as required to accommodate the maximum number of \textbf{used} digits in the result.  The initial \textbf{used} count
+is set to $a.used$ at step 4.  Only if there is a final carry will the \textbf{used} count require adjustment.
+
+Step 6 is an optimized implementation of the addition loop for this specific case.  That is since the two values being added together 
+are the same there is no need to perform two reads from the digits of $a$.  Step 6.1 performs a single precision shift on the current digit $a_n$ to
+obtain what will be the carry for the next iteration.  Step 6.2 calculates the $n$'th digit of the result as a single precision shift of $a_n$ plus
+the previous carry.  Recall from section 5.1 that $a_n << 1$ is equivalent to $a_n \cdot 2$.  An iteration of the addition loop is finished by 
+forwarding the carry to the next iteration.
+
+Step 7 takes care of any final carry by setting the $a.used$'th digit of the result to one and augmenting the \textbf{used} count.  Step 8 clears
+any original leading digits of $b$.
+
+\index{bn\_mp\_mul\_2.c}
+\vspace{+3mm}\begin{small}
+\hspace{-5.1mm}{\bf File}: bn\_mp\_mul\_2.c
+\vspace{-3mm}
+\begin{alltt}
+016   
+017   /* b = a*2 */
+018   int
+019   mp_mul_2 (mp_int * a, mp_int * b)
+020   \{
+021     int     x, res, oldused;
+022   
+023     /* grow to accomodate result */
+024     if (b->alloc < a->used + 1) \{
+025       if ((res = mp_grow (b, a->used + 1)) != MP_OKAY) \{
+026         return res;
+027       \}
+028     \}
+029   
+030     oldused = b->used;
+031     b->used = a->used;
+032   
+033     \{
+034       register mp_digit r, rr, *tmpa, *tmpb;
+035   
+036       /* alias for source */
+037       tmpa = a->dp;
+038       
+039       /* alias for dest */
+040       tmpb = b->dp;
+041   
+042       /* carry */
+043       r = 0;
+044       for (x = 0; x < a->used; x++) \{
+045       
+046         /* get what will be the *next* carry bit from the 
+047          * MSB of the current digit 
+048          */
+049         rr = *tmpa >> ((mp_digit)(DIGIT_BIT - 1));
+050         
+051         /* now shift up this digit, add in the carry [from the previous] */
+052         *tmpb++ = ((*tmpa++ << ((mp_digit)1)) | r) & MP_MASK;
+053         
+054         /* copy the carry that would be from the source 
+055          * digit into the next iteration 
+056          */
+057         r = rr;
+058       \}
+059   
+060       /* new leading digit? */
+061       if (r != 0) \{
+062         /* add a MSB which is always 1 at this point */
+063         *tmpb = 1;
+064         ++b->used;
+065       \}
+066   
+067       /* now zero any excess digits on the destination 
+068        * that we didn't write to 
+069        */
+070       tmpb = b->dp + b->used;
+071       for (x = b->used; x < oldused; x++) \{
+072         *tmpb++ = 0;
+073       \}
+074     \}
+075     b->sign = a->sign;
+076     return MP_OKAY;
+077   \}
+\end{alltt}
+\end{small}
+
+This implementation is essentially an optimized implementation of s\_mp\_add for the case of doubling an input.  The only noteworthy difference
+is the use of the logical shift operator on line 52 to perform a single precision doubling.  
+
+\subsection{Division by Two}
+A division by two can just as easily be accomplished with a logical shift right as multiplication by two can be with a logical shift left.
+
+\newpage\begin{figure}[!here]
+\begin{small}
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{mp\_div\_2}. \\
+\textbf{Input}.   One mp\_int $a$ \\
+\textbf{Output}.  $b = a/2$. \\
+\hline \\
+1.  If $b.alloc < a.used$ then grow $b$ to hold $a.used$ digits.  (\textit{hint: use mp\_grow}) \\
+2.  If the reallocation failed return(\textit{MP\_MEM}). \\
+3.  $oldused \leftarrow b.used$ \\
+4.  $b.used \leftarrow a.used$ \\
+5.  $r \leftarrow 0$ \\
+6.  for $n$ from $b.used - 1$ to $0$ do \\
+\hspace{3mm}6.1  $rr \leftarrow a_n \mbox{ (mod }2\mbox{)}$\\
+\hspace{3mm}6.2  $b_n \leftarrow (a_n >> 1) + (r << (lg(\beta) - 1)) \mbox{ (mod }\beta\mbox{)}$ \\
+\hspace{3mm}6.3  $r \leftarrow rr$ \\
+7.  If $b.used < oldused$ then do \\
+\hspace{3mm}7.1  for $n$ from $b.used$ to $oldused - 1$ do \\
+\hspace{6mm}7.1.1  $b_n \leftarrow 0$ \\
+8.  $b.sign \leftarrow a.sign$ \\
+9.  Return(\textit{MP\_OKAY}).\\
+\hline
+\end{tabular}
+\end{center}
+\end{small}
+\caption{Algorithm mp\_div\_2}
+\end{figure}
+
+\textbf{Algorithm mp\_div\_2.}
+This algorithm will divide an mp\_int by two using logical shifts to the right.  Like mp\_mul\_2 it uses a modified low level addition
+core as the basis of the algorithm.  Unlike mp\_mul\_2 the shift operations work from the leading digit to the trailing digit.  The algorithm
+could be written to work from the trailing digit to the leading digit; however, it would have to stop one short of $a.used - 1$ digits to prevent
+reading past the end of the array of digits.
+
+Essentially the loop at step 6 is similar to that of mp\_mul\_2 except the logical shifts go in the opposite direction and the carry is at the 
+least significant bit not the most significant bit.  
+
+\index{bn\_mp\_div\_2.c}
+\vspace{+3mm}\begin{small}
+\hspace{-5.1mm}{\bf File}: bn\_mp\_div\_2.c
+\vspace{-3mm}
+\begin{alltt}
+016   
+017   /* b = a/2 */
+018   int
+019   mp_div_2 (mp_int * a, mp_int * b)
+020   \{
+021     int     x, res, oldused;
+022   
+023     /* copy */
+024     if (b->alloc < a->used) \{
+025       if ((res = mp_grow (b, a->used)) != MP_OKAY) \{
+026         return res;
+027       \}
+028     \}
+029   
+030     oldused = b->used;
+031     b->used = a->used;
+032     \{
+033       register mp_digit r, rr, *tmpa, *tmpb;
+034   
+035       /* source alias */
+036       tmpa = a->dp + b->used - 1;
+037   
+038       /* dest alias */
+039       tmpb = b->dp + b->used - 1;
+040   
+041       /* carry */
+042       r = 0;
+043       for (x = b->used - 1; x >= 0; x--) \{
+044         /* get the carry for the next iteration */
+045         rr = *tmpa & 1;
+046   
+047         /* shift the current digit, add in carry and store */
+048         *tmpb-- = (*tmpa-- >> 1) | (r << (DIGIT_BIT - 1));
+049   
+050         /* forward carry to next iteration */
+051         r = rr;
+052       \}
+053   
+054       /* zero excess digits */
+055       tmpb = b->dp + b->used;
+056       for (x = b->used; x < oldused; x++) \{
+057         *tmpb++ = 0;
+058       \}
+059     \}
+060     b->sign = a->sign;
+061     mp_clamp (b);
+062     return MP_OKAY;
+063   \}
+\end{alltt}
+\end{small}
+
+\section{Polynomial Basis Operations}
+Recall from section 5.3 that any integer can be represented as a polynomial in $x$ as $y = f(\beta)$.  Such a representation is also known as
+the polynomial basis \cite[pp. 48]{ROSE}. Given such a notation a multiplication or division by $x$ amounts to shifting whole digits a single 
+place.  The need for such operations arises in several other higher level algorithms such as Barrett and Montgomery reduction, integer
+division and Karatsuba multiplication.  
+
+Converting from an array of digits to polynomial basis is very simple.  Consider the integer $y \equiv (a_2, a_1, a_0)_{\beta}$ and recall that
+$y = \sum_{i=0}^{2} a_i \beta^i$.  Simply replace $\beta$ with $x$ and the expression is in polynomial basis.  For example, $f(x) = 8x + 9$ is the
+polynomial basis representation for $89$ using radix ten.  That is, $f(10) = 8(10) + 9 = 89$.  
+
+\subsection{Multiplication by $x$}
+
+Given a polynomial in $x$ such as $f(x) = a_n x^n + a_{n-1} x^{n-1} + ... + a_0$ multiplying by $x$ amounts to shifting the coefficients up one 
+degree.  In this case $f(x) \cdot x = a_n x^{n+1} + a_{n-1} x^n + ... + a_0 x$.  From a scalar basis point of view multiplying by $x$ is equivalent to
+multiplying by the integer $\beta$.  
+
+\newpage\begin{figure}[!here]
+\begin{small}
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{mp\_lshd}. \\
+\textbf{Input}.   One mp\_int $a$ and an integer $b$ \\
+\textbf{Output}.  $a \leftarrow a \cdot \beta^b$ (Multiply by $x^b$). \\
+\hline \\
+1.  If $b \le 0$ then return(\textit{MP\_OKAY}). \\
+2.  If $a.alloc < a.used + b$ then grow $a$ to at least $a.used + b$ digits.  (\textit{hint: use mp\_grow}). \\
+3.  If the reallocation failed return(\textit{MP\_MEM}). \\
+4.  $a.used \leftarrow a.used + b$ \\
+5.  $i \leftarrow a.used - 1$ \\
+6.  $j \leftarrow a.used - 1 - b$ \\
+7.  for $n$ from $a.used - 1$ to $b$ do \\
+\hspace{3mm}7.1  $a_{i} \leftarrow a_{j}$ \\
+\hspace{3mm}7.2  $i \leftarrow i - 1$ \\
+\hspace{3mm}7.3  $j \leftarrow j - 1$ \\
+8.  for $n$ from 0 to $b - 1$ do \\
+\hspace{3mm}8.1  $a_n \leftarrow 0$ \\
+9.  Return(\textit{MP\_OKAY}). \\
+\hline
+\end{tabular}
+\end{center}
+\end{small}
+\caption{Algorithm mp\_lshd}
+\end{figure}
+
+\textbf{Algorithm mp\_lshd.}
+This algorithm multiplies an mp\_int by the $b$'th power of $x$.  This is equivalent to multiplying by $\beta^b$.  The algorithm differs 
+from the other algorithms presented so far as it performs the operation in place instead of storing the result in a separate location.  The algorithm
+will return success immediately if $b \le 0$ since the rest of the algorithm is only valid when $b > 0$.  
+
+First the destination $a$ is grown as required to accommodate the result.  The counters $i$ and $j$ are used to form a \textit{sliding window} over
+the digits of $a$ of length $b$.  The head of the sliding window is at $i$ (\textit{the leading digit}) and the tail at $j$ (\textit{the trailing digit}).  
+The loop on step 7 copies the digit from the tail to the head.  In each iteration the window is moved down one digit.   The last loop on 
+step 8 sets the lower $b$ digits to zero.
+
+\newpage
+\begin{center}
+\begin{figure}[here]
+\includegraphics{pics/sliding_window.ps}
+\caption{Sliding Window Movement}
+\end{figure}
+\end{center}
+
+\index{bn\_mp\_lshd.c}
+\vspace{+3mm}\begin{small}
+\hspace{-5.1mm}{\bf File}: bn\_mp\_lshd.c
+\vspace{-3mm}
+\begin{alltt}
+016   
+017   /* shift left a certain amount of digits */
+018   int
+019   mp_lshd (mp_int * a, int b)
+020   \{
+021     int     x, res;
+022   
+023     /* if its less than zero return */
+024     if (b <= 0) \{
+025       return MP_OKAY;
+026     \}
+027   
+028     /* grow to fit the new digits */
+029     if (a->alloc < a->used + b) \{
+030        if ((res = mp_grow (a, a->used + b)) != MP_OKAY) \{
+031          return res;
+032        \}
+033     \}
+034   
+035     \{
+036       register mp_digit *tmpa, *tmpaa;
+037   
+038       /* increment the used by the shift amount than copy upwards */
+039       a->used += b;
+040   
+041       /* top */
+042       tmpa = a->dp + a->used - 1;
+043   
+044       /* base */
+045       tmpaa = a->dp + a->used - 1 - b;
+046   
+047       /* much like mp_rshd this is implemented using a sliding window
+048        * except the window goes the otherway around.  Copying from
+049        * the bottom to the top.  see bn_mp_rshd.c for more info.
+050        */
+051       for (x = a->used - 1; x >= b; x--) \{
+052         *tmpa-- = *tmpaa--;
+053       \}
+054   
+055       /* zero the lower digits */
+056       tmpa = a->dp;
+057       for (x = 0; x < b; x++) \{
+058         *tmpa++ = 0;
+059       \}
+060     \}
+061     return MP_OKAY;
+062   \}
+\end{alltt}
+\end{small}
+
+The if statement on line 24 ensures that the $b$ variable is greater than zero.  The \textbf{used} count is incremented by $b$ before
+the copy loop begins.  This eliminates the need for an additional variable in the for loop.  The variable $tmpa$ on line 42 is an alias
+for the leading digit while $tmpaa$ on line 45 is an alias for the trailing edge.  The aliases form a window of exactly $b$ digits
+over the input.  
+
+\subsection{Division by $x$}
+
+Division by powers of $x$ is easily achieved by shifting the digits right and removing any that will end up to the right of the zero'th digit.  
+
+\newpage\begin{figure}[!here]
+\begin{small}
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{mp\_rshd}. \\
+\textbf{Input}.   One mp\_int $a$ and an integer $b$ \\
+\textbf{Output}.  $a \leftarrow a / \beta^b$ (Divide by $x^b$). \\
+\hline \\
+1.  If $b \le 0$ then return. \\
+2.  If $a.used \le b$ then do \\
+\hspace{3mm}2.1  Zero $a$.  (\textit{hint: use mp\_zero}). \\
+\hspace{3mm}2.2  Return. \\
+3.  $i \leftarrow 0$ \\
+4.  $j \leftarrow b$ \\
+5.  for $n$ from 0 to $a.used - b - 1$ do \\
+\hspace{3mm}5.1  $a_i \leftarrow a_j$ \\
+\hspace{3mm}5.2  $i \leftarrow i + 1$ \\
+\hspace{3mm}5.3  $j \leftarrow j + 1$ \\
+6.  for $n$ from $a.used - b$ to $a.used - 1$ do \\
+\hspace{3mm}6.1  $a_n \leftarrow 0$ \\
+7.  Clamp excess digits.  (\textit{hint: use mp\_clamp}). \\
+8.  Return. \\
+\hline
+\end{tabular}
+\end{center}
+\end{small}
+\caption{Algorithm mp\_rshd}
+\end{figure}
+
+\textbf{Algorithm mp\_rshd.}
+This algorithm divides the input in place by the $b$'th power of $x$.  It is analogous to dividing by $\beta^b$ but much quicker since
+it does not require single precision division.  This algorithm does not actually return an error code as it cannot fail.  
+
+If the input $b$ is less than one the algorithm quickly returns without performing any work.  If the \textbf{used} count is less than or equal
+to the shift count $b$ then it will simply zero the input and return.
+
+After the trivial cases of inputs have been handled the sliding window is set up.  Much like the case of algorithm mp\_lshd a sliding window that
+is $b$ digits wide is used to copy the digits.  Unlike mp\_lshd the window slides in the opposite direction from the trailing to the leading digit.  
+Also the digits are copied from the leading to the trailing edge.
+
+Once the window copy is complete the upper digits must be zeroed.  Finally algorithm mp\_clamp is used to trim excess digits.
+
+\index{bn\_mp\_rshd.c}
+\vspace{+3mm}\begin{small}
+\hspace{-5.1mm}{\bf File}: bn\_mp\_rshd.c
+\vspace{-3mm}
+\begin{alltt}
+016   
+017   /* shift right a certain amount of digits */
+018   void
+019   mp_rshd (mp_int * a, int b)
+020   \{
+021     int     x;
+022   
+023     /* if b <= 0 then ignore it */
+024     if (b <= 0) \{
+025       return;
+026     \}
+027   
+028     /* if b > used then simply zero it and return */
+029     if (a->used <= b) \{
+030       mp_zero (a);
+031       return;
+032     \}
+033   
+034     \{
+035       register mp_digit *tmpa, *tmpaa;
+036   
+037       /* shift the digits down */
+038   
+039       /* base */
+040       tmpa = a->dp;
+041   
+042       /* offset into digits */
+043       tmpaa = a->dp + b;
+044   
+045       /* this is implemented as a sliding window where 
+046        * the window is b-digits long and digits from 
+047        * the top of the window are copied to the bottom
+048        *
+049        * e.g.
+050   
+051        b-2 | b-1 | b0 | b1 | b2 | ... | bb |   ---->
+052                    /\symbol{92}                   |      ---->
+053                     \symbol{92}-------------------/      ---->
+054        */
+055       for (x = 0; x < (a->used - b); x++) \{
+056         *tmpa++ = *tmpaa++;
+057       \}
+058   
+059       /* zero the top digits */
+060       for (; x < a->used; x++) \{
+061         *tmpa++ = 0;
+062       \}
+063     \}
+064     mp_clamp (a);
+065   \}
+\end{alltt}
+\end{small}
+
+The only noteworthy element of this routine is the lack of a return type.  This function cannot fail and as such it is more optimal to not
+return anything.
+
+\section{Powers of Two}
+
+Now that algorithms for moving single bits as well as whole digits exist algorithms for moving the ``in between'' distances are required.  For 
+example, to quickly multiply by $2^k$ for any $k$ without using a full multiplier algorithm would prove useful.  Instead of performing single
+shifts $k$ times to achieve a multiplication by $2^{\pm k}$ a mixture of whole digit shifting and partial digit shifting is employed.  
+
+\subsection{Multiplication by Power of Two}
+
+\newpage\begin{figure}[!here]
+\begin{small}
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{mp\_mul\_2d}. \\
+\textbf{Input}.   One mp\_int $a$ and an integer $b$ \\
+\textbf{Output}.  $c \leftarrow a \cdot 2^b$. \\
+\hline \\
+1.  $c \leftarrow a$.  (\textit{hint: use mp\_copy}) \\
+2.  If $c.alloc < c.used + \lfloor b / lg(\beta) \rfloor + 2$ then grow $c$ accordingly. \\
+3.  If the reallocation failed return(\textit{MP\_MEM}). \\
+4.  If $b \ge lg(\beta)$ then \\
+\hspace{3mm}4.1  $c \leftarrow c \cdot \beta^{\lfloor b / lg(\beta) \rfloor}$ (\textit{hint: use mp\_lshd}). \\
+\hspace{3mm}4.2  If step 4.1 failed return(\textit{MP\_MEM}). \\
+5.  $d \leftarrow b \mbox{ (mod }lg(\beta)\mbox{)}$ \\
+6.  If $d \ne 0$ then do \\
+\hspace{3mm}6.1  $mask \leftarrow 2^d$ \\
+\hspace{3mm}6.2  $r \leftarrow 0$ \\
+\hspace{3mm}6.3  for $n$ from $0$ to $c.used - 1$ do \\
+\hspace{6mm}6.3.1  $rr \leftarrow c_n >> (lg(\beta) - d) \mbox{ (mod }mask\mbox{)}$ \\
+\hspace{6mm}6.3.2  $c_n \leftarrow (c_n << d) + r \mbox{ (mod }\beta\mbox{)}$ \\
+\hspace{6mm}6.3.3  $r \leftarrow rr$ \\
+\hspace{3mm}6.4  If $r > 0$ then do \\
+\hspace{6mm}6.4.1  $c_{c.used} \leftarrow r$ \\
+\hspace{6mm}6.4.2  $c.used \leftarrow c.used + 1$ \\
+7.  Return(\textit{MP\_OKAY}). \\
+\hline
+\end{tabular}
+\end{center}
+\end{small}
+\caption{Algorithm mp\_mul\_2d}
+\end{figure}
+
+\textbf{Algorithm mp\_mul\_2d.}
+This algorithm multiplies $a$ by $2^b$ and stores the result in $c$.  The algorithm uses algorithm mp\_lshd and a derivative of algorithm mp\_mul\_2 to
+quickly compute the product.
+
+First the algorithm will multiply $a$ by $x^{\lfloor b / lg(\beta) \rfloor}$ which will ensure that the remainder multiplicand is less than 
+$\beta$.  For example, if $b = 37$ and $\beta = 2^{28}$ then this step will multiply by $x$ leaving a multiplication by $2^{37 - 28} = 2^{9}$ 
+left.
+
+The logarithm of the residue is calculated on step 5.  If it is non-zero a modified shift loop is used to calculate the remaining product.  
+Essentially the loop is a generic version of algorithm mp\_mul\_2 designed to handle any shift count in the range $1 \le x < lg(\beta)$.  The $mask$
+variable is used to extract the upper $d$ bits to form the carry for the next iteration.  
+
+This algorithm is loosely measured as a $O(2n)$ algorithm which means that if the input is $n$-digits that it takes $2n$ ``time'' to 
+complete.  It is possible to optimize this algorithm down to a $O(n)$ algorithm at a cost of making the algorithm slightly harder to follow.
+
+\index{bn\_mp\_mul\_2d.c}
+\vspace{+3mm}\begin{small}
+\hspace{-5.1mm}{\bf File}: bn\_mp\_mul\_2d.c
+\vspace{-3mm}
+\begin{alltt}
+016   
+017   /* NOTE:  This routine requires updating.  For instance the c->used = c->all
+      oc bit
+018      is wrong.  We should just shift c->used digits then set the carry as c->d
+      p[c->used] = carry
+019    
+020      To be fixed for LTM 0.18
+021    */
+022   
+023   /* shift left by a certain bit count */
+024   int
+025   mp_mul_2d (mp_int * a, int b, mp_int * c)
+026   \{
+027     mp_digit d;
+028     int      res;
+029   
+030     /* copy */
+031     if (a != c) \{
+032        if ((res = mp_copy (a, c)) != MP_OKAY) \{
+033          return res;
+034        \}
+035     \}
+036   
+037     if (c->alloc < (int)(c->used + b/DIGIT_BIT + 2)) \{
+038        if ((res = mp_grow (c, c->used + b / DIGIT_BIT + 2)) != MP_OKAY) \{
+039          return res;
+040        \}
+041     \}
+042   
+043     /* shift by as many digits in the bit count */
+044     if (b >= (int)DIGIT_BIT) \{
+045       if ((res = mp_lshd (c, b / DIGIT_BIT)) != MP_OKAY) \{
+046         return res;
+047       \}
+048     \}
+049     c->used = c->alloc;
+050   
+051     /* shift any bit count < DIGIT_BIT */
+052     d = (mp_digit) (b % DIGIT_BIT);
+053     if (d != 0) \{
+054       register mp_digit *tmpc, mask, r, rr;
+055       register int x;
+056   
+057       /* bitmask for carries */
+058       mask = (((mp_digit)1) << d) - 1;
+059   
+060       /* alias */
+061       tmpc = c->dp;
+062   
+063       /* carry */
+064       r    = 0;
+065       for (x = 0; x < c->used; x++) \{
+066         /* get the higher bits of the current word */
+067         rr = (*tmpc >> (DIGIT_BIT - d)) & mask;
+068   
+069         /* shift the current word and OR in the carry */
+070         *tmpc = ((*tmpc << d) | r) & MP_MASK;
+071         ++tmpc;
+072   
+073         /* set the carry to the carry bits of the current word */
+074         r = rr;
+075       \}
+076     \}
+077     mp_clamp (c);
+078     return MP_OKAY;
+079   \}
+\end{alltt}
+\end{small}
+
+Notes to be revised when code is updated. -- Tom
+
+\subsection{Division by Power of Two}
+
+\newpage\begin{figure}[!here]
+\begin{small}
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{mp\_div\_2d}. \\
+\textbf{Input}.   One mp\_int $a$ and an integer $b$ \\
+\textbf{Output}.  $c \leftarrow \lfloor a / 2^b \rfloor, d \leftarrow a \mbox{ (mod }2^b\mbox{)}$. \\
+\hline \\
+1.  If $b \le 0$ then do \\
+\hspace{3mm}1.1  $c \leftarrow a$ (\textit{hint: use mp\_copy}) \\
+\hspace{3mm}1.2  $d \leftarrow 0$ (\textit{hint: use mp\_zero}) \\
+\hspace{3mm}1.3  Return(\textit{MP\_OKAY}). \\
+2.  $c \leftarrow a$ \\
+3.  $d \leftarrow a \mbox{ (mod }2^b\mbox{)}$ (\textit{hint: use mp\_mod\_2d}) \\
+4.  If $b \ge lg(\beta)$ then do \\
+\hspace{3mm}4.1  $c \leftarrow \lfloor c/\beta^{\lfloor b/lg(\beta) \rfloor} \rfloor$ (\textit{hint: use mp\_rshd}). \\
+5.  $k \leftarrow b \mbox{ (mod }lg(\beta)\mbox{)}$ \\
+6.  If $k \ne 0$ then do \\
+\hspace{3mm}6.1  $mask \leftarrow 2^k$ \\
+\hspace{3mm}6.2  $r \leftarrow 0$ \\
+\hspace{3mm}6.3  for $n$ from $c.used - 1$ to $0$ do \\
+\hspace{6mm}6.3.1  $rr \leftarrow c_n \mbox{ (mod }mask\mbox{)}$ \\
+\hspace{6mm}6.3.2  $c_n \leftarrow (c_n >> k) + (r << (lg(\beta) - k))$ \\
+\hspace{6mm}6.3.3  $r \leftarrow rr$ \\
+7.  Clamp excess digits of $c$.  (\textit{hint: use mp\_clamp}) \\
+8.  Return(\textit{MP\_OKAY}). \\
+\hline
+\end{tabular}
+\end{center}
+\end{small}
+\caption{Algorithm mp\_div\_2d}
+\end{figure}
+
+\textbf{Algorithm mp\_div\_2d.}
+This algorithm will divide an input $a$ by $2^b$ and produce the quotient and remainder.  The algorithm is designed much like algorithm 
+mp\_mul\_2d by first using whole digit shifts then single precision shifts.  This algorithm will also produce the remainder of the division
+by using algorithm mp\_mod\_2d.
+
+\index{bn\_mp\_div\_2d.c}
+\vspace{+3mm}\begin{small}
+\hspace{-5.1mm}{\bf File}: bn\_mp\_div\_2d.c
+\vspace{-3mm}
+\begin{alltt}
+016   
+017   /* shift right by a certain bit count (store quotient in c, remainder in d) 
+      */
+018   int
+019   mp_div_2d (mp_int * a, int b, mp_int * c, mp_int * d)
+020   \{
+021     mp_digit D, r, rr;
+022     int     x, res;
+023     mp_int  t;
+024   
+025   
+026     /* if the shift count is <= 0 then we do no work */
+027     if (b <= 0) \{
+028       res = mp_copy (a, c);
+029       if (d != NULL) \{
+030         mp_zero (d);
+031       \}
+032       return res;
+033     \}
+034   
+035     if ((res = mp_init (&t)) != MP_OKAY) \{
+036       return res;
+037     \}
+038   
+039     /* get the remainder */
+040     if (d != NULL) \{
+041       if ((res = mp_mod_2d (a, b, &t)) != MP_OKAY) \{
+042         mp_clear (&t);
+043         return res;
+044       \}
+045     \}
+046   
+047     /* copy */
+048     if ((res = mp_copy (a, c)) != MP_OKAY) \{
+049       mp_clear (&t);
+050       return res;
+051     \}
+052   
+053     /* shift by as many digits in the bit count */
+054     if (b >= (int)DIGIT_BIT) \{
+055       mp_rshd (c, b / DIGIT_BIT);
+056     \}
+057   
+058     /* shift any bit count < DIGIT_BIT */
+059     D = (mp_digit) (b % DIGIT_BIT);
+060     if (D != 0) \{
+061       register mp_digit *tmpc, mask;
+062   
+063       /* mask */
+064       mask = (((mp_digit)1) << D) - 1;
+065   
+066       /* alias */
+067       tmpc = c->dp + (c->used - 1);
+068   
+069       /* carry */
+070       r = 0;
+071       for (x = c->used - 1; x >= 0; x--) \{
+072         /* get the lower  bits of this word in a temp */
+073         rr = *tmpc & mask;
+074   
+075         /* shift the current word and mix in the carry bits from the previous 
+      word */
+076         *tmpc = (*tmpc >> D) | (r << (DIGIT_BIT - D));
+077         --tmpc;
+078   
+079         /* set the carry to the carry bits of the current word found above */
+080         r = rr;
+081       \}
+082     \}
+083     mp_clamp (c);
+084     res = MP_OKAY;
+085     if (d != NULL) \{
+086       mp_exch (&t, d);
+087     \}
+088     mp_clear (&t);
+089     return MP_OKAY;
+090   \}
+\end{alltt}
+\end{small}
+
+The implementation of algorithm mp\_div\_2d is slightly different than the algorithm specifies.  The remainder $d$ may be optionally 
+ignored by passing \textbf{NULL} as the pointer to the mp\_int variable.    The temporary mp\_int variable $t$ is used to hold the 
+result of the remainder operation until the end.  This allows $d = a$ to be true without overwriting the input before it is no longer required.  
+
+The remainder of the source code is essentially the same as the source code for mp\_mul\_2d.  (-- Fix this paragraph up later, Tom).
+
+\subsection{Remainder of Division by Power of Two}
+
+The last algorithm in the series of polynomial basis power of two algorithms is calculating the remainder of division by $2^b$.  This
+algorithm benefits from the fact that in two's complement arithmetic $a \mbox{ (mod }2^b\mbox{)}$ is the same as $a$ AND $2^b - 1$.  
+
+\begin{figure}[!here]
+\begin{small}
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{mp\_mod\_2d}. \\
+\textbf{Input}.   One mp\_int $a$ and an integer $b$ \\
+\textbf{Output}.  $c \leftarrow a \mbox{ (mod }2^b\mbox{)}$. \\
+\hline \\
+1.  If $b \le 0$ then do \\
+\hspace{3mm}1.1  $c \leftarrow 0$ (\textit{hint: use mp\_zero}) \\
+\hspace{3mm}1.2  Return(\textit{MP\_OKAY}). \\
+2.  If $b > a.used \cdot lg(\beta)$ then do \\
+\hspace{3mm}2.1  $c \leftarrow a$ (\textit{hint: use mp\_copy}) \\
+\hspace{3mm}2.2  Return the result of step 2.1. \\
+3.  $c \leftarrow a$ \\
+4.  If step 3 failed return(\textit{MP\_MEM}). \\
+5.  for $n$ from $\lceil b / lg(\beta) \rceil$ to $c.used - 1$ do \\
+\hspace{3mm}5.1  $c_n \leftarrow 0$ \\
+6.  $k \leftarrow b \mbox{ (mod }lg(\beta)\mbox{)}$ \\
+7.  $c_{\lfloor b / lg(\beta) \rfloor} \leftarrow c_{\lfloor b / lg(\beta) \rfloor} \mbox{ (mod }2^{k}\mbox{)}$. \\
+8.  Return(\textit{MP\_OKAY}). \\
+\hline
+\end{tabular}
+\end{center}
+\end{small}
+\caption{Algorithm mp\_mod\_2d}
+\end{figure}
+
+\textbf{Algorithm mp\_mod\_2d.}
+This algorithm will quickly calculate the value of $a \mbox{ (mod }2^b\mbox{)}$.  First if $b$ is less than or equal to zero the 
+result is set to zero.  If $b$ is greater than the number of bits in $a$ then it simply copies $a$ to $c$ and returns.  Otherwise, $a$ 
+is copied to $c$, leading digits are removed and the remaining leading digit is trimmed to the exact bit count.
+
+\index{bn\_mp\_mod\_2d.c}
+\vspace{+3mm}\begin{small}
+\hspace{-5.1mm}{\bf File}: bn\_mp\_mod\_2d.c
+\vspace{-3mm}
+\begin{alltt}
+016   
+017   /* calc a value mod 2^b */
+018   int
+019   mp_mod_2d (mp_int * a, int b, mp_int * c)
+020   \{
+021     int     x, res;
+022   
+023   
+024     /* if b is <= 0 then zero the int */
+025     if (b <= 0) \{
+026       mp_zero (c);
+027       return MP_OKAY;
+028     \}
+029   
+030     /* if the modulus is larger than the value than return */
+031     if (b > (int) (a->used * DIGIT_BIT)) \{
+032       res = mp_copy (a, c);
+033       return res;
+034     \}
+035   
+036     /* copy */
+037     if ((res = mp_copy (a, c)) != MP_OKAY) \{
+038       return res;
+039     \}
+040   
+041     /* zero digits above the last digit of the modulus */
+042     for (x = (b / DIGIT_BIT) + ((b % DIGIT_BIT) == 0 ? 0 : 1); x < c->used; x+
+      +) \{
+043       c->dp[x] = 0;
+044     \}
+045     /* clear the digit that is not completely outside/inside the modulus */
+046     c->dp[b / DIGIT_BIT] &=
+047       (mp_digit) ((((mp_digit) 1) << (((mp_digit) b) % DIGIT_BIT)) - ((mp_digi
+      t) 1));
+048     mp_clamp (c);
+049     return MP_OKAY;
+050   \}
+\end{alltt}
+\end{small}
+
+-- Add comments later, Tom.
+
+\section*{Exercises}
+\begin{tabular}{cl}
+$\left [ 3 \right ] $ & Devise an algorithm that performs $a \cdot 2^b$ for generic values of $b$ \\
+                      & in $O(n)$ time. \\
+                      &\\
+$\left [ 3 \right ] $ & Devise an efficient algorithm to multiply by small low hamming  \\
+                      & weight values such as $3$, $5$ and $9$.  Extend it to handle all values \\
+                      & up to $64$ with a hamming weight less than three. \\
+                      &\\
+$\left [ 2 \right ] $ & Modify the preceding algorithm to handle values of the form \\
+                      & $2^k - 1$ as well. \\
+                      &\\
+$\left [ 3 \right ] $ & Using only algorithms mp\_mul\_2, mp\_div\_2 and mp\_add create an \\
+                      & algorithm to multiply two integers in roughly $O(2n^2)$ time for \\
+                      & any $n$-bit input.  Note that the time of addition is ignored in the \\
+                      & calculation.  \\
+                      & \\
+$\left [ 5 \right ] $ & Improve the previous algorithm to have a working time of at most \\
+                      & $O \left (2^{(k-1)}n + \left ({2n^2 \over k} \right ) \right )$ for an appropriate choice of $k$.  Again ignore \\
+                      & the cost of addition. \\
+                      & \\
+$\left [ 1 \right ] $ & There exists an improvement on the previous algorithm to \\
+                      & slightly reduce the number of additions required.  Modify the \\
+                      & previous algorithm to include this improvement. \\
+                      & \\
+$\left [ 2 \right ] $ & Devise a chart to find optimal values of $k$ for the previous problem \\
+                      & for $n = 64 \ldots 1024$ in steps of $64$. \\
+                      & \\
+$\left [ 2 \right ] $ & Using only algorithms mp\_abs and mp\_sub devise another method for \\
+                      & calculating the result of a signed comparison. \\
+                      &
+\end{tabular}
+
+\chapter{Multiplication and Squaring}
+\section{The Multipliers}
+For most number theoretic systems including public key cryptographic algorithms the set of algorithms collectively known as the
+``multipliers'' form the most important subset of algorithms of any multiple precision integer package.  The set of multipliers 
+include multiplication, squaring and modular reduction algorithms.  
+
+The importance of these algorithms is driven by the fact that most popular public key algorithms are based on modular 
+exponentiation.  That is performing $d \equiv a^b \mbox{ (mod }c\mbox{)}$ for some arbitrary choice of $a$, $b$, $c$ and $d$.  Roughly
+speaking a modular exponentiation will spend about 40\% of the time in modular reductions, 35\% of the time in squaring and 25\% of
+the time in multiplications.  Only a small trivial amount of time is spent on lower level algorithms such as mp\_clamp, mp\_init, etc...
+
+This chapter will discuss only two of the multiplier algorithms, multiplication and squaring.  As will be discussed shortly very efficient
+multiplier algorithms are not always straightforward and deserve a lot of attention.
+
+\section{Multiplication}
+\subsection{The Baseline Multiplication}
+\index{baseline multiplication}
+Computing the product of two integers in software can be achieved using a trivial adaptation of the standard $O(n^2)$ long-hand multiplication
+algorithm school children are taught.  The ``baseline multiplication'' algorithm is designed to act as the ``catch-all'' algorithm only called
+when the faster algorithms cannot be used.  This algorithm does not use any particularly interesting optimizations.
+
+The first algorithm to review is the unsigned multiplication algorithm from which a signed multiplication algorithm can be established.  One important 
+facet of this algorithm to note is that it has been modified to only produce a certain amount of output digits as resolution.  Recall that for
+a $n$ and $m$ digit input the product will be at most $n + m + 1$ digits.  Therefore, this algorithm can be reduced to a full multiplier by
+telling it to produce $n + m + 1$ digits.  
+
+Recall from sub-section 5.2.2 the definition of $\gamma$ as the number of bits in the type \textbf{mp\_digit}.  We shall now extend this variable set to 
+include $\alpha$ which shall represent the number of bits in the type \textbf{mp\_word}.  This implies that $2^{\alpha} > 2 \cdot \beta^2$.  The 
+constant $\delta = 2^{\alpha - 2lg(\beta)}$ will represent the maximal weight of any column in a product (\textit{see sub-section 6.2.2 for more information}).
+
+\newpage\begin{figure}[!here]
+\begin{small}
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{s\_mp\_mul\_digs}. \\
+\textbf{Input}.   mp\_int $a$, mp\_int $b$ and an integer $digs$ \\
+\textbf{Output}.  $c \leftarrow \vert a \vert \cdot \vert b \vert \mbox{ (mod }\beta^{digs}\mbox{)}$. \\
+\hline \\
+1.  If min$(a.used, b.used) < \delta$ then do \\
+\hspace{3mm}1.1  Calculate $c = \vert a \vert \cdot \vert b \vert$ by the Comba method.  \\
+\hspace{3mm}1.2  Return the result of step 1.1 \\
+\\
+Allocate and initialize a temporary mp\_int. \\
+2.  Init $t$ to be of size $digs$ \\
+3.  If step 2 failed return(\textit{MP\_MEM}). \\
+4.  $t.used \leftarrow digs$ \\
+\\
+Compute the product. \\
+5.  for $ix$ from $0$ to $a.used - 1$ do \\
+\hspace{3mm}5.1  $u \leftarrow 0$ \\
+\hspace{3mm}5.2  $pb \leftarrow \mbox{min}(b.used, digs - ix)$ \\
+\hspace{3mm}5.3  If $pb < 1$ then goto step 6. \\
+\hspace{3mm}5.4  for $iy$ from $0$ to $pb - 1$ do \\
+\hspace{6mm}5.4.1  $\hat r \leftarrow t_{iy + ix} + a_{ix} \cdot b_{iy} + u$ \\
+\hspace{6mm}5.4.2  $t_{iy + ix} \leftarrow \hat r \mbox{ (mod }\beta\mbox{)}$ \\
+\hspace{6mm}5.4.3  $u \leftarrow \lfloor \hat r / \beta \rfloor$ \\
+\hspace{3mm}5.5  if $ix + iy < digs$ then do \\
+\hspace{6mm}5.5.1  $t_{ix + pb} \leftarrow u$ \\
+6.  Clamp excess digits of $t$. \\
+7.  Swap $c$ with $t$ \\
+8.  Clear $t$ \\
+9.  Return(\textit{MP\_OKAY}). \\
+\hline
+\end{tabular}
+\end{center}
+\end{small}
+\caption{Algorithm s\_mp\_mul\_digs}
+\end{figure}
+
+\textbf{Algorithm s\_mp\_mul\_digs.}
+This algorithm computes the unsigned product of two inputs $a$ and $b$ limited to an output precision of $digs$ digits.  While it may seem
+a bit awkward to modify the function from its simple $O(n^2)$ description the usefulness of partial multipliers will arise in a future 
+algorithm.  The algorithm is loosely based on algorithm 14.12 from \cite[pp. 595]{HAC} and is similar to Algorithm M \cite[pp. 268]{TAOCPV2}.  The
+algorithm differs from those cited references because it can produce a variable output precision regardless of the precision of the inputs.
+
+The first thing this algorithm checks for is whether a Comba multiplier can be used instead.   That is if the minimal digit count of either
+input is less than $\delta$ the Comba method is used.    After the Comba method is ruled out the baseline algorithm begins.  A 
+temporary mp\_int variable $t$ is used to hold the intermediate result of the product.  This allows the algorithm to be used to 
+compute products when either $a = c$ or $b = c$ without overwriting the inputs.  
+
+All of step 5 is the infamous $O(n^2)$ multiplication loop slightly modified to only produce up to $digs$ digits of output.  The $pb$ variable
+is given the count of digits to read from $b$ inside the nested loop.  If $pb < 1$ then no more output digits can be produced and the algorithm
+will exit the loop.  The best way to think of the loops is as a series of $pb \times 1$ multiplications.    That is, in each pass of the 
+innermost loop $a_{ix}$ is multiplied against $b$ and the result is added (\textit{with an appropriate shift}) to $t$.  
+
+For example, consider multiplying $576$ by $241$.  That is equivalent to computing $10^0(1)(576) + 10^1(4)(576) + 10^2(2)(576)$ which is best
+visualized as the following table.
+
+\begin{figure}[here]
+\begin{center}
+\begin{tabular}{|c|c|c|c|c|c|c|}
+\hline   &&          & 5 & 7 & 6 & \\
+\hline   $\times$&&  & 2 & 4 & 1 & \\
+\hline &&&&&&\\
+  &&          & 5 & 7 & 6 & $10^0(1)(576)$ \\
+  &2 &   3    & 0 & 4 & 0 & $10^1(4)(576)$ \\
+  1 & 1 & 5 & 2 & 0 & 0 &  $10^2(2)(576)$ \\
+\hline  
+\end{tabular}
+\end{center}
+\caption{Long-Hand Multiplication Diagram}
+\end{figure}
+
+Each row of the product is added to the result after being shifted to the left (\textit{multiplied by a power of the radix}) by the appropriate 
+count.  That is in pass $ix$ of the inner loop the product is added starting at the $ix$'th digit of the result.
+
+Step 5.4.1 introduces the hat symbol (\textit{e.g. $\hat x$}) which represents a double precision variable.  The multiplication on that step
+is assumed to be a double wide output single precision multiplication.  That is, two single precision variables are multiplied to produce a
+double precision result.  The step is somewhat optimized from a long-hand multiplication algorithm because the carry from the addition in step
+5.4.1 is forwarded through the nested loop.  If the carry was ignored it would overflow the single precision digit $t_{ix+iy}$ and the result
+would be lost.  
+
+At step 5.5 the nested loop is finished and any carry that was left over should be forwarded.  That is provided $ix + iy < digs$ otherwise the
+carry is ignored since it will not be part of the result anyways.  
+
+\index{bn\_s\_mp\_mul\_digs.c}
+\vspace{+3mm}\begin{small}
+\hspace{-5.1mm}{\bf File}: bn\_s\_mp\_mul\_digs.c
+\vspace{-3mm}
+\begin{alltt}
+016   
+017   /* multiplies |a| * |b| and only computes upto digs digits of result
+018    * HAC pp. 595, Algorithm 14.12  Modified so you can control how 
+019    * many digits of output are created.
+020    */
+021   int
+022   s_mp_mul_digs (mp_int * a, mp_int * b, mp_int * c, int digs)
+023   \{
+024     mp_int  t;
+025     int     res, pa, pb, ix, iy;
+026     mp_digit u;
+027     mp_word r;
+028     mp_digit tmpx, *tmpt, *tmpy;
+029   
+030     /* can we use the fast multiplier? */
+031     if (((digs) < MP_WARRAY) &&
+032         MIN (a->used, b->used) < 
+033             (1 << ((CHAR_BIT * sizeof (mp_word)) - (2 * DIGIT_BIT)))) \{
+034       return fast_s_mp_mul_digs (a, b, c, digs);
+035     \}
+036   
+037     if ((res = mp_init_size (&t, digs)) != MP_OKAY) \{
+038       return res;
+039     \}
+040     t.used = digs;
+041   
+042     /* compute the digits of the product directly */
+043     pa = a->used;
+044     for (ix = 0; ix < pa; ix++) \{
+045       /* set the carry to zero */
+046       u = 0;
+047   
+048       /* limit ourselves to making digs digits of output */
+049       pb = MIN (b->used, digs - ix);
+050   
+051       /* setup some aliases */
+052       /* copy of the digit from a used within the nested loop */
+053       tmpx = a->dp[ix];
+054       
+055       /* an alias for the destination shifted ix places */
+056       tmpt = t.dp + ix;
+057       
+058       /* an alias for the digits of b */
+059       tmpy = b->dp;
+060   
+061       /* compute the columns of the output and propagate the carry */
+062       for (iy = 0; iy < pb; iy++) \{
+063         /* compute the column as a mp_word */
+064         r = ((mp_word) *tmpt) + 
+065             ((mp_word) tmpx) * ((mp_word) * tmpy++) + 
+066             ((mp_word) u);
+067   
+068         /* the new column is the lower part of the result */
+069         *tmpt++ = (mp_digit) (r & ((mp_word) MP_MASK));
+070   
+071         /* get the carry word from the result */
+072         u = (mp_digit) (r >> ((mp_word) DIGIT_BIT));
+073       \}
+074       /* set carry if it is placed below digs */
+075       if (ix + iy < digs) \{
+076         *tmpt = u;
+077       \}
+078     \}
+079   
+080     mp_clamp (&t);
+081     mp_exch (&t, c);
+082   
+083     mp_clear (&t);
+084     return MP_OKAY;
+085   \}
+\end{alltt}
+\end{small}
+
+Lines 31 to 35 determine if the Comba method can be used first.  The conditions for using the Comba routine are that min$(a.used, b.used) < \delta$ and
+the number of digits of output is less than \textbf{MP\_WARRAY}.  This new constant is used to control the stack usage in the Comba routines.  By
+default it is set to $\delta$ but can be reduced when memory is at a premium.
+
+Of particular importance is the calculation of the $ix+iy$'th column on lines 64, 65 and 66.  Note how all of the
+variables are cast to the type \textbf{mp\_word}.  That is to ensure that double precision operations are used instead of single precision.  The
+multiplication on line 65 is a bit of a GCC optimization.  At the outset it looks like the compiler will have to use a double precision
+multiplication to produce the result required.  Such an operation would be horribly slow on most processors and drag this to a crawl.  However,
+GCC is smart enough to realize that double wide output single precision multipliers can be used.  For example, the instruction ``MUL'' on the
+x86 processor can multiply two 32-bit values and produce a 64-bit result.  
+
+\subsection{Faster Multiplication by the ``Comba'' Method}
+
+One of the huge drawbacks of the ``baseline'' algorithms is that at the $O(n^2)$ level the carry must be computed and propagated upwards.  This
+makes the nested loop very sequential and hard to unroll and implement in parallel.  The ``Comba'' method is named after the little known 
+(\textit{in cryptographic venues}) Paul G. Comba, who in \cite{COMBA} presented a method of implementing fast multipliers that do not require 
+nested carry fixup operations.
+
+At the heart of the algorithm is once again the long-hand algorithm for multiplication.  Except in this case a slight twist is placed on how
+the columns of the result are produced.  In the standard long-hand algorithm rows of products are produced then added together to form the 
+final result.  In the baseline algorithm the columns are added together to get the result instantaneously.  
+
+In the Comba algorithm however, the columns of the result are produced entirely independently of each other.  That is at the $O(n^2)$ level a 
+simple multiplication and addition step is performed.  Or more succinctly that 
+
+\begin{equation}
+x_n = \sum_{i+j = n} a_ib_j
+\end{equation}
+
+Where $x_n$ is the $n'th$ column of the output vector.  To see how this works consider once again multiplying $576$ by $241$.  
+
+\begin{figure}[here]
+\begin{small}
+\begin{center}
+\begin{tabular}{|c|c|c|c|c|c|}
+  \hline &          & 5 & 7 & 6 & First Input\\
+  \hline $\times$ & & 2 & 4 & 1 & Second Input\\
+\hline            &                        & $1 \cdot 5 = 5$   & $1 \cdot 7 = 7$   & $1 \cdot 6 = 6$ & First pass \\
+                  &  $4 \cdot 5 = 20$      & $4 \cdot 7+5=33$  & $4 \cdot 6+7=31$  & 6               & Second pass \\
+   $2 \cdot 5 = 10$ &  $2 \cdot 7 + 20 = 34$ & $2 \cdot 6+33=45$ & 31                & 6             & Third pass \\
+\hline 10 & 34 & 45 & 31 & 6 & Final Result \\   
+\hline   
+\end{tabular}
+\end{center}
+\end{small}
+\caption{Comba Multiplication Diagram}
+\end{figure}
+
+At this point the vector $x = \left < 10, 34, 45, 31, 6 \right >$ is the result of the first step of the Comba multiplier.  
+Now the columns must be fixed by propagating the carry upwards.  The following trivial algorithm will accomplish this.
+
+\begin{enumerate}
+    \item for $n$ from 0 to $k - 1$ do
+    \item \hspace{3mm} $x_{n+1} \leftarrow x_{n+1} + \lfloor x_{n}/\beta \rfloor$ 
+    \item \hspace{3mm} $x_{n} \leftarrow x_{n} \mbox{ (mod }\beta\mbox{)}$
+\end{enumerate}
+
+With that algorithm and $k = 5$ and $\beta = 10$ the following vector is produced $y = \left < 1, 3, 8, 8, 1, 6 \right >$.  In this case 
+$241 \cdot 576$ is in fact $138816$ and the procedure succeeded.  If the algorithm is correct and as will be demonstrated shortly more
+efficient than the baseline algorithm why not simply always use this algorithm?
+
+\subsubsection{Column Weight.}
+At the nested $O(n^2)$ level the Comba method adds the product of two single precision variables to each column of the output 
+independently.  A serious obstacle is if the carry is lost due to lack of precision before the algorithm has a chance to fix
+the carries.  For example, in the multiplication of two three-digit numbers the third column of output will be the sum of
+three single precision multiplications.  If the precision of the accumulator for the output digits is less than $3 \cdot (\beta - 1)^2$ then
+an overflow can occur and the carry information will be lost.  For any $m$ and $n$ digit input the maximal weight of any column is 
+min$(m, n)$ which is fairly obvious.
+
+The maximal number of terms in any column of a product is known as the ``column weight'' and strictly governs when the algorithm can be used.  Recall
+from earlier that a double precision type has $\alpha$ bits of resolution and a single precision digit has $lg(\beta)$ bits of precision.  Given these
+two quantities we may not violate the following
+
+\begin{equation}
+k \cdot \left (\beta - 1 \right )^2 < 2^{\alpha}
+\end{equation}
+
+Which reduces to 
+
+\begin{equation}
+k \cdot \left ( \beta^2 - 2\beta + 1 \right ) < 2^{\alpha}
+\end{equation}
+
+Let $\rho = lg(\beta)$ represent the number of bits in a single precision digit.  By further re-arrangement of the equation the final solution is
+found.
+
+\begin{equation}
+k \cdot \left (2^{2\rho} - 2^{\rho + 1} + 1 \right ) < 2^{\alpha}
+\end{equation}
+
+The defaults for LibTomMath are $\beta = 2^{28}$ and $\alpha = 64$, which simplifies to $72057593501057025 \cdot k < 2^{64}$ which when divided out
+results in $k < 257$.  This implies that the smallest input may not have more than $256$ digits if the Comba method is to be used in
+this configuration.  This is quite satisfactory for most applications since $256$ digits would allow for numbers in the range of $2^{7168}$ 
+which is much larger than the typical $2^{100}$ to $2^{4000}$ range most public key cryptographic algorithms use.  
+
+\newpage\begin{figure}[!here]
+\begin{small}
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{fast\_s\_mp\_mul\_digs}. \\
+\textbf{Input}.   mp\_int $a$, mp\_int $b$ and an integer $digs$ \\
+\textbf{Output}.  $c \leftarrow \vert a \vert \cdot \vert b \vert \mbox{ (mod }\beta^{digs}\mbox{)}$. \\
+\hline \\
+Place an array of \textbf{MP\_WARRAY} double precision digits named $\hat W$ on the stack. \\
+1.  If $c.alloc < digs$ then grow $c$ to $digs$ digits. (\textit{hint: use mp\_grow}) \\
+2.  If step 1 failed return(\textit{MP\_MEM}).\\
+\\
+Zero the temporary array $\hat W$. \\
+3.  for $n$ from $0$ to $digs - 1$ do \\
+\hspace{3mm}3.1  $\hat W_n \leftarrow 0$ \\
+\\
+Compute the columns. \\
+4.  for $ix$ from $0$ to $a.used - 1$ do \\
+\hspace{3mm}4.1  $pb \leftarrow \mbox{min}(b.used, digs - ix)$ \\
+\hspace{3mm}4.2  If $pb < 1$ then goto step 5. \\
+\hspace{3mm}4.3  for $iy$ from $0$ to $pb - 1$ do \\
+\hspace{6mm}4.3.1  $\hat W_{ix+iy} \leftarrow \hat W_{ix+iy} + a_{ix}b_{iy}$ \\
+\\
+Propagate the carries upwards. \\
+5.  $oldused \leftarrow c.used$ \\
+6.  $c.used \leftarrow digs$ \\
+7.  If $digs > 1$ then do \\
+\hspace{3mm}7.1.  for $ix$ from $1$ to $digs - 1$ do \\
+\hspace{6mm}7.1.1  $\hat W_{ix} \leftarrow \hat W_{ix} + \lfloor \hat W_{ix-1} / \beta \rfloor$ \\
+\hspace{6mm}7.1.2  $c_{ix - 1} \leftarrow \hat W_{ix - 1} \mbox{ (mod }\beta\mbox{)}$ \\
+8.  else do \\
+\hspace{3mm}8.1  $ix \leftarrow 0$ \\
+9.  $c_{ix} \leftarrow \hat W_{ix} \mbox{ (mod }\beta\mbox{)}$ \\
+\\
+Zero excess digits. \\
+10.  If $digs < oldused$ then do \\
+\hspace{3mm}10.1  for $n$ from $digs$ to $oldused - 1$ do \\
+\hspace{6mm}10.1.1  $c_n \leftarrow 0$ \\
+11.  Clamp excessive digits of $c$.  (\textit{hint: use mp\_clamp}) \\
+12.  Return(\textit{MP\_OKAY}). \\
+\hline
+\end{tabular}
+\end{center}
+\end{small}
+\caption{Algorithm fast\_s\_mp\_mul\_digs}
+\end{figure}
+
+\textbf{Algorithm fast\_s\_mp\_mul\_digs.}
+This algorithm performs the unsigned multiplication of $a$ and $b$ using the Comba method limited to $digs$ digits of precision.  The algorithm
+essentially performs the same calculation as algorithm s\_mp\_mul\_digs but much faster.
+
+The array $\hat W$ is meant to be on the stack when the algorithm is used.  The size of the array does not change which is ideal.  Note also that 
+unlike algorithm s\_mp\_mul\_digs no temporary mp\_int is required since the result is calculated in place in $\hat W$.  
+
+The $O(n^2)$ loop on step four is where the Comba method starts to show through.  First there is no carry variable in the loop.  Second the
+double precision multiply and add step does not have a carry fixup of any sort.  In fact the nested loop is very simple and can be implemented
+in parallel.  
+
+What makes the Comba method so attractive is that the carry propagation only takes place outside the $O(n^2)$ nested loop.  For example, if the 
+cost in terms of time of a multiply and add is $p$ and the cost of a carry propagation is $q$ then a baseline multiplication would require 
+$O \left ((p + q)n^2 \right )$ time to multiply two $n$-digit numbers.  The Comba method only requires $pn^2 + qn$ time, however, in practice 
+the speed increase is actually much more.  With $O(n)$ space the algorithm can be reduced to $O(pn + qn)$ time by implementing the $n$ multiply
+and add operations in the nested loop in parallel.  
+
+The carry propagation loop on step 7 is fairly straightforward.  It could have been written phased the other direction, that is, to assign
+to $c_{ix}$ instead of $c_{ix-1}$ in each iteration.  However, it would still require precaution to make sure that $\hat W_{ix+1}$ is not beyond
+the \textbf{MP\_WARRAY} words set aside.  
+
+\index{bn\_fast\_s\_mp\_mul\_digs.c}
+\vspace{+3mm}\begin{small}
+\hspace{-5.1mm}{\bf File}: bn\_fast\_s\_mp\_mul\_digs.c
+\vspace{-3mm}
+\begin{alltt}
+016   
+017   /* Fast (comba) multiplier
+018    *
+019    * This is the fast column-array [comba] multiplier.  It is 
+020    * designed to compute the columns of the product first 
+021    * then handle the carries afterwards.  This has the effect 
+022    * of making the nested loops that compute the columns very
+023    * simple and schedulable on super-scalar processors.
+024    *
+025    * This has been modified to produce a variable number of 
+026    * digits of output so if say only a half-product is required 
+027    * you don't have to compute the upper half (a feature 
+028    * required for fast Barrett reduction).
+029    *
+030    * Based on Algorithm 14.12 on pp.595 of HAC.
+031    *
+032    */
+033   int
+034   fast_s_mp_mul_digs (mp_int * a, mp_int * b, mp_int * c, int digs)
+035   \{
+036     int     olduse, res, pa, ix;
+037     mp_word W[MP_WARRAY];
+038   
+039     /* grow the destination as required */
+040     if (c->alloc < digs) \{
+041       if ((res = mp_grow (c, digs)) != MP_OKAY) \{
+042         return res;
+043       \}
+044     \}
+045   
+046     /* clear temp buf (the columns) */
+047     memset (W, 0, sizeof (mp_word) * digs);
+048   
+049     /* calculate the columns */
+050     pa = a->used;
+051     for (ix = 0; ix < pa; ix++) \{
+052       /* this multiplier has been modified to allow you to 
+053        * control how many digits of output are produced.  
+054        * So at most we want to make upto "digs" digits of output.
+055        *
+056        * this adds products to distinct columns (at ix+iy) of W
+057        * note that each step through the loop is not dependent on
+058        * the previous which means the compiler can easily unroll
+059        * the loop without scheduling problems
+060        */
+061       \{
+062         register mp_digit tmpx, *tmpy;
+063         register mp_word *_W;
+064         register int iy, pb;
+065   
+066         /* alias for the the word on the left e.g. A[ix] * A[iy] */
+067         tmpx = a->dp[ix];
+068   
+069         /* alias for the right side */
+070         tmpy = b->dp;
+071   
+072         /* alias for the columns, each step through the loop adds a new
+073            term to each column
+074          */
+075         _W = W + ix;
+076   
+077         /* the number of digits is limited by their placement.  E.g.
+078            we avoid multiplying digits that will end up above the # of
+079            digits of precision requested
+080          */
+081         pb = MIN (b->used, digs - ix);
+082   
+083         for (iy = 0; iy < pb; iy++) \{
+084           *_W++ += ((mp_word) tmpx) * ((mp_word) * tmpy++);
+085         \}
+086       \}
+087   
+088     \}
+089   
+090     /* setup dest */
+091     olduse = c->used;
+092     c->used = digs;
+093   
+094     \{
+095       register mp_digit *tmpc;
+096   
+097       /* At this point W[] contains the sums of each column.  To get the
+098        * correct result we must take the extra bits from each column and
+099        * carry them down
+100        *
+101        * Note that while this adds extra code to the multiplier it 
+102        * saves time since the carry propagation is removed from the 
+103        * above nested loop.This has the effect of reducing the work 
+104        * from N*(N+N*c)==N**2 + c*N**2 to N**2 + N*c where c is the 
+105        * cost of the shifting.  On very small numbers this is slower 
+106        * but on most cryptographic size numbers it is faster.
+107        */
+108       tmpc = c->dp;
+109       for (ix = 1; ix < digs; ix++) \{
+110         W[ix] += (W[ix - 1] >> ((mp_word) DIGIT_BIT));
+111         *tmpc++ = (mp_digit) (W[ix - 1] & ((mp_word) MP_MASK));
+112       \}
+113       *tmpc++ = (mp_digit) (W[digs - 1] & ((mp_word) MP_MASK));
+114   
+115       /* clear unused */
+116       for (; ix < olduse; ix++) \{
+117         *tmpc++ = 0;
+118       \}
+119     \}
+120   
+121     mp_clamp (c);
+122     return MP_OKAY;
+123   \}
+\end{alltt}
+\end{small}
+
+The memset on line 47 clears the initial $\hat W$ array to zero in a single step. Like the slower baseline multiplication
+implementation a series of aliases (\textit{lines 67, 70 and 75}) are used to simplify the inner $O(n^2)$ loop.  
+In this case a new alias $\_\hat W$ has been added which refers to the double precision columns offset by $ix$ in each pass.  
+
+The inner loop on line 84 is where the algorithm will spend the majority of the time.  Which is why it has been stripped to the 
+bones of any extra baggage\footnote{Hence the pointer aliases.}.  On x86 processors the multiply and add amounts to at the very least five
+instructions (\textit{two loads, two additions, one multiply}) while on the ARMv4 processors it amounts to only three (\textit{one load, one store,
+one multiply-add}).   On both the x86 and ARMv4 processors GCC v3.2 does a very good job at unrolling the loop and scheduling it so there 
+are very few dependency stalls.
+
+In theory the difference between the baseline and comba algorithms is a mere $O(qn)$ time difference.  However, in the $O(n^2)$ nested loop of the
+baseline method there are dependency stalls as the algorithm must wait for the multiplier to finish before propagating the carry to the next 
+digit.  As a result fewer of the often multiple execution units\footnote{The AMD Athlon has three execution units and the Intel P4 has four.} can
+be simultaneously used.  
+
+\subsection{Multiplication at New Bounds by Karatsuba Method}
+So far two methods of multiplication have been presented.  Both of the algorithms require asymptotically $O(n^2)$ time to multiply two $n$-digit 
+numbers together.  While the Comba method is much faster than the baseline algorithm it still requires far too much time to multiply 
+large inputs together.  In fact it was not until \cite{KARA} in 1962 that a faster algorithm had been proposed at all.
+
+The idea behind Karatsuba's method is that an input can be represented in polynomial basis as two halves then multiplied.  For example, if 
+$f(x) = ax + b$ and $g(x) = cx + d$ then the product of the two polynomials $h(x) = f(x)g(x)$ will allow $h(\beta) = (f(\beta))(g(\beta))$.  
+
+So how does this help?  First expand the product $h(x)$.
+
+\begin{center}
+\begin{tabular}{rcl}
+$h(x)$ & $=$ & $f(x)g(x)$ \\
+       & $=$ & $(ax + b)(cx + d)$ \\
+       & $=$ & $acx^2 + adx + bcx + bd$ \\
+\end{tabular}
+\end{center}
+
+The next equation is a bit of genius on the part of Karatsuba.  He proved that the previous equation is equivalent to 
+
+\begin{equation}
+h(x) = acx^2 + (ac + bd - (a - b)(c - d))x + bd
+\end{equation}
+
+Essentially the proof lies in some fairly light algebraic number theory (\textit{see \cite{KARAP} for details}) that is not important for
+the discussion.  At first glance it appears that the Karatsuba method is actually harder than the straight $O(n^2)$ approach.  
+However, further investigation will prove otherwise.  
+
+The first important observation is that both $f(x)$ and $g(x)$ are the polynomial basis representation of two-digit numbers.  This means that 
+$\left < a, b, c, d \right >$ are single digit values.  Using either the baseline or straight polynomial multiplication the old method requires
+$O \left (4(n/2)^2 \right ) = O(n^2)$ single precision multiplications.  Looking closer at Karatsuba's equation there are only three unique multiplications 
+required which are $ac$, $bd$ and $(a - b)(c - d)$.  As a result only $O \left (3 \cdot (n/2)^2 \right ) = O \left ( {3 \over 4}n^2 \right )$ 
+multiplications are required.  
+
+So far the algorithm has been discussed from the point of view of ``two-digit'' numbers.  However, there is no reason why two digits implies a range of 
+$\beta^2$.  It could just as easily represent a range of $\left (\beta^k \right)^2$ as well.  For example, the polynomial 
+$f(x) = a_3x^3 + a_2x^2 + a_1x + a_0$ could also be written as $f'(x) = a'_1x + a'_0$ where $f(\beta) = f'(\beta^2)$.  Fortunately representing an
+integer which is already in an array of radix-$\beta$ digits in polynomial basis in terms of a power of $\beta$ is very simple.  
+
+\subsubsection{Recursion}
+The Karatsuba multiplication algorithm can be applied to practically any size of input.  Therefore, it is possible that the Karatsuba method itself
+be used for the three multiplications required.  For example, when multiplying two four-digit numbers there will be three multiplications of two-digit
+numbers.  In this case the smaller multiplication requires $p(n) = {3 \over 4}n^2$ time to complete while the larger multiplication requires
+$q(n) = 3 \cdot p(n/2)$ multiplications.  
+
+By expanding $q(n)$ the following equation is achieved. 
+
+\begin{center}
+\begin{tabular}{rcl}
+$q(n)$ & $=$ & $3 \cdot p(n/2)$ \\
+       & $=$ & $3 \cdot (3 \cdot ((n/2)/2)^2)$ \\
+       & $=$ & $9 \cdot (n/4)^2$ \\
+       & $=$ & ${9 \over 16}n^2$ \\
+\end{tabular}
+\end{center}
+
+The generic expression for the multiplicand is simply $\left ( {3 \over 4} \right )^k$ for $k \ge 1$ recursions.  The maximal number of recursions
+is approximately $lg(n)$.  Putting this all in terms of a base $n$ logarithm the asymptotic running time can be deduced.
+
+\begin{center}
+\begin{tabular}{rcl}
+$lg_n \left ( \left ( {3 \over 4} \right )^{lg_2 n} \cdot n^2 \right )$ & $=$ & $lg_2 n \cdot lg_n \left ( { 3 \over 4 } \right ) + 2$ \\
+                                                                        & $=$ & $\left ( {log n \over log 2} \right ) \cdot \left ( {log \left ( {3 \over 4} \right ) \over log n } \right ) + 2$ \\
+                                                                        & $=$ & ${ log 3 - log 2^2 + 2 \cdot log 2} \over log 2$ \\
+                                                                        & $=$ & $log 3 \over log 2$ \\
+\end{tabular}
+\end{center}
+
+Which leads to a running time of $O \left ( n^{lg(3)} \right )$ which is approximately $O(n^{1.584})$.  This can lead to 
+impressive savings with fairly moderate sized numbers.  For example, when multiplying two 128-digit numbers the Karatsuba 
+method saves $14,197$ (\textit{or $86\%$ of the total}) single precision multiplications.  
+
+The immediate question becomes why not simply use Karatsuba multiplication all the time and forget about the baseline and Comba algorithms? 
+
+\subsubsection{Overhead}
+While the Karatsuba method saves on the number of single precision multiplications required this savings is not entirely free.  The three
+half size products must be stored somewhere and three additions and three subtractions must also be performed.  These operations incur sufficient
+overhead that often for fairly trivial sized inputs the Karatsuba method is slower.
+
+\index{cutoff point}
+The \textit{cutoff point} for Karatsuba multiplication is the point at which the Karatsuba multiplication and baseline (\textit{or Comba}) meet.  
+For the purposes of this discussion call this value $x$.  For any input with $n$ digits such that $n < x$ Karatsuba multiplication will be slower 
+and for $n > x$ it will be faster.  Often the break between the two algorithms is not so clean cut in reality.  The cleaner the cut the more 
+efficient multiplication will be which is why tuning the multiplication is a very important process.  For example, a properly tuned Karatsuba 
+multiplication algorithm can multiply two $4,096$ bit numbers up to five times faster on an Athlon processor compared to the standard baseline
+algorithm.  
+
+The exact placement of the value of $x$ depends on several key factors: the cost of allocating storage for the temporary variables, the cost of 
+performing the additions and most importantly the cost of performing a single precision multiplication.  With a processor where single precision 
+multiplication is fast\footnote{The AMD Athlon for instance has a six cycle multiplier compared to the Intel P4 which has a 15 cycle multiplier.} the 
+cutoff point will move upwards.  Similarly with a slower processor the cutoff point will move downwards.  
+
+\newpage\begin{figure}[!here]
+\begin{small}
+\begin{center}
+\begin{tabular}{l}
+\hline Algorithm \textbf{mp\_karatsuba\_mul}. \\
+\textbf{Input}.   mp\_int $a$ and mp\_int $b$ \\
+\textbf{Output}.  $c \leftarrow \vert a \vert \cdot \vert b \vert$ \\
+\hline \\
+1.  $B \leftarrow \mbox{min}(a.used, b.used)/2$ \\
+2.  Init the following mp\_int variables: $x0$, $x1$, $y0$, $y1$, $t1$, $x0y0$, $x1y1$.\\
+3.  If step 2 failed then return(\textit{MP\_MEM}). \\
+\\
+Split the input.  e.g. $a = x1 \cdot \beta^B + x0$ \\
+4.  $x0 \leftarrow a \mbox{ (mod }\beta^B\mbox{)}$ (\textit{hint: use mp\_mod\_2d}) \\
+5.  $y0 \leftarrow b \mbox{ (mod }\beta^B\mbox{)}$ \\
+6.  $x1 \leftarrow \lfloor a / \beta^B \rfloor$ (\textit{hint: use mp\_rshd}) \\
+7.  $y1 \leftarrow \lfloor b / \beta^B \rfloor$ \\
+\\
+Calculate the three products. \\
+8.  $x0y0 \leftarrow x0 \cdot y0$ (\textit{hint: use mp\_mul}) \\
+9.  $x1y1 \leftarrow x1 \cdot y1$ \\
+10.  $t1 \leftarrow x1 - x0$ (\textit{hint: use mp\_sub}) \\
+11.  $x0 \leftarrow y1 - y0$ \\
+12.  $t1 \leftarrow t1 \cdot x0$ \\
+\\
+Calculate the middle term. \\
+13.  $x0 \leftarrow x0y0 + x1y1$ \\
+14.  $t1 \leftarrow x0 - t1$ \\
+\\
+Calculate the final product. \\
+15.  $t1 \leftarrow t1 \cdot \beta^B$ (\textit{hint: use mp\_lshd}) \\
+16.  $x1y1 \leftarrow x1y1 \cdot \beta^{2B}$ \\
+17.  $t1 \leftarrow x0y0 + t1$ \\
+18.  $c \leftarrow t1 + x1y1$ \\
+19.  Clear all of the temporary variables. \\
+20.  Return(\textit{MP\_OKAY}).\\
+\hline 
+\end{tabular}
+\end{center}
+\end{small}
+\caption{Algorithm mp\_karatsuba\_mul}
+\end{figure}
+
+\textbf{Algorithm mp\_karatsuba\_mul.}
+
+
+\section{Squaring}
+\subsection{The Baseline Squaring Algorithm}
+\subsection{Faster Squaring by the ``Comba'' Method}
+\subsection{Karatsuba Squaring}
+\section{Tuning Algorithms}
+\subsection{How to Tune Karatsuba Algorithms}
+
+\chapter{Modular Reductions}
+\section{Basics of Modular Reduction}
+\section{The Barrett Reduction}
+\section{The Montgomery Reduction}
+\subsection{Faster ``Comba'' Montgomery Reduction}
+\subsection{Example Montgomery Algorithms}
+\section{The Diminished Radix Algorithm}
+\section{Algorithm Comparison}
+
+\chapter{Exponentiation}
+\section{Single Digit Exponentiation}
+\section{Modular Exponentiation}
+\subsection{General Case}
+\subsection{Odd or Diminished Radix Moduli}
+\section{Quick Power of Two}
+
+\chapter{Higher Level Algorithms}
+\section{Integer Division with Remainder}
+\section{Single Digit Helpers}
+\subsection{Single Digit Addition}
+\subsection{Single Digit Subtraction}
+\subsection{Single Digit Multiplication}
+\subsection{Single Digit Division}
+\subsection{Single Digit Modulo}
+\subsection{Single Digit Root Extraction}
+\section{Random Number Generation}
+\section{Formatted Output}
+\subsection{Getting The Output Size}
+\subsection{Generating Radix-n Output}
+\subsection{Reading Radix-n Input}
+\section{Unformatted Output}
+\subsection{Getting The Output Size}
+\subsection{Generating Output}
+\subsection{Reading Input}
+
+\chapter{Number Theoretic Algorithms}
+\section{Greatest Common Divisor}
+\section{Least Common Multiple}
+\section{Jacobi Symbol Computation}
+\section{Modular Inverse}
+\subsection{General Case}
+\subsection{Odd Moduli}
+\section{Primality Tests}
+\subsection{Trial Division}
+\subsection{The Fermat Test}
+\subsection{The Miller-Rabin Test}
+\subsection{Primality Test in a Bottle}
+\subsection{The Next Prime}
+\section{Root Extraction}
+
+\backmatter
+\appendix
+\begin{thebibliography}{ABCDEF}
+\bibitem[1]{TAOCPV2}
+Donald Knuth, \textit{The Art of Computer Programming}, Third Edition, Volume Two, Seminumerical Algorithms, Addison-Wesley, 1998
+
+\bibitem[2]{HAC}
+A. Menezes, P. van Oorschot, S. Vanstone, \textit{Handbook of Applied Cryptography}, CRC Press, 1996
+
+\bibitem[3]{ROSE}
+Michael Rosing, \textit{Implementing Elliptic Curve Cryptography}, Manning Publications, 1999
+
+\bibitem[4]{COMBA}
+Paul G. Comba, \textit{Exponentiation Cryptosystems on the IBM PC}. IBM Systems Journal 29(4): 526-538 (1990)
+
+\bibitem[5]{KARA}
+A. Karatsuba, Doklady Akad. Nauk SSSR 145 (1962), pp.293-294
+
+\bibitem[6]{KARAP}
+Andre Weimerskirch and Christof Paar, \textit{Generalizations of the Karatsuba Algorithm for Polynomial Multiplication}, Submitted to Design, Codes and Cryptography, March 2002
+
+\end{thebibliography}
+
+\input{tommath.ind}
+
+\chapter{Appendix}
+\subsection*{Appendix A -- Source Listing of tommath.h}
+
+The following is the source listing of the header file ``tommath.h'' for the LibTomMath project.  It contains many of 
+the definitions used throughout the code such as \textbf{mp\_int}, \textbf{MP\_PREC} and so on.  The header is 
+presented here for completeness.
+
+\index{tommath.h}
+\vspace{+3mm}\begin{small}
+\hspace{-5.1mm}{\bf File}: tommath.h
+\vspace{-3mm}
+\begin{alltt}
+001   /* LibTomMath, multiple-precision integer library -- Tom St Denis
+002    *
+003    * LibTomMath is library that provides for multiple-precision
+004    * integer arithmetic as well as number theoretic functionality.
+005    *
+006    * The library is designed directly after the MPI library by
+007    * Michael Fromberger but has been written from scratch with
+008    * additional optimizations in place.
+009    *
+010    * The library is free for all purposes without any express
+011    * guarantee it works.
+012    *
+013    * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+014    */
+015   #ifndef BN_H_
+016   #define BN_H_
+017   
+018   #include <stdio.h>
+019   #include <string.h>
+020   #include <stdlib.h>
+021   #include <ctype.h>
+022   #include <limits.h>
+023   
+024   #undef MIN
+025   #define MIN(x,y) ((x)<(y)?(x):(y))
+026   #undef MAX
+027   #define MAX(x,y) ((x)>(y)?(x):(y))
+028   
+029   #ifdef __cplusplus
+030   extern "C" \{
+031   
+032   /* C++ compilers don't like assigning void * to mp_digit * */
+033   #define  OPT_CAST  (mp_digit *)
+034   
+035   #else
+036   
+037   /* C on the other hand doesn't care */
+038   #define  OPT_CAST
+039   
+040   #endif
+041   
+042   /* some default configurations.
+043    *
+044    * A "mp_digit" must be able to hold DIGIT_BIT + 1 bits
+045    * A "mp_word" must be able to hold 2*DIGIT_BIT + 1 bits
+046    *
+047    * At the very least a mp_digit must be able to hold 7 bits
+048    * [any size beyond that is ok provided it doesn't overflow the data type]
+049    */
+050   #ifdef MP_8BIT
+051      typedef unsigned char      mp_digit;
+052      typedef unsigned short     mp_word;
+053   #elif defined(MP_16BIT)
+054      typedef unsigned short     mp_digit;
+055      typedef unsigned long      mp_word;
+056   #elif defined(MP_64BIT)
+057      /* for GCC only on supported platforms */
+058   #ifndef CRYPT
+059      typedef unsigned long long ulong64;
+060      typedef signed long long   long64;
+061   #endif
+062   
+063      typedef ulong64            mp_digit;
+064      typedef unsigned long      mp_word __attribute__ ((mode(TI)));
+065   
+066      #define DIGIT_BIT          60
+067   #else
+068      /* this is the default case, 28-bit digits */
+069      
+070      /* this is to make porting into LibTomCrypt easier :-) */
+071   #ifndef CRYPT
+072      #ifdef _MSC_VER
+073         typedef unsigned __int64   ulong64;
+074         typedef signed __int64     long64;
+075      #else
+076         typedef unsigned long long ulong64;
+077         typedef signed long long   long64;
+078      #endif
+079   #endif
+080   
+081      typedef unsigned long      mp_digit;
+082      typedef ulong64            mp_word;
+083   
+084      #define DIGIT_BIT          28
+085   #endif
+086   
+087   /* otherwise the bits per digit is calculated automatically from the size of
+       a mp_digit */
+088   #ifndef DIGIT_BIT
+089      #define DIGIT_BIT     ((CHAR_BIT * sizeof(mp_digit) - 1))  /* bits per di
+      git */
+090   #endif
+091   
+092   
+093   #define MP_DIGIT_BIT     DIGIT_BIT
+094   #define MP_MASK          ((((mp_digit)1)<<((mp_digit)DIGIT_BIT))-((mp_digit)
+      1))
+095   #define MP_DIGIT_MAX     MP_MASK
+096   
+097   /* equalities */
+098   #define MP_LT        -1   /* less than */
+099   #define MP_EQ         0   /* equal to */
+100   #define MP_GT         1   /* greater than */
+101   
+102   #define MP_ZPOS       0   /* positive integer */
+103   #define MP_NEG        1   /* negative */
+104   
+105   #define MP_OKAY       0   /* ok result */
+106   #define MP_MEM        -2  /* out of mem */
+107   #define MP_VAL        -3  /* invalid input */
+108   #define MP_RANGE      MP_VAL
+109   
+110   typedef int           mp_err;
+111   
+112   /* you'll have to tune these... */
+113   extern int KARATSUBA_MUL_CUTOFF,
+114              KARATSUBA_SQR_CUTOFF,
+115              MONTGOMERY_EXPT_CUTOFF;
+116   
+117   /* various build options */
+118   #define MP_PREC                 64      /* default digits of precision (must
+       be power of two) */
+119   
+120   /* define this to use lower memory usage routines (exptmods mostly) */
+121   /* #define MP_LOW_MEM */
+122   
+123   /* size of comba arrays, should be at least 2 * 2**(BITS_PER_WORD - BITS_PER
+      _DIGIT*2) */
+124   #define MP_WARRAY               (1 << (sizeof(mp_word) * CHAR_BIT - 2 * DIGI
+      T_BIT + 1))
+125   
+126   typedef struct  \{
+127       int used, alloc, sign;
+128       mp_digit *dp;
+129   \} mp_int;
+130   
+131   #define USED(m)    ((m)->used)
+132   #define DIGIT(m,k) ((m)->dp[k])
+133   #define SIGN(m)    ((m)->sign)
+134   
+135   /* ---> init and deinit bignum functions <--- */
+136   
+137   /* init a bignum */
+138   int mp_init(mp_int *a);
+139   
+140   /* free a bignum */
+141   void mp_clear(mp_int *a);
+142   
+143   /* init a null terminated series of arguments */
+144   int mp_init_multi(mp_int *mp, ...);
+145   
+146   /* clear a null terminated series of arguments */
+147   void mp_clear_multi(mp_int *mp, ...);
+148   
+149   /* exchange two ints */
+150   void mp_exch(mp_int *a, mp_int *b);
+151   
+152   /* shrink ram required for a bignum */
+153   int mp_shrink(mp_int *a);
+154   
+155   /* grow an int to a given size */
+156   int mp_grow(mp_int *a, int size);
+157   
+158   /* init to a given number of digits */
+159   int mp_init_size(mp_int *a, int size);
+160   
+161   /* ---> Basic Manipulations <--- */
+162   
+163   #define mp_iszero(a) (((a)->used == 0) ? 1 : 0)
+164   #define mp_iseven(a) (((a)->used == 0 || (((a)->dp[0] & 1) == 0)) ? 1 : 0)
+165   #define mp_isodd(a)  (((a)->used > 0 && (((a)->dp[0] & 1) == 1)) ? 1 : 0)
+166   
+167   /* set to zero */
+168   void mp_zero(mp_int *a);
+169   
+170   /* set to a digit */
+171   void mp_set(mp_int *a, mp_digit b);
+172   
+173   /* set a 32-bit const */
+174   int mp_set_int(mp_int *a, unsigned int b);
+175   
+176   /* copy, b = a */
+177   int mp_copy(mp_int *a, mp_int *b);
+178   
+179   /* inits and copies, a = b */
+180   int mp_init_copy(mp_int *a, mp_int *b);
+181   
+182   /* trim unused digits */
+183   void mp_clamp(mp_int *a);
+184   
+185   /* ---> digit manipulation <--- */
+186   
+187   /* right shift by "b" digits */
+188   void mp_rshd(mp_int *a, int b);
+189   
+190   /* left shift by "b" digits */
+191   int mp_lshd(mp_int *a, int b);
+192   
+193   /* c = a / 2**b */
+194   int mp_div_2d(mp_int *a, int b, mp_int *c, mp_int *d);
+195   
+196   /* b = a/2 */
+197   int mp_div_2(mp_int *a, mp_int *b);
+198   
+199   /* c = a * 2**b */
+200   int mp_mul_2d(mp_int *a, int b, mp_int *c);
+201   
+202   /* b = a*2 */
+203   int mp_mul_2(mp_int *a, mp_int *b);
+204   
+205   /* c = a mod 2**b */
+206   int mp_mod_2d(mp_int *a, int b, mp_int *c);
+207   
+208   /* computes a = 2**b */
+209   int mp_2expt(mp_int *a, int b);
+210   
+211   /* makes a pseudo-random int of a given size */
+212   int mp_rand(mp_int *a, int digits);
+213   
+214   /* ---> binary operations <--- */
+215   /* c = a XOR b  */
+216   int mp_xor(mp_int *a, mp_int *b, mp_int *c);
+217   
+218   /* c = a OR b */
+219   int mp_or(mp_int *a, mp_int *b, mp_int *c);
+220   
+221   /* c = a AND b */
+222   int mp_and(mp_int *a, mp_int *b, mp_int *c);
+223   
+224   /* ---> Basic arithmetic <--- */
+225   
+226   /* b = -a */
+227   int mp_neg(mp_int *a, mp_int *b);
+228   
+229   /* b = |a| */
+230   int mp_abs(mp_int *a, mp_int *b);
+231   
+232   /* compare a to b */
+233   int mp_cmp(mp_int *a, mp_int *b);
+234   
+235   /* compare |a| to |b| */
+236   int mp_cmp_mag(mp_int *a, mp_int *b);
+237   
+238   /* c = a + b */
+239   int mp_add(mp_int *a, mp_int *b, mp_int *c);
+240   
+241   /* c = a - b */
+242   int mp_sub(mp_int *a, mp_int *b, mp_int *c);
+243   
+244   /* c = a * b */
+245   int mp_mul(mp_int *a, mp_int *b, mp_int *c);
+246   
+247   /* b = a*a  */
+248   int mp_sqr(mp_int *a, mp_int *b);
+249   
+250   /* a/b => cb + d == a */
+251   int mp_div(mp_int *a, mp_int *b, mp_int *c, mp_int *d);
+252   
+253   /* c = a mod b, 0 <= c < b  */
+254   int mp_mod(mp_int *a, mp_int *b, mp_int *c);
+255   
+256   /* ---> single digit functions <--- */
+257   
+258   /* compare against a single digit */
+259   int mp_cmp_d(mp_int *a, mp_digit b);
+260   
+261   /* c = a + b */
+262   int mp_add_d(mp_int *a, mp_digit b, mp_int *c);
+263   
+264   /* c = a - b */
+265   int mp_sub_d(mp_int *a, mp_digit b, mp_int *c);
+266   
+267   /* c = a * b */
+268   int mp_mul_d(mp_int *a, mp_digit b, mp_int *c);
+269   
+270   /* a/b => cb + d == a */
+271   int mp_div_d(mp_int *a, mp_digit b, mp_int *c, mp_digit *d);
+272   
+273   /* c = a**b */
+274   int mp_expt_d(mp_int *a, mp_digit b, mp_int *c);
+275   
+276   /* c = a mod b, 0 <= c < b  */
+277   int mp_mod_d(mp_int *a, mp_digit b, mp_digit *c);
+278   
+279   /* ---> number theory <--- */
+280   
+281   /* d = a + b (mod c) */
+282   int mp_addmod(mp_int *a, mp_int *b, mp_int *c, mp_int *d);
+283   
+284   /* d = a - b (mod c) */
+285   int mp_submod(mp_int *a, mp_int *b, mp_int *c, mp_int *d);
+286   
+287   /* d = a * b (mod c) */
+288   int mp_mulmod(mp_int *a, mp_int *b, mp_int *c, mp_int *d);
+289   
+290   /* c = a * a (mod b) */
+291   int mp_sqrmod(mp_int *a, mp_int *b, mp_int *c);
+292   
+293   /* c = 1/a (mod b) */
+294   int mp_invmod(mp_int *a, mp_int *b, mp_int *c);
+295   
+296   /* c = (a, b) */
+297   int mp_gcd(mp_int *a, mp_int *b, mp_int *c);
+298   
+299   /* c = [a, b] or (a*b)/(a, b) */
+300   int mp_lcm(mp_int *a, mp_int *b, mp_int *c);
+301   
+302   /* finds one of the b'th roots of a, such that |c|**b <= |a|
+303    *
+304    * returns error if a < 0 and b is even
+305    */
+306   int mp_n_root(mp_int *a, mp_digit b, mp_int *c);
+307   
+308   /* shortcut for square root */
+309   #define mp_sqrt(a, b) mp_n_root(a, 2, b)
+310   
+311   /* computes the Jacobi symbol c = (a | n) (or Legendre if n is prime)  */
+312   int mp_jacobi(mp_int *a, mp_int *n, int *c);
+313   
+314   /* used to set up the Barrett reduction for a given modulus b */
+315   int mp_reduce_setup(mp_int *a, mp_int *b);
+316   
+317   /* Barrett Reduction, computes a (mod b) with a precomputed value c
+318    *
+319    * Assumes that 0 < a <= b*b; note that if 0 > a > -(b*b) then you can merely
+320    * compute the reduction as -1 * mp_reduce(mp_abs(a)) [pseudo code].
+321    */
+322   int mp_reduce(mp_int *a, mp_int *b, mp_int *c);
+323   
+324   /* sets up the Montgomery reduction */
+325   int mp_montgomery_setup(mp_int *a, mp_digit *mp);
+326   
+327   /* computes a = B**n mod b without division or multiplication, useful for
+328    * normalizing numbers in a Montgomery system.
+329    */
+330   int mp_montgomery_calc_normalization(mp_int *a, mp_int *b);
+331   
+332   /* computes x/R == x (mod N) via Montgomery Reduction */
+333   int mp_montgomery_reduce(mp_int *a, mp_int *m, mp_digit mp);
+334   
+335   /* returns 1 if a is a valid DR modulus */
+336   int mp_dr_is_modulus(mp_int *a);
+337   
+338   /* sets the value of "d" required for mp_dr_reduce */
+339   void mp_dr_setup(mp_int *a, mp_digit *d);
+340   
+341   /* reduces a modulo b using the Diminished Radix method */
+342   int mp_dr_reduce(mp_int *a, mp_int *b, mp_digit mp);
+343   
+344   /* d = a**b (mod c) */
+345   int mp_exptmod(mp_int *a, mp_int *b, mp_int *c, mp_int *d);
+346   
+347   /* ---> Primes <--- */
+348   
+349   /* number of primes */
+350   #ifdef MP_8BIT
+351      #define PRIME_SIZE      31
+352   #else
+353      #define PRIME_SIZE      256
+354   #endif
+355   
+356   /* table of first PRIME_SIZE primes */
+357   extern const mp_digit __prime_tab[];
+358   
+359   /* result=1 if a is divisible by one of the first PRIME_SIZE primes */
+360   int mp_prime_is_divisible(mp_int *a, int *result);
+361   
+362   /* performs one Fermat test of "a" using base "b".
+363    * Sets result to 0 if composite or 1 if probable prime
+364    */
+365   int mp_prime_fermat(mp_int *a, mp_int *b, int *result);
+366   
+367   /* performs one Miller-Rabin test of "a" using base "b".
+368    * Sets result to 0 if composite or 1 if probable prime
+369    */
+370   int mp_prime_miller_rabin(mp_int *a, mp_int *b, int *result);
+371   
+372   /* performs t rounds of Miller-Rabin on "a" using the first
+373    * t prime bases.  Also performs an initial sieve of trial
+374    * division.  Determines if "a" is prime with probability
+375    * of error no more than (1/4)**t.
+376    *
+377    * Sets result to 1 if probably prime, 0 otherwise
+378    */
+379   int mp_prime_is_prime(mp_int *a, int t, int *result);
+380   
+381   /* finds the next prime after the number "a" using "t" trials
+382    * of Miller-Rabin.
+383    */
+384   int mp_prime_next_prime(mp_int *a, int t);
+385   
+386   
+387   /* ---> radix conversion <--- */
+388   int mp_count_bits(mp_int *a);
+389   
+390   int mp_unsigned_bin_size(mp_int *a);
+391   int mp_read_unsigned_bin(mp_int *a, unsigned char *b, int c);
+392   int mp_to_unsigned_bin(mp_int *a, unsigned char *b);
+393   
+394   int mp_signed_bin_size(mp_int *a);
+395   int mp_read_signed_bin(mp_int *a, unsigned char *b, int c);
+396   int mp_to_signed_bin(mp_int *a, unsigned char *b);
+397   
+398   int mp_read_radix(mp_int *a, char *str, int radix);
+399   int mp_toradix(mp_int *a, char *str, int radix);
+400   int mp_radix_size(mp_int *a, int radix);
+401   
+402   int mp_fread(mp_int *a, int radix, FILE *stream);
+403   int mp_fwrite(mp_int *a, int radix, FILE *stream);
+404   
+405   #define mp_read_raw(mp, str, len) mp_read_signed_bin((mp), (str), (len))
+406   #define mp_raw_size(mp)           mp_signed_bin_size(mp)
+407   #define mp_toraw(mp, str)         mp_to_signed_bin((mp), (str))
+408   #define mp_read_mag(mp, str, len) mp_read_unsigned_bin((mp), (str), (len))
+409   #define mp_mag_size(mp)           mp_unsigned_bin_size(mp)
+410   #define mp_tomag(mp, str)         mp_to_unsigned_bin((mp), (str))
+411   
+412   #define mp_tobinary(M, S)  mp_toradix((M), (S), 2)
+413   #define mp_tooctal(M, S)   mp_toradix((M), (S), 8)
+414   #define mp_todecimal(M, S) mp_toradix((M), (S), 10)
+415   #define mp_tohex(M, S)     mp_toradix((M), (S), 16)
+416   
+417   /* lowlevel functions, do not call! */
+418   int s_mp_add(mp_int *a, mp_int *b, mp_int *c);
+419   int s_mp_sub(mp_int *a, mp_int *b, mp_int *c);
+420   #define s_mp_mul(a, b, c) s_mp_mul_digs(a, b, c, (a)->used + (b)->used + 1)
+421   int fast_s_mp_mul_digs(mp_int *a, mp_int *b, mp_int *c, int digs);
+422   int s_mp_mul_digs(mp_int *a, mp_int *b, mp_int *c, int digs);
+423   int fast_s_mp_mul_high_digs(mp_int *a, mp_int *b, mp_int *c, int digs);
+424   int s_mp_mul_high_digs(mp_int *a, mp_int *b, mp_int *c, int digs);
+425   int fast_s_mp_sqr(mp_int *a, mp_int *b);
+426   int s_mp_sqr(mp_int *a, mp_int *b);
+427   int mp_karatsuba_mul(mp_int *a, mp_int *b, mp_int *c);
+428   int mp_karatsuba_sqr(mp_int *a, mp_int *b);
+429   int fast_mp_invmod(mp_int *a, mp_int *b, mp_int *c);
+430   int fast_mp_montgomery_reduce(mp_int *a, mp_int *m, mp_digit mp);
+431   int mp_exptmod_fast(mp_int *G, mp_int *X, mp_int *P, mp_int *Y, int mode);
+432   void bn_reverse(unsigned char *s, int len);
+433   
+434   #ifdef __cplusplus
+435      \}
+436   #endif
+437   
+438   #endif
+439   
+\end{alltt}
+\end{small}
+
+\end{document}
\ No newline at end of file