diff --git a/bn.pdf b/bn.pdf
index 7993ed6..615ff4e 100644
Binary files a/bn.pdf and b/bn.pdf differ
diff --git a/bn.tex b/bn.tex
index 0174896..244bd6f 100644
--- a/bn.tex
+++ b/bn.tex
@@ -49,7 +49,7 @@
 \begin{document}
 \frontmatter
 \pagestyle{empty}
-\title{LibTomMath User Manual \\ v0.34}
+\title{LibTomMath User Manual \\ v0.35}
 \author{Tom St Denis \\ tomstdenis@iahu.ca}
 \maketitle
 This text, the library and the accompanying textbook are all hereby placed in the public domain.  This book has been 
diff --git a/bn_fast_mp_invmod.c b/bn_fast_mp_invmod.c
index ff7a6a1..acc8364 100644
--- a/bn_fast_mp_invmod.c
+++ b/bn_fast_mp_invmod.c
@@ -42,7 +42,7 @@ int fast_mp_invmod (mp_int * a, mp_int * b, mp_int * c)
   }
 
   /* we need y = |a| */
-  if ((res = mp_abs (a, &y)) != MP_OKAY) {
+  if ((res = mp_mod (a, b, &y)) != MP_OKAY) {
     goto LBL_ERR;
   }
 
diff --git a/bn_fast_s_mp_mul_digs.c b/bn_fast_s_mp_mul_digs.c
index ee83506..df3da26 100644
--- a/bn_fast_s_mp_mul_digs.c
+++ b/bn_fast_s_mp_mul_digs.c
@@ -62,7 +62,7 @@ int fast_s_mp_mul_digs (mp_int * a, mp_int * b, mp_int * c, int digs)
       tmpx = a->dp + tx;
       tmpy = b->dp + ty;
 
-      /* this is the number of times the loop will iterrate, essentially its 
+      /* this is the number of times the loop will iterrate, essentially 
          while (tx++ < a->used && ty-- >= 0) { ... }
        */
       iy = MIN(a->used-tx, ty+1);
@@ -80,16 +80,16 @@ int fast_s_mp_mul_digs (mp_int * a, mp_int * b, mp_int * c, int digs)
   }
 
   /* store final carry */
-  W[ix] = _W & MP_MASK;
+  W[ix] = (mp_digit)(_W & MP_MASK);
 
   /* setup dest */
   olduse  = c->used;
-  c->used = digs;
+  c->used = pa;
 
   {
     register mp_digit *tmpc;
     tmpc = c->dp;
-    for (ix = 0; ix < digs; ix++) {
+    for (ix = 0; ix < pa+1; ix++) {
       /* now extract the previous digit [below the carry] */
       *tmpc++ = W[ix];
     }
diff --git a/bn_fast_s_mp_mul_high_digs.c b/bn_fast_s_mp_mul_high_digs.c
index ed45de3..ee657f9 100644
--- a/bn_fast_s_mp_mul_high_digs.c
+++ b/bn_fast_s_mp_mul_high_digs.c
@@ -71,7 +71,7 @@ int fast_s_mp_mul_high_digs (mp_int * a, mp_int * b, mp_int * c, int digs)
   }
   
   /* store final carry */
-  W[ix] = _W & MP_MASK;
+  W[ix] = (mp_digit)(_W & MP_MASK);
 
   /* setup dest */
   olduse  = c->used;
diff --git a/bn_fast_s_mp_sqr.c b/bn_fast_s_mp_sqr.c
index 04788cc..66a2942 100644
--- a/bn_fast_s_mp_sqr.c
+++ b/bn_fast_s_mp_sqr.c
@@ -15,33 +15,14 @@
  * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
  */
 
-/* fast squaring
- *
- * This is the comba method where the columns of the product
- * are computed first then the carries are computed.  This
- * has the effect of making a very simple inner loop that
- * is executed the most
- *
- * W2 represents the outer products and W the inner.
- *
- * A further optimizations is made because the inner
- * products are of the form "A * B * 2".  The *2 part does
- * not need to be computed until the end which is good
- * because 64-bit shifts are slow!
- *
- * Based on Algorithm 14.16 on pp.597 of HAC.
- *
- */
 /* the jist of squaring...
-
-you do like mult except the offset of the tmpx [one that starts closer to zero]
-can't equal the offset of tmpy.  So basically you set up iy like before then you min it with
-(ty-tx) so that it never happens.  You double all those you add in the inner loop
+ * you do like mult except the offset of the tmpx [one that 
+ * starts closer to zero] can't equal the offset of tmpy.  
+ * So basically you set up iy like before then you min it with
+ * (ty-tx) so that it never happens.  You double all those 
+ * you add in the inner loop
 
 After that loop you do the squares and add them in.
-
-Remove W2 and don't memset W
-
 */
 
 int fast_s_mp_sqr (mp_int * a, mp_int * b)
@@ -76,7 +57,7 @@ int fast_s_mp_sqr (mp_int * a, mp_int * b)
       tmpx = a->dp + tx;
       tmpy = a->dp + ty;
 
-      /* this is the number of times the loop will iterrate, essentially its 
+      /* this is the number of times the loop will iterrate, essentially
          while (tx++ < a->used && ty-- >= 0) { ... }
        */
       iy = MIN(a->used-tx, ty+1);
@@ -101,7 +82,7 @@ int fast_s_mp_sqr (mp_int * a, mp_int * b)
       }
 
       /* store it */
-      W[ix] = _W & MP_MASK;
+      W[ix] = (mp_digit)(_W & MP_MASK);
 
       /* make next carry */
       W1 = _W >> ((mp_word)DIGIT_BIT);
diff --git a/bn_mp_exteuclid.c b/bn_mp_exteuclid.c
index 545450b..c4ebab4 100644
--- a/bn_mp_exteuclid.c
+++ b/bn_mp_exteuclid.c
@@ -59,6 +59,13 @@ int mp_exteuclid(mp_int *a, mp_int *b, mp_int *U1, mp_int *U2, mp_int *U3)
        if ((err = mp_copy(&t3, &v3)) != MP_OKAY)                                  { goto _ERR; }
    }
 
+   /* make sure U3 >= 0 */
+   if (u3.sign == MP_NEG) {
+      mp_neg(&u1, &u1);
+      mp_neg(&u2, &u2);
+      mp_neg(&u3, &u3);
+   }
+
    /* copy result out */
    if (U1 != NULL) { mp_exch(U1, &u1); }
    if (U2 != NULL) { mp_exch(U2, &u2); }
diff --git a/bn_mp_invmod_slow.c b/bn_mp_invmod_slow.c
index c1884c0..c048655 100644
--- a/bn_mp_invmod_slow.c
+++ b/bn_mp_invmod_slow.c
@@ -33,8 +33,8 @@ int mp_invmod_slow (mp_int * a, mp_int * b, mp_int * c)
   }
 
   /* x = a, y = b */
-  if ((res = mp_copy (a, &x)) != MP_OKAY) {
-    goto LBL_ERR;
+  if ((res = mp_mod(a, b, &x)) != MP_OKAY) {
+      goto LBL_ERR;
   }
   if ((res = mp_copy (b, &y)) != MP_OKAY) {
     goto LBL_ERR;
diff --git a/bn_mp_montgomery_calc_normalization.c b/bn_mp_montgomery_calc_normalization.c
index 0a760cf..e2efc34 100644
--- a/bn_mp_montgomery_calc_normalization.c
+++ b/bn_mp_montgomery_calc_normalization.c
@@ -28,7 +28,6 @@ int mp_montgomery_calc_normalization (mp_int * a, mp_int * b)
   /* how many bits of last digit does b use */
   bits = mp_count_bits (b) % DIGIT_BIT;
 
-
   if (b->used > 1) {
      if ((res = mp_2expt (a, (b->used - 1) * DIGIT_BIT + bits - 1)) != MP_OKAY) {
         return res;
diff --git a/bn_mp_neg.c b/bn_mp_neg.c
index 3a991db..159cd74 100644
--- a/bn_mp_neg.c
+++ b/bn_mp_neg.c
@@ -19,12 +19,18 @@
 int mp_neg (mp_int * a, mp_int * b)
 {
   int     res;
-  if ((res = mp_copy (a, b)) != MP_OKAY) {
-    return res;
+  if (a != b) {
+     if ((res = mp_copy (a, b)) != MP_OKAY) {
+        return res;
+     }
   }
+
   if (mp_iszero(b) != MP_YES) {
      b->sign = (a->sign == MP_ZPOS) ? MP_NEG : MP_ZPOS;
+  } else {
+     b->sign = MP_ZPOS;
   }
+
   return MP_OKAY;
 }
 #endif
diff --git a/bn_mp_radix_size.c b/bn_mp_radix_size.c
index 30b78d9..3d423ba 100644
--- a/bn_mp_radix_size.c
+++ b/bn_mp_radix_size.c
@@ -35,22 +35,29 @@ int mp_radix_size (mp_int * a, int radix, int *size)
     return MP_VAL;
   }
 
-  /* init a copy of the input */
-  if ((res = mp_init_copy (&t, a)) != MP_OKAY) {
-    return res;
+  if (mp_iszero(a) == MP_YES) {
+     *size = 2;
+    return MP_OKAY;
   }
 
   /* digs is the digit count */
   digs = 0;
 
   /* if it's negative add one for the sign */
-  if (t.sign == MP_NEG) {
+  if (a->sign == MP_NEG) {
     ++digs;
-    t.sign = MP_ZPOS;
   }
 
+  /* init a copy of the input */
+  if ((res = mp_init_copy (&t, a)) != MP_OKAY) {
+    return res;
+  }
+
+  /* force temp to positive */
+  t.sign = MP_ZPOS; 
+
   /* fetch out all of the digits */
-  while (mp_iszero (&t) == 0) {
+  while (mp_iszero (&t) == MP_NO) {
     if ((res = mp_div_d (&t, (mp_digit) radix, &t, &d)) != MP_OKAY) {
       mp_clear (&t);
       return res;
diff --git a/bn_mp_rand.c b/bn_mp_rand.c
index 1cc47f1..0dc7019 100644
--- a/bn_mp_rand.c
+++ b/bn_mp_rand.c
@@ -29,14 +29,14 @@ mp_rand (mp_int * a, int digits)
 
   /* first place a random non-zero digit */
   do {
-    d = ((mp_digit) abs (rand ()));
+    d = ((mp_digit) abs (rand ())) & MP_MASK;
   } while (d == 0);
 
   if ((res = mp_add_d (a, d, a)) != MP_OKAY) {
     return res;
   }
 
-  while (digits-- > 0) {
+  while (--digits > 0) {
     if ((res = mp_lshd (a, 1)) != MP_OKAY) {
       return res;
     }
diff --git a/bn_mp_reduce.c b/bn_mp_reduce.c
index e84f81d..d746445 100644
--- a/bn_mp_reduce.c
+++ b/bn_mp_reduce.c
@@ -39,11 +39,11 @@ int mp_reduce (mp_int * x, mp_int * m, mp_int * mu)
     }
   } else {
 #ifdef BN_S_MP_MUL_HIGH_DIGS_C
-    if ((res = s_mp_mul_high_digs (&q, mu, &q, um - 1)) != MP_OKAY) {
+    if ((res = s_mp_mul_high_digs (&q, mu, &q, um)) != MP_OKAY) {
       goto CLEANUP;
     }
 #elif defined(BN_FAST_S_MP_MUL_HIGH_DIGS_C)
-    if ((res = fast_s_mp_mul_high_digs (&q, mu, &q, um - 1)) != MP_OKAY) {
+    if ((res = fast_s_mp_mul_high_digs (&q, mu, &q, um)) != MP_OKAY) {
       goto CLEANUP;
     }
 #else 
diff --git a/bn_mp_toom_mul.c b/bn_mp_toom_mul.c
index 2d66779..125331b 100644
--- a/bn_mp_toom_mul.c
+++ b/bn_mp_toom_mul.c
@@ -17,9 +17,10 @@
 
 /* multiplication using the Toom-Cook 3-way algorithm 
  *
- * Much more complicated than Karatsuba but has a lower asymptotic running time of 
- * O(N**1.464).  This algorithm is only particularly useful on VERY large
- * inputs (we're talking 1000s of digits here...).
+ * Much more complicated than Karatsuba but has a lower 
+ * asymptotic running time of O(N**1.464).  This algorithm is 
+ * only particularly useful on VERY large inputs 
+ * (we're talking 1000s of digits here...).
 */
 int mp_toom_mul(mp_int *a, mp_int *b, mp_int *c)
 {
diff --git a/bn_mp_xor.c b/bn_mp_xor.c
index 192aacc..de7e62c 100644
--- a/bn_mp_xor.c
+++ b/bn_mp_xor.c
@@ -37,7 +37,7 @@ mp_xor (mp_int * a, mp_int * b, mp_int * c)
   }
 
   for (ix = 0; ix < px; ix++) {
-
+     t.dp[ix] ^= x->dp[ix];
   }
   mp_clamp (&t);
   mp_exch (c, &t);
diff --git a/bn_mp_zero.c b/bn_mp_zero.c
index 0097598..c8d8907 100644
--- a/bn_mp_zero.c
+++ b/bn_mp_zero.c
@@ -16,11 +16,17 @@
  */
 
 /* set to zero */
-void
-mp_zero (mp_int * a)
+void mp_zero (mp_int * a)
 {
+  int       n;
+  mp_digit *tmp;
+
   a->sign = MP_ZPOS;
   a->used = 0;
-  memset (a->dp, 0, sizeof (mp_digit) * a->alloc);
+
+  tmp = a->dp;
+  for (n = 0; n < a->alloc; n++) {
+     *tmp++ = 0;
+  }
 }
 #endif
diff --git a/bn_s_mp_mul_digs.c b/bn_s_mp_mul_digs.c
index d9f0a56..b40ae2e 100644
--- a/bn_s_mp_mul_digs.c
+++ b/bn_s_mp_mul_digs.c
@@ -19,8 +19,7 @@
  * HAC pp. 595, Algorithm 14.12  Modified so you can control how 
  * many digits of output are created.
  */
-int
-s_mp_mul_digs (mp_int * a, mp_int * b, mp_int * c, int digs)
+int s_mp_mul_digs (mp_int * a, mp_int * b, mp_int * c, int digs)
 {
   mp_int  t;
   int     res, pa, pb, ix, iy;
diff --git a/bn_s_mp_sqr.c b/bn_s_mp_sqr.c
index 4d12804..9cdb563 100644
--- a/bn_s_mp_sqr.c
+++ b/bn_s_mp_sqr.c
@@ -16,8 +16,7 @@
  */
 
 /* low level squaring, b = a*a, HAC pp.596-597, Algorithm 14.16 */
-int
-s_mp_sqr (mp_int * a, mp_int * b)
+int s_mp_sqr (mp_int * a, mp_int * b)
 {
   mp_int  t;
   int     res, ix, iy, pa;
diff --git a/callgraph.txt b/callgraph.txt
index 031d168..2efcf24 100644
--- a/callgraph.txt
+++ b/callgraph.txt
@@ -907,7 +907,64 @@ BN_MP_EXPTMOD_C
 |   |   |   +--->BN_MP_CLEAR_C
 |   |   +--->BN_MP_COPY_C
 |   |   |   +--->BN_MP_GROW_C
-|   |   +--->BN_MP_ABS_C
+|   |   +--->BN_MP_MOD_C
+|   |   |   +--->BN_MP_DIV_C
+|   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   +--->BN_MP_SET_C
+|   |   |   |   +--->BN_MP_COUNT_BITS_C
+|   |   |   |   +--->BN_MP_ABS_C
+|   |   |   |   +--->BN_MP_MUL_2D_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_CMP_C
+|   |   |   |   +--->BN_MP_SUB_C
+|   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_ADD_C
+|   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_DIV_2D_C
+|   |   |   |   |   +--->BN_MP_MOD_2D_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_CLEAR_C
+|   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   |   +--->BN_MP_CLEAR_MULTI_C
+|   |   |   |   |   +--->BN_MP_CLEAR_C
+|   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   +--->BN_MP_INIT_COPY_C
+|   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   +--->BN_MP_MUL_D_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_CLEAR_C
+|   |   |   +--->BN_MP_CLEAR_C
+|   |   |   +--->BN_MP_ADD_C
+|   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_EXCH_C
 |   |   +--->BN_MP_SET_C
 |   |   |   +--->BN_MP_ZERO_C
 |   |   +--->BN_MP_DIV_2_C
@@ -938,6 +995,66 @@ BN_MP_EXPTMOD_C
 |   +--->BN_MP_INVMOD_SLOW_C
 |   |   +--->BN_MP_INIT_MULTI_C
 |   |   |   +--->BN_MP_CLEAR_C
+|   |   +--->BN_MP_MOD_C
+|   |   |   +--->BN_MP_DIV_C
+|   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   +--->BN_MP_COPY_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   +--->BN_MP_SET_C
+|   |   |   |   +--->BN_MP_COUNT_BITS_C
+|   |   |   |   +--->BN_MP_ABS_C
+|   |   |   |   +--->BN_MP_MUL_2D_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_CMP_C
+|   |   |   |   +--->BN_MP_SUB_C
+|   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_ADD_C
+|   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_DIV_2D_C
+|   |   |   |   |   +--->BN_MP_MOD_2D_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_CLEAR_C
+|   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   |   +--->BN_MP_CLEAR_MULTI_C
+|   |   |   |   |   +--->BN_MP_CLEAR_C
+|   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   +--->BN_MP_INIT_COPY_C
+|   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   +--->BN_MP_MUL_D_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_CLEAR_C
+|   |   |   +--->BN_MP_CLEAR_C
+|   |   |   +--->BN_MP_ADD_C
+|   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_EXCH_C
 |   |   +--->BN_MP_COPY_C
 |   |   |   +--->BN_MP_GROW_C
 |   |   +--->BN_MP_SET_C
@@ -1874,7 +1991,64 @@ BN_MP_PRIME_FERMAT_C
 |   |   |   |   +--->BN_MP_CLEAR_C
 |   |   |   +--->BN_MP_COPY_C
 |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   +--->BN_MP_ABS_C
+|   |   |   +--->BN_MP_MOD_C
+|   |   |   |   +--->BN_MP_DIV_C
+|   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   |   +--->BN_MP_SET_C
+|   |   |   |   |   +--->BN_MP_COUNT_BITS_C
+|   |   |   |   |   +--->BN_MP_ABS_C
+|   |   |   |   |   +--->BN_MP_MUL_2D_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_CMP_C
+|   |   |   |   |   +--->BN_MP_SUB_C
+|   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_ADD_C
+|   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_DIV_2D_C
+|   |   |   |   |   |   +--->BN_MP_MOD_2D_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_CLEAR_C
+|   |   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   |   |   +--->BN_MP_CLEAR_MULTI_C
+|   |   |   |   |   |   +--->BN_MP_CLEAR_C
+|   |   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   |   +--->BN_MP_INIT_COPY_C
+|   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   +--->BN_MP_MUL_D_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_CLEAR_C
+|   |   |   |   +--->BN_MP_CLEAR_C
+|   |   |   |   +--->BN_MP_ADD_C
+|   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_EXCH_C
 |   |   |   +--->BN_MP_SET_C
 |   |   |   |   +--->BN_MP_ZERO_C
 |   |   |   +--->BN_MP_DIV_2_C
@@ -1904,6 +2078,66 @@ BN_MP_PRIME_FERMAT_C
 |   |   +--->BN_MP_INVMOD_SLOW_C
 |   |   |   +--->BN_MP_INIT_MULTI_C
 |   |   |   |   +--->BN_MP_CLEAR_C
+|   |   |   +--->BN_MP_MOD_C
+|   |   |   |   +--->BN_MP_DIV_C
+|   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   +--->BN_MP_COPY_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   |   +--->BN_MP_SET_C
+|   |   |   |   |   +--->BN_MP_COUNT_BITS_C
+|   |   |   |   |   +--->BN_MP_ABS_C
+|   |   |   |   |   +--->BN_MP_MUL_2D_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_CMP_C
+|   |   |   |   |   +--->BN_MP_SUB_C
+|   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_ADD_C
+|   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_DIV_2D_C
+|   |   |   |   |   |   +--->BN_MP_MOD_2D_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_CLEAR_C
+|   |   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   |   |   +--->BN_MP_CLEAR_MULTI_C
+|   |   |   |   |   |   +--->BN_MP_CLEAR_C
+|   |   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   |   +--->BN_MP_INIT_COPY_C
+|   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   +--->BN_MP_MUL_D_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_CLEAR_C
+|   |   |   |   +--->BN_MP_CLEAR_C
+|   |   |   |   +--->BN_MP_ADD_C
+|   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_EXCH_C
 |   |   |   +--->BN_MP_COPY_C
 |   |   |   |   +--->BN_MP_GROW_C
 |   |   |   +--->BN_MP_SET_C
@@ -3111,7 +3345,65 @@ BN_MP_INVMOD_C
 |   |   +--->BN_MP_CLEAR_C
 |   +--->BN_MP_COPY_C
 |   |   +--->BN_MP_GROW_C
-|   +--->BN_MP_ABS_C
+|   +--->BN_MP_MOD_C
+|   |   +--->BN_MP_INIT_C
+|   |   +--->BN_MP_DIV_C
+|   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   +--->BN_MP_ZERO_C
+|   |   |   +--->BN_MP_SET_C
+|   |   |   +--->BN_MP_COUNT_BITS_C
+|   |   |   +--->BN_MP_ABS_C
+|   |   |   +--->BN_MP_MUL_2D_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_CMP_C
+|   |   |   +--->BN_MP_SUB_C
+|   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_ADD_C
+|   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_DIV_2D_C
+|   |   |   |   +--->BN_MP_MOD_2D_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_CLEAR_C
+|   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   +--->BN_MP_EXCH_C
+|   |   |   +--->BN_MP_CLEAR_MULTI_C
+|   |   |   |   +--->BN_MP_CLEAR_C
+|   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   +--->BN_MP_INIT_COPY_C
+|   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   +--->BN_MP_RSHD_C
+|   |   |   +--->BN_MP_MUL_D_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_CLEAR_C
+|   |   +--->BN_MP_CLEAR_C
+|   |   +--->BN_MP_ADD_C
+|   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_EXCH_C
 |   +--->BN_MP_SET_C
 |   |   +--->BN_MP_ZERO_C
 |   +--->BN_MP_DIV_2_C
@@ -3143,6 +3435,67 @@ BN_MP_INVMOD_C
 |   +--->BN_MP_INIT_MULTI_C
 |   |   +--->BN_MP_INIT_C
 |   |   +--->BN_MP_CLEAR_C
+|   +--->BN_MP_MOD_C
+|   |   +--->BN_MP_INIT_C
+|   |   +--->BN_MP_DIV_C
+|   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   +--->BN_MP_COPY_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_ZERO_C
+|   |   |   +--->BN_MP_SET_C
+|   |   |   +--->BN_MP_COUNT_BITS_C
+|   |   |   +--->BN_MP_ABS_C
+|   |   |   +--->BN_MP_MUL_2D_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_CMP_C
+|   |   |   +--->BN_MP_SUB_C
+|   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_ADD_C
+|   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_DIV_2D_C
+|   |   |   |   +--->BN_MP_MOD_2D_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_CLEAR_C
+|   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   +--->BN_MP_EXCH_C
+|   |   |   +--->BN_MP_CLEAR_MULTI_C
+|   |   |   |   +--->BN_MP_CLEAR_C
+|   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   +--->BN_MP_INIT_COPY_C
+|   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   +--->BN_MP_RSHD_C
+|   |   |   +--->BN_MP_MUL_D_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_CLEAR_C
+|   |   +--->BN_MP_CLEAR_C
+|   |   +--->BN_MP_ADD_C
+|   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_EXCH_C
 |   +--->BN_MP_COPY_C
 |   |   +--->BN_MP_GROW_C
 |   +--->BN_MP_SET_C
@@ -3195,7 +3548,65 @@ BN_FAST_MP_INVMOD_C
 |   +--->BN_MP_CLEAR_C
 +--->BN_MP_COPY_C
 |   +--->BN_MP_GROW_C
-+--->BN_MP_ABS_C
++--->BN_MP_MOD_C
+|   +--->BN_MP_INIT_C
+|   +--->BN_MP_DIV_C
+|   |   +--->BN_MP_CMP_MAG_C
+|   |   +--->BN_MP_ZERO_C
+|   |   +--->BN_MP_SET_C
+|   |   +--->BN_MP_COUNT_BITS_C
+|   |   +--->BN_MP_ABS_C
+|   |   +--->BN_MP_MUL_2D_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_CMP_C
+|   |   +--->BN_MP_SUB_C
+|   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_ADD_C
+|   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_DIV_2D_C
+|   |   |   +--->BN_MP_MOD_2D_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_CLEAR_C
+|   |   |   +--->BN_MP_RSHD_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_EXCH_C
+|   |   +--->BN_MP_EXCH_C
+|   |   +--->BN_MP_CLEAR_MULTI_C
+|   |   |   +--->BN_MP_CLEAR_C
+|   |   +--->BN_MP_INIT_SIZE_C
+|   |   +--->BN_MP_INIT_COPY_C
+|   |   +--->BN_MP_LSHD_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_RSHD_C
+|   |   +--->BN_MP_RSHD_C
+|   |   +--->BN_MP_MUL_D_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_CLEAR_C
+|   +--->BN_MP_CLEAR_C
+|   +--->BN_MP_ADD_C
+|   |   +--->BN_S_MP_ADD_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_CMP_MAG_C
+|   |   +--->BN_S_MP_SUB_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   +--->BN_MP_EXCH_C
 +--->BN_MP_SET_C
 |   +--->BN_MP_ZERO_C
 +--->BN_MP_DIV_2_C
@@ -3683,7 +4094,55 @@ BN_MP_PRIME_RANDOM_EX_C
 |   |   |   |   |   |   +--->BN_MP_CLEAR_C
 |   |   |   |   |   +--->BN_MP_COPY_C
 |   |   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   |   +--->BN_MP_ABS_C
+|   |   |   |   |   +--->BN_MP_MOD_C
+|   |   |   |   |   |   +--->BN_MP_DIV_C
+|   |   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   |   |   |   +--->BN_MP_COUNT_BITS_C
+|   |   |   |   |   |   |   +--->BN_MP_ABS_C
+|   |   |   |   |   |   |   +--->BN_MP_MUL_2D_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   |   +--->BN_MP_CMP_C
+|   |   |   |   |   |   |   +--->BN_MP_SUB_C
+|   |   |   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   |   +--->BN_MP_ADD_C
+|   |   |   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   |   |   |   |   +--->BN_MP_CLEAR_MULTI_C
+|   |   |   |   |   |   |   |   +--->BN_MP_CLEAR_C
+|   |   |   |   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   |   |   +--->BN_MP_MUL_D_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   |   +--->BN_MP_CLEAR_C
+|   |   |   |   |   |   +--->BN_MP_CLEAR_C
+|   |   |   |   |   |   +--->BN_MP_ADD_C
+|   |   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_EXCH_C
 |   |   |   |   |   +--->BN_MP_DIV_2_C
 |   |   |   |   |   |   +--->BN_MP_GROW_C
 |   |   |   |   |   |   +--->BN_MP_CLAMP_C
@@ -3711,6 +4170,57 @@ BN_MP_PRIME_RANDOM_EX_C
 |   |   |   |   +--->BN_MP_INVMOD_SLOW_C
 |   |   |   |   |   +--->BN_MP_INIT_MULTI_C
 |   |   |   |   |   |   +--->BN_MP_CLEAR_C
+|   |   |   |   |   +--->BN_MP_MOD_C
+|   |   |   |   |   |   +--->BN_MP_DIV_C
+|   |   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   |   |   +--->BN_MP_COPY_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   |   |   |   +--->BN_MP_COUNT_BITS_C
+|   |   |   |   |   |   |   +--->BN_MP_ABS_C
+|   |   |   |   |   |   |   +--->BN_MP_MUL_2D_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   |   +--->BN_MP_CMP_C
+|   |   |   |   |   |   |   +--->BN_MP_SUB_C
+|   |   |   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   |   +--->BN_MP_ADD_C
+|   |   |   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   |   |   |   |   +--->BN_MP_CLEAR_MULTI_C
+|   |   |   |   |   |   |   |   +--->BN_MP_CLEAR_C
+|   |   |   |   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   |   |   +--->BN_MP_MUL_D_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   |   +--->BN_MP_CLEAR_C
+|   |   |   |   |   |   +--->BN_MP_CLEAR_C
+|   |   |   |   |   |   +--->BN_MP_ADD_C
+|   |   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_EXCH_C
 |   |   |   |   |   +--->BN_MP_COPY_C
 |   |   |   |   |   |   +--->BN_MP_GROW_C
 |   |   |   |   |   +--->BN_MP_DIV_2_C
@@ -5057,7 +5567,55 @@ BN_MP_PRIME_IS_PRIME_C
 |   |   |   |   |   +--->BN_MP_CLEAR_C
 |   |   |   |   +--->BN_MP_COPY_C
 |   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   +--->BN_MP_ABS_C
+|   |   |   |   +--->BN_MP_MOD_C
+|   |   |   |   |   +--->BN_MP_DIV_C
+|   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   |   |   +--->BN_MP_COUNT_BITS_C
+|   |   |   |   |   |   +--->BN_MP_ABS_C
+|   |   |   |   |   |   +--->BN_MP_MUL_2D_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_CMP_C
+|   |   |   |   |   |   +--->BN_MP_SUB_C
+|   |   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_ADD_C
+|   |   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   |   |   |   +--->BN_MP_CLEAR_MULTI_C
+|   |   |   |   |   |   |   +--->BN_MP_CLEAR_C
+|   |   |   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   |   +--->BN_MP_MUL_D_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_CLEAR_C
+|   |   |   |   |   +--->BN_MP_CLEAR_C
+|   |   |   |   |   +--->BN_MP_ADD_C
+|   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_EXCH_C
 |   |   |   |   +--->BN_MP_DIV_2_C
 |   |   |   |   |   +--->BN_MP_GROW_C
 |   |   |   |   |   +--->BN_MP_CLAMP_C
@@ -5085,6 +5643,57 @@ BN_MP_PRIME_IS_PRIME_C
 |   |   |   +--->BN_MP_INVMOD_SLOW_C
 |   |   |   |   +--->BN_MP_INIT_MULTI_C
 |   |   |   |   |   +--->BN_MP_CLEAR_C
+|   |   |   |   +--->BN_MP_MOD_C
+|   |   |   |   |   +--->BN_MP_DIV_C
+|   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   |   +--->BN_MP_COPY_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   |   |   +--->BN_MP_COUNT_BITS_C
+|   |   |   |   |   |   +--->BN_MP_ABS_C
+|   |   |   |   |   |   +--->BN_MP_MUL_2D_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_CMP_C
+|   |   |   |   |   |   +--->BN_MP_SUB_C
+|   |   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_ADD_C
+|   |   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   |   |   |   +--->BN_MP_CLEAR_MULTI_C
+|   |   |   |   |   |   |   +--->BN_MP_CLEAR_C
+|   |   |   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   |   +--->BN_MP_MUL_D_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_CLEAR_C
+|   |   |   |   |   +--->BN_MP_CLEAR_C
+|   |   |   |   |   +--->BN_MP_ADD_C
+|   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_EXCH_C
 |   |   |   |   +--->BN_MP_COPY_C
 |   |   |   |   |   +--->BN_MP_GROW_C
 |   |   |   |   +--->BN_MP_DIV_2_C
@@ -6894,7 +7503,55 @@ BN_MP_PRIME_NEXT_PRIME_C
 |   |   |   |   |   +--->BN_MP_CLEAR_C
 |   |   |   |   +--->BN_MP_COPY_C
 |   |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   |   +--->BN_MP_ABS_C
+|   |   |   |   +--->BN_MP_MOD_C
+|   |   |   |   |   +--->BN_MP_DIV_C
+|   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   |   |   +--->BN_MP_COUNT_BITS_C
+|   |   |   |   |   |   +--->BN_MP_ABS_C
+|   |   |   |   |   |   +--->BN_MP_MUL_2D_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_CMP_C
+|   |   |   |   |   |   +--->BN_MP_SUB_C
+|   |   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_ADD_C
+|   |   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   |   |   |   +--->BN_MP_CLEAR_MULTI_C
+|   |   |   |   |   |   |   +--->BN_MP_CLEAR_C
+|   |   |   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   |   +--->BN_MP_MUL_D_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_CLEAR_C
+|   |   |   |   |   +--->BN_MP_CLEAR_C
+|   |   |   |   |   +--->BN_MP_ADD_C
+|   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_EXCH_C
 |   |   |   |   +--->BN_MP_DIV_2_C
 |   |   |   |   |   +--->BN_MP_GROW_C
 |   |   |   |   |   +--->BN_MP_CLAMP_C
@@ -6922,6 +7579,57 @@ BN_MP_PRIME_NEXT_PRIME_C
 |   |   |   +--->BN_MP_INVMOD_SLOW_C
 |   |   |   |   +--->BN_MP_INIT_MULTI_C
 |   |   |   |   |   +--->BN_MP_CLEAR_C
+|   |   |   |   +--->BN_MP_MOD_C
+|   |   |   |   |   +--->BN_MP_DIV_C
+|   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   |   +--->BN_MP_COPY_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   |   |   +--->BN_MP_COUNT_BITS_C
+|   |   |   |   |   |   +--->BN_MP_ABS_C
+|   |   |   |   |   |   +--->BN_MP_MUL_2D_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_CMP_C
+|   |   |   |   |   |   +--->BN_MP_SUB_C
+|   |   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_ADD_C
+|   |   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   |   |   |   +--->BN_MP_CLEAR_MULTI_C
+|   |   |   |   |   |   |   +--->BN_MP_CLEAR_C
+|   |   |   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   |   +--->BN_MP_MUL_D_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_CLEAR_C
+|   |   |   |   |   +--->BN_MP_CLEAR_C
+|   |   |   |   |   +--->BN_MP_ADD_C
+|   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_EXCH_C
 |   |   |   |   +--->BN_MP_COPY_C
 |   |   |   |   |   +--->BN_MP_GROW_C
 |   |   |   |   +--->BN_MP_DIV_2_C
@@ -7898,6 +8606,67 @@ BN_MP_INVMOD_SLOW_C
 +--->BN_MP_INIT_MULTI_C
 |   +--->BN_MP_INIT_C
 |   +--->BN_MP_CLEAR_C
++--->BN_MP_MOD_C
+|   +--->BN_MP_INIT_C
+|   +--->BN_MP_DIV_C
+|   |   +--->BN_MP_CMP_MAG_C
+|   |   +--->BN_MP_COPY_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   +--->BN_MP_ZERO_C
+|   |   +--->BN_MP_SET_C
+|   |   +--->BN_MP_COUNT_BITS_C
+|   |   +--->BN_MP_ABS_C
+|   |   +--->BN_MP_MUL_2D_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_CMP_C
+|   |   +--->BN_MP_SUB_C
+|   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_ADD_C
+|   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_DIV_2D_C
+|   |   |   +--->BN_MP_MOD_2D_C
+|   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_CLEAR_C
+|   |   |   +--->BN_MP_RSHD_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   |   +--->BN_MP_EXCH_C
+|   |   +--->BN_MP_EXCH_C
+|   |   +--->BN_MP_CLEAR_MULTI_C
+|   |   |   +--->BN_MP_CLEAR_C
+|   |   +--->BN_MP_INIT_SIZE_C
+|   |   +--->BN_MP_INIT_COPY_C
+|   |   +--->BN_MP_LSHD_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_RSHD_C
+|   |   +--->BN_MP_RSHD_C
+|   |   +--->BN_MP_MUL_D_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_CLEAR_C
+|   +--->BN_MP_CLEAR_C
+|   +--->BN_MP_ADD_C
+|   |   +--->BN_S_MP_ADD_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   |   +--->BN_MP_CMP_MAG_C
+|   |   +--->BN_S_MP_SUB_C
+|   |   |   +--->BN_MP_GROW_C
+|   |   |   +--->BN_MP_CLAMP_C
+|   +--->BN_MP_EXCH_C
 +--->BN_MP_COPY_C
 |   +--->BN_MP_GROW_C
 +--->BN_MP_SET_C
@@ -9817,6 +10586,7 @@ BN_MP_EXTEUCLID_C
 |   +--->BN_S_MP_SUB_C
 |   |   +--->BN_MP_GROW_C
 |   |   +--->BN_MP_CLAMP_C
++--->BN_MP_NEG_C
 +--->BN_MP_EXCH_C
 +--->BN_MP_CLEAR_MULTI_C
 |   +--->BN_MP_CLEAR_C
@@ -10024,7 +10794,56 @@ BN_MP_PRIME_MILLER_RABIN_C
 |   |   |   |   +--->BN_MP_CLEAR_C
 |   |   |   +--->BN_MP_COPY_C
 |   |   |   |   +--->BN_MP_GROW_C
-|   |   |   +--->BN_MP_ABS_C
+|   |   |   +--->BN_MP_MOD_C
+|   |   |   |   +--->BN_MP_DIV_C
+|   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   |   +--->BN_MP_SET_C
+|   |   |   |   |   +--->BN_MP_COUNT_BITS_C
+|   |   |   |   |   +--->BN_MP_ABS_C
+|   |   |   |   |   +--->BN_MP_MUL_2D_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_CMP_C
+|   |   |   |   |   +--->BN_MP_SUB_C
+|   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_ADD_C
+|   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   |   |   +--->BN_MP_CLEAR_MULTI_C
+|   |   |   |   |   |   +--->BN_MP_CLEAR_C
+|   |   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   +--->BN_MP_MUL_D_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_CLEAR_C
+|   |   |   |   +--->BN_MP_CLEAR_C
+|   |   |   |   +--->BN_MP_ADD_C
+|   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_EXCH_C
 |   |   |   +--->BN_MP_SET_C
 |   |   |   |   +--->BN_MP_ZERO_C
 |   |   |   +--->BN_MP_DIV_2_C
@@ -10054,6 +10873,58 @@ BN_MP_PRIME_MILLER_RABIN_C
 |   |   +--->BN_MP_INVMOD_SLOW_C
 |   |   |   +--->BN_MP_INIT_MULTI_C
 |   |   |   |   +--->BN_MP_CLEAR_C
+|   |   |   +--->BN_MP_MOD_C
+|   |   |   |   +--->BN_MP_DIV_C
+|   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   +--->BN_MP_COPY_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   +--->BN_MP_ZERO_C
+|   |   |   |   |   +--->BN_MP_SET_C
+|   |   |   |   |   +--->BN_MP_COUNT_BITS_C
+|   |   |   |   |   +--->BN_MP_ABS_C
+|   |   |   |   |   +--->BN_MP_MUL_2D_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_CMP_C
+|   |   |   |   |   +--->BN_MP_SUB_C
+|   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_ADD_C
+|   |   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_EXCH_C
+|   |   |   |   |   +--->BN_MP_CLEAR_MULTI_C
+|   |   |   |   |   |   +--->BN_MP_CLEAR_C
+|   |   |   |   |   +--->BN_MP_INIT_SIZE_C
+|   |   |   |   |   +--->BN_MP_LSHD_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   +--->BN_MP_RSHD_C
+|   |   |   |   |   +--->BN_MP_MUL_D_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_CLEAR_C
+|   |   |   |   +--->BN_MP_CLEAR_C
+|   |   |   |   +--->BN_MP_ADD_C
+|   |   |   |   |   +--->BN_S_MP_ADD_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   |   +--->BN_MP_CMP_MAG_C
+|   |   |   |   |   +--->BN_S_MP_SUB_C
+|   |   |   |   |   |   +--->BN_MP_GROW_C
+|   |   |   |   |   |   +--->BN_MP_CLAMP_C
+|   |   |   |   +--->BN_MP_EXCH_C
 |   |   |   +--->BN_MP_COPY_C
 |   |   |   |   +--->BN_MP_GROW_C
 |   |   |   +--->BN_MP_SET_C
diff --git a/changes.txt b/changes.txt
index eb96813..99e40c1 100644
--- a/changes.txt
+++ b/changes.txt
@@ -1,3 +1,14 @@
+March 12th, 2005
+v0.35  -- Stupid XOR function missing line again... oops.
+       -- Fixed bug in invmod not handling negative inputs correctly [Wolfgang Ehrhardt]
+       -- Made exteuclid always give positive u3 output...[ Wolfgang Ehrhardt ]
+       -- [Wolfgang Ehrhardt] Suggested a fix for mp_reduce() which avoided underruns.  ;-)
+       -- mp_rand() would emit one too many digits and it was possible to get a 0 out of it ... oops
+       -- Added montgomery to the testing to make sure it handles 1..10 digit moduli correctly
+       -- Fixed bug in comba that would lead to possible erroneous outputs when "pa < digs" 
+       -- Fixed bug in mp_toradix_size for "0" [Kevin Kenny]
+       -- Updated chapters 1-5 of the textbook ;-) It now talks about the new comba code!
+
 February 12th, 2005
 v0.34  -- Fixed two more small errors in mp_prime_random_ex()
        -- Fixed overflow in mp_mul_d() [Kevin Kenny]
diff --git a/demo/demo.c b/demo/demo.c
index 57604f9..0a6115a 100644
--- a/demo/demo.c
+++ b/demo/demo.c
@@ -56,6 +56,7 @@ int main(void)
       gcd_n, lcm_n, inv_n, div2_n, mul2_n, add_d_n, sub_d_n, t;
    unsigned rr;
    int i, n, err, cnt, ix, old_kara_m, old_kara_s;
+   mp_digit mp;
 
 
    mp_init(&a);
@@ -68,6 +69,40 @@ int main(void)
    srand(time(NULL));
 
 #if 0
+   // test montgomery 
+   printf("Testing montgomery...\n");
+   for (i = 1; i < 10; i++) {
+      printf("Testing digit size: %d\n", i);
+      for (n = 0; n < 1000; n++) {
+         mp_rand(&a, i);
+         a.dp[0] |= 1;
+
+         // let's see if R is right
+         mp_montgomery_calc_normalization(&b, &a);
+         mp_montgomery_setup(&a, &mp);
+
+         // now test a random reduction 
+         for (ix = 0; ix < 100; ix++) {
+             mp_rand(&c, 1 + abs(rand()) % (2*i));
+             mp_copy(&c, &d);
+             mp_copy(&c, &e);
+
+             mp_mod(&d, &a, &d);
+             mp_montgomery_reduce(&c, &a, mp);
+             mp_mulmod(&c, &b, &a, &c);
+
+             if (mp_cmp(&c, &d) != MP_EQ) { 
+printf("d = e mod a, c = e MOD a\n");
+mp_todecimal(&a, buf); printf("a = %s\n", buf);
+mp_todecimal(&e, buf); printf("e = %s\n", buf);
+mp_todecimal(&d, buf); printf("d = %s\n", buf);
+mp_todecimal(&c, buf); printf("c = %s\n", buf);
+printf("compare no compare!\n"); exit(EXIT_FAILURE); }
+         }
+      }
+   }
+   printf("done\n");
+
    // test mp_get_int
    printf("Testing: mp_get_int\n");
    for (i = 0; i < 1000; ++i) {
@@ -139,7 +174,7 @@ int main(void)
    printf("\n\n");
 
    /* test for size */
-   for (ix = 10; ix < 256; ix++) {
+   for (ix = 10; ix < 128; ix++) {
       printf("Testing (not safe-prime): %9d bits    \r", ix);
       fflush(stdout);
       err =
@@ -156,7 +191,7 @@ int main(void)
       }
    }
 
-   for (ix = 16; ix < 256; ix++) {
+   for (ix = 16; ix < 128; ix++) {
       printf("Testing (   safe-prime): %9d bits    \r", ix);
       fflush(stdout);
       err =
@@ -235,7 +270,7 @@ int main(void)
 	 mp_rand(&b, (cnt / DIGIT_BIT + 1) * 2);
 	 mp_copy(&c, &b);
 	 mp_mod(&c, &a, &c);
-	 mp_reduce_2k(&b, &a, 1);
+	 mp_reduce_2k(&b, &a, 2);
 	 if (mp_cmp(&c, &b)) {
 	    printf("FAILED\n");
 	    exit(0);
diff --git a/makefile b/makefile
index ad8a71a..17873ee 100644
--- a/makefile
+++ b/makefile
@@ -3,7 +3,7 @@
 #Tom St Denis
 
 #version of library 
-VERSION=0.34
+VERSION=0.35
 
 CFLAGS  +=  -I./ -Wall -W -Wshadow -Wsign-compare
 
diff --git a/makefile.shared b/makefile.shared
index 23aa33d..7c35881 100644
--- a/makefile.shared
+++ b/makefile.shared
@@ -1,7 +1,7 @@
 #Makefile for GCC
 #
 #Tom St Denis
-VERSION=0:34
+VERSION=0:35
 
 CC = libtool --mode=compile gcc
 CFLAGS  +=  -I./ -Wall -W -Wshadow -Wsign-compare
diff --git a/poster.pdf b/poster.pdf
index 7354b4f..4c3e365 100644
Binary files a/poster.pdf and b/poster.pdf differ
diff --git a/pre_gen/mpi.c b/pre_gen/mpi.c
index f2341a5..8ec8a10 100644
--- a/pre_gen/mpi.c
+++ b/pre_gen/mpi.c
@@ -90,7 +90,7 @@ int fast_mp_invmod (mp_int * a, mp_int * b, mp_int * c)
   }
 
   /* we need y = |a| */
-  if ((res = mp_abs (a, &y)) != MP_OKAY) {
+  if ((res = mp_mod (a, b, &y)) != MP_OKAY) {
     goto LBL_ERR;
   }
 
@@ -430,7 +430,7 @@ int fast_s_mp_mul_digs (mp_int * a, mp_int * b, mp_int * c, int digs)
       tmpx = a->dp + tx;
       tmpy = b->dp + ty;
 
-      /* this is the number of times the loop will iterrate, essentially its 
+      /* this is the number of times the loop will iterrate, essentially 
          while (tx++ < a->used && ty-- >= 0) { ... }
        */
       iy = MIN(a->used-tx, ty+1);
@@ -448,16 +448,16 @@ int fast_s_mp_mul_digs (mp_int * a, mp_int * b, mp_int * c, int digs)
   }
 
   /* store final carry */
-  W[ix] = _W & MP_MASK;
+  W[ix] = (mp_digit)(_W & MP_MASK);
 
   /* setup dest */
   olduse  = c->used;
-  c->used = digs;
+  c->used = pa;
 
   {
     register mp_digit *tmpc;
     tmpc = c->dp;
-    for (ix = 0; ix < digs; ix++) {
+    for (ix = 0; ix < pa+1; ix++) {
       /* now extract the previous digit [below the carry] */
       *tmpc++ = W[ix];
     }
@@ -548,7 +548,7 @@ int fast_s_mp_mul_high_digs (mp_int * a, mp_int * b, mp_int * c, int digs)
   }
   
   /* store final carry */
-  W[ix] = _W & MP_MASK;
+  W[ix] = (mp_digit)(_W & MP_MASK);
 
   /* setup dest */
   olduse  = c->used;
@@ -593,33 +593,14 @@ int fast_s_mp_mul_high_digs (mp_int * a, mp_int * b, mp_int * c, int digs)
  * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
  */
 
-/* fast squaring
- *
- * This is the comba method where the columns of the product
- * are computed first then the carries are computed.  This
- * has the effect of making a very simple inner loop that
- * is executed the most
- *
- * W2 represents the outer products and W the inner.
- *
- * A further optimizations is made because the inner
- * products are of the form "A * B * 2".  The *2 part does
- * not need to be computed until the end which is good
- * because 64-bit shifts are slow!
- *
- * Based on Algorithm 14.16 on pp.597 of HAC.
- *
- */
 /* the jist of squaring...
-
-you do like mult except the offset of the tmpx [one that starts closer to zero]
-can't equal the offset of tmpy.  So basically you set up iy like before then you min it with
-(ty-tx) so that it never happens.  You double all those you add in the inner loop
+ * you do like mult except the offset of the tmpx [one that 
+ * starts closer to zero] can't equal the offset of tmpy.  
+ * So basically you set up iy like before then you min it with
+ * (ty-tx) so that it never happens.  You double all those 
+ * you add in the inner loop
 
 After that loop you do the squares and add them in.
-
-Remove W2 and don't memset W
-
 */
 
 int fast_s_mp_sqr (mp_int * a, mp_int * b)
@@ -654,7 +635,7 @@ int fast_s_mp_sqr (mp_int * a, mp_int * b)
       tmpx = a->dp + tx;
       tmpy = a->dp + ty;
 
-      /* this is the number of times the loop will iterrate, essentially its 
+      /* this is the number of times the loop will iterrate, essentially
          while (tx++ < a->used && ty-- >= 0) { ... }
        */
       iy = MIN(a->used-tx, ty+1);
@@ -679,7 +660,7 @@ int fast_s_mp_sqr (mp_int * a, mp_int * b)
       }
 
       /* store it */
-      W[ix] = _W & MP_MASK;
+      W[ix] = (mp_digit)(_W & MP_MASK);
 
       /* make next carry */
       W1 = _W >> ((mp_word)DIGIT_BIT);
@@ -2890,6 +2871,13 @@ int mp_exteuclid(mp_int *a, mp_int *b, mp_int *U1, mp_int *U2, mp_int *U3)
        if ((err = mp_copy(&t3, &v3)) != MP_OKAY)                                  { goto _ERR; }
    }
 
+   /* make sure U3 >= 0 */
+   if (u3.sign == MP_NEG) {
+      mp_neg(&u1, &u1);
+      mp_neg(&u2, &u2);
+      mp_neg(&u3, &u3);
+   }
+
    /* copy result out */
    if (U1 != NULL) { mp_exch(U1, &u1); }
    if (U2 != NULL) { mp_exch(U2, &u2); }
@@ -3564,8 +3552,8 @@ int mp_invmod_slow (mp_int * a, mp_int * b, mp_int * c)
   }
 
   /* x = a, y = b */
-  if ((res = mp_copy (a, &x)) != MP_OKAY) {
-    goto LBL_ERR;
+  if ((res = mp_mod(a, b, &x)) != MP_OKAY) {
+      goto LBL_ERR;
   }
   if ((res = mp_copy (b, &y)) != MP_OKAY) {
     goto LBL_ERR;
@@ -4493,7 +4481,6 @@ int mp_montgomery_calc_normalization (mp_int * a, mp_int * b)
   /* how many bits of last digit does b use */
   bits = mp_count_bits (b) % DIGIT_BIT;
 
-
   if (b->used > 1) {
      if ((res = mp_2expt (a, (b->used - 1) * DIGIT_BIT + bits - 1)) != MP_OKAY) {
         return res;
@@ -5206,12 +5193,18 @@ LBL_T1:mp_clear (&t1);
 int mp_neg (mp_int * a, mp_int * b)
 {
   int     res;
-  if ((res = mp_copy (a, b)) != MP_OKAY) {
-    return res;
+  if (a != b) {
+     if ((res = mp_copy (a, b)) != MP_OKAY) {
+        return res;
+     }
   }
+
   if (mp_iszero(b) != MP_YES) {
      b->sign = (a->sign == MP_ZPOS) ? MP_NEG : MP_ZPOS;
+  } else {
+     b->sign = MP_ZPOS;
   }
+
   return MP_OKAY;
 }
 #endif
@@ -5953,22 +5946,29 @@ int mp_radix_size (mp_int * a, int radix, int *size)
     return MP_VAL;
   }
 
-  /* init a copy of the input */
-  if ((res = mp_init_copy (&t, a)) != MP_OKAY) {
-    return res;
+  if (mp_iszero(a) == MP_YES) {
+     *size = 2;
+    return MP_OKAY;
   }
 
   /* digs is the digit count */
   digs = 0;
 
   /* if it's negative add one for the sign */
-  if (t.sign == MP_NEG) {
+  if (a->sign == MP_NEG) {
     ++digs;
-    t.sign = MP_ZPOS;
   }
 
+  /* init a copy of the input */
+  if ((res = mp_init_copy (&t, a)) != MP_OKAY) {
+    return res;
+  }
+
+  /* force temp to positive */
+  t.sign = MP_ZPOS; 
+
   /* fetch out all of the digits */
-  while (mp_iszero (&t) == 0) {
+  while (mp_iszero (&t) == MP_NO) {
     if ((res = mp_div_d (&t, (mp_digit) radix, &t, &d)) != MP_OKAY) {
       mp_clear (&t);
       return res;
@@ -6042,14 +6042,14 @@ mp_rand (mp_int * a, int digits)
 
   /* first place a random non-zero digit */
   do {
-    d = ((mp_digit) abs (rand ()));
+    d = ((mp_digit) abs (rand ())) & MP_MASK;
   } while (d == 0);
 
   if ((res = mp_add_d (a, d, a)) != MP_OKAY) {
     return res;
   }
 
-  while (digits-- > 0) {
+  while (--digits > 0) {
     if ((res = mp_lshd (a, 1)) != MP_OKAY) {
       return res;
     }
@@ -6287,11 +6287,11 @@ int mp_reduce (mp_int * x, mp_int * m, mp_int * mu)
     }
   } else {
 #ifdef BN_S_MP_MUL_HIGH_DIGS_C
-    if ((res = s_mp_mul_high_digs (&q, mu, &q, um - 1)) != MP_OKAY) {
+    if ((res = s_mp_mul_high_digs (&q, mu, &q, um)) != MP_OKAY) {
       goto CLEANUP;
     }
 #elif defined(BN_FAST_S_MP_MUL_HIGH_DIGS_C)
-    if ((res = fast_s_mp_mul_high_digs (&q, mu, &q, um - 1)) != MP_OKAY) {
+    if ((res = fast_s_mp_mul_high_digs (&q, mu, &q, um)) != MP_OKAY) {
       goto CLEANUP;
     }
 #else 
@@ -7433,9 +7433,10 @@ int mp_to_unsigned_bin_n (mp_int * a, unsigned char *b, unsigned long *outlen)
 
 /* multiplication using the Toom-Cook 3-way algorithm 
  *
- * Much more complicated than Karatsuba but has a lower asymptotic running time of 
- * O(N**1.464).  This algorithm is only particularly useful on VERY large
- * inputs (we're talking 1000s of digits here...).
+ * Much more complicated than Karatsuba but has a lower 
+ * asymptotic running time of O(N**1.464).  This algorithm is 
+ * only particularly useful on VERY large inputs 
+ * (we're talking 1000s of digits here...).
 */
 int mp_toom_mul(mp_int *a, mp_int *b, mp_int *c)
 {
@@ -8154,7 +8155,7 @@ mp_xor (mp_int * a, mp_int * b, mp_int * c)
   }
 
   for (ix = 0; ix < px; ix++) {
-
+     t.dp[ix] ^= x->dp[ix];
   }
   mp_clamp (&t);
   mp_exch (c, &t);
@@ -8184,12 +8185,18 @@ mp_xor (mp_int * a, mp_int * b, mp_int * c)
  */
 
 /* set to zero */
-void
-mp_zero (mp_int * a)
+void mp_zero (mp_int * a)
 {
+  int       n;
+  mp_digit *tmp;
+
   a->sign = MP_ZPOS;
   a->used = 0;
-  memset (a->dp, 0, sizeof (mp_digit) * a->alloc);
+
+  tmp = a->dp;
+  for (n = 0; n < a->alloc; n++) {
+     *tmp++ = 0;
+  }
 }
 #endif
 
@@ -8679,8 +8686,7 @@ LBL_M:
  * HAC pp. 595, Algorithm 14.12  Modified so you can control how 
  * many digits of output are created.
  */
-int
-s_mp_mul_digs (mp_int * a, mp_int * b, mp_int * c, int digs)
+int s_mp_mul_digs (mp_int * a, mp_int * b, mp_int * c, int digs)
 {
   mp_int  t;
   int     res, pa, pb, ix, iy;
@@ -8848,8 +8854,7 @@ s_mp_mul_high_digs (mp_int * a, mp_int * b, mp_int * c, int digs)
  */
 
 /* low level squaring, b = a*a, HAC pp.596-597, Algorithm 14.16 */
-int
-s_mp_sqr (mp_int * a, mp_int * b)
+int s_mp_sqr (mp_int * a, mp_int * b)
 {
   mp_int  t;
   int     res, ix, iy, pa;
diff --git a/tombc/grammar.txt b/tombc/grammar.txt
new file mode 100644
index 0000000..a780e75
--- /dev/null
+++ b/tombc/grammar.txt
@@ -0,0 +1,35 @@
+program       := program statement | statement | empty
+statement     := { statement }                                                                              | 
+                 identifier = numexpression;                                                                | 
+                 identifier[numexpression] = numexpression;                                                 |
+                 function(expressionlist);                                                                  | 
+                 for (identifer = numexpression; numexpression; identifier = numexpression) { statement }   |
+                 while (numexpression) { statement }                                                        | 
+                 if (numexpresion) { statement } elif                                                       | 
+                 break;                                                                                     | 
+                 continue;                                                                                  
+                 
+elif          := else statement | empty
+function      := abs | countbits | exptmod | jacobi | print | isprime | nextprime | issquare | readinteger | exit
+expressionlist := expressionlist, expression | expression
+
+// LR(1) !!!?
+expression    := string | numexpression
+numexpression := cmpexpr && cmpexpr | cmpexpr \|\| cmpexpr | cmpexpr
+cmpexpr       := boolexpr  < boolexpr | boolexpr  > boolexpr | boolexpr == boolexpr | 
+                 boolexpr <= boolexpr | boolexpr >= boolexpr | boolexpr
+boolexpr      := shiftexpr & shiftexpr | shiftexpr ^ shiftexpr | shiftexpr \| shiftexpr | shiftexpr
+shiftexpr     := addsubexpr << addsubexpr | addsubexpr >> addsubexpr | addsubexpr
+addsubexpr    := mulexpr + mulexpr | mulexpr - mulexpr | mulexpr
+mulexpr       := expr * expr       | expr / expr | expr % expr | expr
+expr          := -nexpr | nexpr 
+nexpr         := integer | identifier | ( numexpression ) | identifier[numexpression] 
+
+identifier    := identifer digits | identifier alpha | alpha
+alpha         := a ... z | A ... Z
+integer       := hexnumber | digits 
+hexnumber     := 0xhexdigits
+hexdigits     := hexdigits hexdigit | hexdigit
+hexdigit      := 0 ... 9 | a ... f | A ... F
+digits        := digits digit | digit 
+digit         := 0 ... 9
diff --git a/tommath.pdf b/tommath.pdf
index 310bbbd..c486d29 100644
Binary files a/tommath.pdf and b/tommath.pdf differ
diff --git a/tommath.src b/tommath.src
index 11bc1f6..7a53860 100644
--- a/tommath.src
+++ b/tommath.src
@@ -49,7 +49,7 @@
 \begin{document}
 \frontmatter
 \pagestyle{empty}
-\title{Implementing Multiple Precision Arithmetic \\ ~ \\ Draft Edition }
+\title{Multi--Precision Math}
 \author{\mbox{
 %\begin{small}
 \begin{tabular}{c}
@@ -66,7 +66,7 @@ QUALCOMM Australia \\
 }
 }
 \maketitle
-This text has been placed in the public domain.  This text corresponds to the v0.34 release of the 
+This text has been placed in the public domain.  This text corresponds to the v0.35 release of the 
 LibTomMath project.
 
 \begin{alltt}
@@ -85,66 +85,32 @@ This text is formatted to the international B5 paper size of 176mm wide by 250mm
 
 \tableofcontents
 \listoffigures
-\chapter*{Prefaces to the Draft Edition}
-I started this text in April 2003 to complement my LibTomMath library.  That is, explain how to implement the functions
-contained in LibTomMath.  The goal is to have a textbook that any Computer Science student can use when implementing their
-own multiple precision arithmetic.  The plan I wanted to follow was flesh out all the
-ideas and concepts I had floating around in my head and then work on it afterwards refining a little bit at a time.  Chance
-would have it that I ended up with my summer off from Algonquin College and I was given four months solid to work on the
-text.  
+\chapter*{Prefaces}
+When I tell people about my LibTom projects and that I release them as public domain they are often puzzled.  
+They ask why I did it and especially why I continue to work on them for free.  The best I can explain it is ``Because I can.''  
+Which seems odd and perhaps too terse for adult conversation. I often qualify it with ``I am able, I am willing.'' which 
+perhaps explains it better.  I am the first to admit there is not anything that special with what I have done.  Perhaps
+others can see that too and then we would have a society to be proud of.  My LibTom projects are what I am doing to give 
+back to society in the form of tools and knowledge that can help others in their endeavours.
 
-Choosing to not waste any time I dove right into the project even before my spring semester was finished.  I wrote a bit
-off and on at first.  The moment my exams were finished I jumped into long 12 to 16 hour days.  The result after only
-a couple of months was a ten chapter, three hundred page draft that I quickly had distributed to anyone who wanted
-to read it.  I had Jean-Luc Cooke print copies for me and I brought them to Crypto'03 in Santa Barbara.  So far I have
-managed to grab a certain level of attention having people from around the world ask me for copies of the text was certain
-rewarding.
+I started writing this book because it was the most logical task to further my goal of open academia.  The LibTomMath source
+code itself was written to be easy to follow and learn from.  There are times, however, where pure C source code does not
+explain the algorithms properly.  Hence this book.  The book literally starts with the foundation of the library and works
+itself outwards to the more complicated algorithms.  The use of both pseudo--code and verbatim source code provides a duality
+of ``theory'' and ``practice'' that the computer science students of the world shall appreciate.  I never deviate too far
+from relatively straightforward algebra and I hope that this book can be a valuable learning asset.
 
-Now we are past December 2003.  By this time I had pictured that I would have at least finished my second draft of the text.  
-Currently I am far off from this goal.  I've done partial re-writes of chapters one, two and three but they are not even
-finished yet.  I haven't given up on the project, only had some setbacks.  First O'Reilly declined to publish the text then
-Addison-Wesley and Greg is tried another which I don't know the name of.  However, at this point I want to focus my energy
-onto finishing the book not securing a contract.
+This book and indeed much of the LibTom projects would not exist in their current form if it was not for a plethora
+of kind people donating their time, resources and kind words to help support my work.  Writing a text of significant
+length (along with the source code) is a tiresome and lengthy process.  Currently the LibTom project is four years old,
+comprises of literally thousands of users and over 100,000 lines of source code, TeX and other material.  People like Mads and Greg 
+were there at the beginning to encourage me to work well.  It is amazing how timely validation from others can boost morale to 
+continue the project. Definitely my parents were there for me by providing room and board during the many months of work in 2003.  
 
-So why am I writing this text?  It seems like a lot of work right?  Most certainly it is a lot of work writing a textbook.  
-Even the simplest introductory material has to be lined with references and figures.  A lot of the text has to be re-written
-from point form to prose form to ensure an easier read.  Why am I doing all this work for free then?  Simple. My philosophy
-is quite simply ``Open Source.  Open Academia.  Open Minds'' which means that to achieve a goal of open minds, that is,
-people willing to accept new ideas and explore the unknown you have to make available material they can access freely 
-without hinderance.  
+To my many friends whom I have met through the years I thank you for the good times and the words of encouragement.  I hope I
+honour your kind gestures with this project.
 
-I've been writing free software since I was about sixteen but only recently have I hit upon software that people have come
-to depend upon.  I started LibTomCrypt in December 2001 and now several major companies use it as integral portions of their
-software.  Several educational institutions use it as a matter of course and many freelance developers use it as
-part of their projects.  To further my contributions I started the LibTomMath project in December 2002 aimed at providing
-multiple precision arithmetic routines that students could learn from.  That is write routines that are not only easy
-to understand and follow but provide quite impressive performance considering they are all in standard portable ISO C.  
-
-The second leg of my philosophy is ``Open Academia'' which is where this textbook comes in.  In the end, when all is
-said and done the text will be useable by educational institutions as a reference on multiple precision arithmetic.  
-
-At this time I feel I should share a little information about myself.  The most common question I was asked at 
-Crypto'03, perhaps just out of professional courtesy, was which school I either taught at or attended.  The unfortunate
-truth is that I neither teach at or attend a school of academic reputation.  I'm currently at Algonquin College which 
-is what I'd like to call ``somewhat academic but mostly vocational'' college.  In otherwords, job training.
-
-I'm a 21 year old computer science student mostly self-taught in the areas I am aware of (which includes a half-dozen
-computer science fields, a few fields of mathematics and some English).  I look forward to teaching someday but I am
-still far off from that goal.  
-
-Now it would be improper for me to not introduce the rest of the texts co-authors.  While they are only contributing 
-corrections and editorial feedback their support has been tremendously helpful in presenting the concepts laid out
-in the text so far.  Greg has always been there for me.  He has tracked my LibTom projects since their inception and even
-sent cheques to help pay tuition from time to time.  His background has provided a wonderful source to bounce ideas off
-of and improve the quality of my writing.  Mads is another fellow who has just ``been there''.  I don't even recall what
-his interest in the LibTom projects is but I'm definitely glad he has been around.  His ability to catch logical errors
-in my written English have saved me on several occasions to say the least.
-
-What to expect next?  Well this is still a rough draft.  I've only had the chance to update a few chapters.  However, I've
-been getting the feeling that people are starting to use my text and I owe them some updated material.  My current tenative
-plan is to edit one chapter every two weeks starting January 4th.  It seems insane but my lower course load at college
-should provide ample time.  By Crypto'04 I plan to have a 2nd draft of the text polished and ready to hand out to as many
-people who will take it.
+Open Source.  Open Academia.  Open Minds.
 
 \begin{flushright} Tom St Denis \end{flushright}
 
@@ -937,7 +903,7 @@ assumed to contain undefined values they are initially set to zero.
 
 EXAM,bn_mp_grow.c
 
-A quick optimization is to first determine if a memory re-allocation is required at all.  The if statement (line @23,if@) checks
+A quick optimization is to first determine if a memory re-allocation is required at all.  The if statement (line @24,alloc@) checks
 if the \textbf{alloc} member of the mp\_int is smaller than the requested digit count.  If the count is not larger than \textbf{alloc}
 the function skips the re-allocation part thus saving time.
 
@@ -1310,7 +1276,7 @@ After the function is completed, all of the digits are zeroed, the \textbf{used}
 With the mp\_int representation of an integer, calculating the absolute value is trivial.  The mp\_abs algorithm will compute
 the absolute value of an mp\_int.
 
-\newpage\begin{figure}[here]
+\begin{figure}[here]
 \begin{center}
 \begin{tabular}{l}
 \hline Algorithm \textbf{mp\_abs}. \\
@@ -1335,6 +1301,9 @@ logic to handle it.
 
 EXAM,bn_mp_abs.c
 
+This fairly trivial algorithm first eliminates non--required duplications (line @27,a != b@) and then sets the
+\textbf{sign} flag to \textbf{MP\_ZPOS}.
+
 \subsection{Integer Negation}
 With the mp\_int representation of an integer, calculating the negation is also trivial.  The mp\_neg algorithm will compute
 the negative of an mp\_int input.
@@ -1368,11 +1337,15 @@ zero as negative.
 
 EXAM,bn_mp_neg.c
 
+Like mp\_abs() this function avoids non--required duplications (line @21,a != b@) and then sets the sign.  We
+have to make sure that only non--zero values get a \textbf{sign} of \textbf{MP\_NEG}.  If the mp\_int is zero
+than the \textbf{sign} is hard--coded to \textbf{MP\_ZPOS}.
+
 \section{Small Constants}
 \subsection{Setting Small Constants}
 Often a mp\_int must be set to a relatively small value such as $1$ or $2$.  For these cases the mp\_set algorithm is useful.
 
-\begin{figure}[here]
+\newpage\begin{figure}[here]
 \begin{center}
 \begin{tabular}{l}
 \hline Algorithm \textbf{mp\_set}. \\
@@ -1397,11 +1370,14 @@ single digit is set (\textit{modulo $\beta$}) and the \textbf{used} count is adj
 
 EXAM,bn_mp_set.c
 
-Line @21,mp_zero@ calls mp\_zero() to clear the mp\_int and reset the sign.  Line @22,MP_MASK@ copies the digit 
-into the least significant location.  Note the usage of a new constant \textbf{MP\_MASK}.  This constant is used to quickly
-reduce an integer modulo $\beta$.  Since $\beta$ is of the form $2^k$ for any suitable $k$ it suffices to perform a binary AND with 
-$MP\_MASK = 2^k - 1$ to perform the reduction.  Finally line @23,a->used@ will set the \textbf{used} member with respect to the 
-digit actually set. This function will always make the integer positive.
+First we zero (line @21,mp_zero@) the mp\_int to make sure that the other members are initialized for a 
+small positive constant.  mp\_zero() ensures that the \textbf{sign} is positive and the \textbf{used} count
+is zero.  Next we set the digit and reduce it modulo $\beta$ (line @22,MP_MASK@).  After this step we have to 
+check if the resulting digit is zero or not.  If it is not then we set the \textbf{used} count to one, otherwise
+to zero.
+
+We can quickly reduce modulo $\beta$ since it is of the form $2^k$ and a quick binary AND operation with 
+$2^k - 1$ will perform the same operation.
 
 One important limitation of this function is that it will only set one digit.  The size of a digit is not fixed, meaning source that uses 
 this function should take that into account.  Only trivially small constants can be set using this function.
@@ -1503,10 +1479,12 @@ the zero'th digit.  If after all of the digits have been compared, no difference
 
 EXAM,bn_mp_cmp_mag.c
 
-The two if statements on lines @24,if@ and @28,if@ compare the number of digits in the two inputs.  These two are performed before all of the digits
-are compared since it is a very cheap test to perform and can potentially save considerable time.  The implementation given is also not valid 
-without those two statements.  $b.alloc$ may be smaller than $a.used$, meaning that undefined values will be read from $b$ past the end of the 
-array of digits.
+The two if statements (lines @24,if@ and @28,if@) compare the number of digits in the two inputs.  These two are 
+performed before all of the digits are compared since it is a very cheap test to perform and can potentially save 
+considerable time.  The implementation given is also not valid without those two statements.  $b.alloc$ may be 
+smaller than $a.used$, meaning that undefined values will be read from $b$ past the end of the array of digits.
+
+
 
 \subsection{Signed Comparisons}
 Comparing with sign considerations is also fairly critical in several routines (\textit{division for example}).  Based on an unsigned magnitude 
@@ -1539,9 +1517,9 @@ $\vert a \vert < \vert b \vert$.  Step number four will compare the two when the
 
 EXAM,bn_mp_cmp.c
 
-The two if statements on lines @22,if@ and @26,if@ perform the initial sign comparison.  If the signs are not the equal then which ever
-has the positive sign is larger.   At line @30,if@, the inputs are compared based on magnitudes.  If the signs were both negative then 
-the unsigned comparison is performed in the opposite direction (\textit{line @31,mp_cmp_mag@}).  Otherwise, the signs are assumed to 
+The two if statements (lines @22,if@ and @26,if@) perform the initial sign comparison.  If the signs are not the equal then which ever
+has the positive sign is larger.   The inputs are compared (line @30,if@) based on magnitudes.  If the signs were both 
+negative then the unsigned comparison is performed in the opposite direction (line @31,mp_cmp_mag@).  Otherwise, the signs are assumed to 
 be both positive and a forward direction unsigned comparison is performed.
 
 \section*{Exercises}
@@ -1664,19 +1642,21 @@ The final carry is stored in $c_{max}$ and digits above $max$ upto $oldused$ are
 
 EXAM,bn_s_mp_add.c
 
-Lines @27,if@ to @35,}@ perform the initial sorting of the inputs and determine the $min$ and $max$ variables.  Note that $x$ is a pointer to a 
-mp\_int assigned to the largest input, in effect it is a local alias.  Lines @37,init@ to @42,}@ ensure that the destination is grown to 
-accomodate the result of the addition. 
+We first sort (lines @27,if@ to @35,}@) the inputs based on magnitude and determine the $min$ and $max$ variables.
+Note that $x$ is a pointer to an mp\_int assigned to the largest input, in effect it is a local alias.  Next we
+grow the destination (@37,init@ to @42,}@) ensure that it can accomodate the result of the addition. 
 
 Similar to the implementation of mp\_copy this function uses the braced code and local aliases coding style.  The three aliases that are on 
 lines @56,tmpa@, @59,tmpb@ and @62,tmpc@ represent the two inputs and destination variables respectively.  These aliases are used to ensure the
 compiler does not have to dereference $a$, $b$ or $c$ (respectively) to access the digits of the respective mp\_int.
 
-The initial carry $u$ is cleared on line @65,u = 0@, note that $u$ is of type mp\_digit which ensures type compatibility within the 
-implementation.  The initial addition loop begins on line @66,for@ and ends on line @75,}@.  Similarly the conditional addition loop
-begins on line @81,for@ and ends on line @90,}@.  The addition is finished with the final carry being stored in $tmpc$ on line @94,tmpc++@.  
-Note the ``++'' operator on the same line.  After line @94,tmpc++@ $tmpc$ will point to the $c.used$'th digit of the mp\_int $c$.  This is useful
-for the next loop on lines @97,for@ to @99,}@ which set any old upper digits to zero.
+The initial carry $u$ will be cleared (line @65,u = 0@), note that $u$ is of type mp\_digit which ensures type 
+compatibility within the implementation.  The initial addition (line @66,for@ to @75,}@) adds digits from
+both inputs until the smallest input runs out of digits.  Similarly the conditional addition loop
+(line @81,for@ to @90,}@) adds the remaining digits from the larger of the two inputs.  The addition is finished 
+with the final carry being stored in $tmpc$ (line @94,tmpc++@).  Note the ``++'' operator within the same expression.
+After line @94,tmpc++@, $tmpc$ will point to the $c.used$'th digit of the mp\_int $c$.  This is useful
+for the next loop (line @97,for@ to @99,}@) which set any old upper digits to zero.
 
 \subsection{Low Level Subtraction}
 The low level unsigned subtraction algorithm is very similar to the low level unsigned addition algorithm.  The principle difference is that the
@@ -1692,7 +1672,7 @@ this algorithm we will assume that the variable $\gamma$ represents the number o
 mp\_digit (\textit{this implies $2^{\gamma} > \beta$}).  
 
 For example, the default for LibTomMath is to use a ``unsigned long'' for the mp\_digit ``type'' while $\beta = 2^{28}$.  In ISO C an ``unsigned long''
-data type must be able to represent $0 \le x < 2^{32}$ meaning that in this case $\gamma = 32$.
+data type must be able to represent $0 \le x < 2^{32}$ meaning that in this case $\gamma \ge 32$.
 
 \newpage\begin{figure}[!here]
 \begin{center}
@@ -1759,20 +1739,23 @@ If $b$ has a smaller magnitude than $a$ then step 9 will force the carry and cop
 
 EXAM,bn_s_mp_sub.c
 
-Line @24,min@ and @25,max@ perform the initial hardcoded sorting of the inputs.  In reality the $min$ and $max$ variables are only aliases and are only 
-used to make the source code easier to read.  Again the pointer alias optimization is used within this algorithm.  Lines @42,tmpa@, @43,tmpb@ and @44,tmpc@ initialize the aliases for 
-$a$, $b$ and $c$ respectively.
+Like low level addition we ``sort'' the inputs.  Except in this case the sorting is hardcoded 
+(lines @24,min@ and @25,max@).  In reality the $min$ and $max$ variables are only aliases and are only 
+used to make the source code easier to read.  Again the pointer alias optimization is used 
+within this algorithm.  The aliases $tmpa$, $tmpb$ and $tmpc$ are initialized
+(lines @42,tmpa@, @43,tmpb@ and @44,tmpc@) for $a$, $b$ and $c$ respectively.
 
-The first subtraction loop occurs on lines @47,u = 0@ through @61,}@.  The theory behind the subtraction loop is exactly the same as that for
-the addition loop.  As remarked earlier there is an implementation reason for using the ``awkward'' method of extracting the carry 
-(\textit{see line @57, >>@}).  The traditional method for extracting the carry would be to shift by $lg(\beta)$ positions and logically AND 
-the least significant bit.  The AND operation is required because all of the bits above the $\lg(\beta)$'th bit will be set to one after a carry
-occurs from subtraction.  This carry extraction requires two relatively cheap operations to extract the carry.  The other method is to simply 
-shift the most significant bit to the least significant bit thus extracting the carry with a single cheap operation.  This optimization only works on
-twos compliment machines which is a safe assumption to make.
+The first subtraction loop (lines @47,u = 0@ through @61,}@) subtract digits from both inputs until the smaller of
+the two inputs has been exhausted.  As remarked earlier there is an implementation reason for using the ``awkward'' 
+method of extracting the carry (line @57, >>@).  The traditional method for extracting the carry would be to shift 
+by $lg(\beta)$ positions and logically AND the least significant bit.  The AND operation is required because all of 
+the bits above the $\lg(\beta)$'th bit will be set to one after a carry occurs from subtraction.  This carry 
+extraction requires two relatively cheap operations to extract the carry.  The other method is to simply shift the 
+most significant bit to the least significant bit thus extracting the carry with a single cheap operation.  This 
+optimization only works on twos compliment machines which is a safe assumption to make.
 
-If $a$ has a larger magnitude than $b$ an additional loop (\textit{see lines @64,for@ through @73,}@}) is required to propagate the carry through
-$a$ and copy the result to $c$.  
+If $a$ has a larger magnitude than $b$ an additional loop (lines @64,for@ through @73,}@) is required to propagate 
+the carry through $a$ and copy the result to $c$.  
 
 \subsection{High Level Addition}
 Now that both lower level addition and subtraction algorithms have been established an effective high level signed addition algorithm can be
@@ -2098,10 +2081,11 @@ FIGU,sliding_window,Sliding Window Movement
 
 EXAM,bn_mp_lshd.c
 
-The if statement on line @24,if@ ensures that the $b$ variable is greater than zero.  The \textbf{used} count is incremented by $b$ before
-the copy loop begins.  This elminates the need for an additional variable in the for loop.  The variable $top$ on line @42,top@ is an alias
-for the leading digit while $bottom$ on line @45,bottom@ is an alias for the trailing edge.  The aliases form a window of exactly $b$ digits
-over the input.  
+The if statement (line @24,if@) ensures that the $b$ variable is greater than zero since we do not interpret negative
+shift counts properly.  The \textbf{used} count is incremented by $b$ before the copy loop begins.  This elminates 
+the need for an additional variable in the for loop.  The variable $top$ (line @42,top@) is an alias
+for the leading digit while $bottom$ (line @45,bottom@) is an alias for the trailing edge.  The aliases form a 
+window of exactly $b$ digits over the input.  
 
 \subsection{Division by $x$}
 
@@ -2151,9 +2135,9 @@ Once the window copy is complete the upper digits must be zeroed and the \textbf
 
 EXAM,bn_mp_rshd.c
 
-The only noteworthy element of this routine is the lack of a return type.  
-
--- Will update later to give it a return type...Tom
+The only noteworthy element of this routine is the lack of a return type since it cannot fail.  Like mp\_lshd() we
+form a sliding window except we copy in the other direction.  After the window (line @59,for (;@) we then zero
+the upper digits of the input to make sure the result is correct.
 
 \section{Powers of Two}
 
@@ -2214,7 +2198,15 @@ complete.  It is possible to optimize this algorithm down to a $O(n)$ algorithm
 
 EXAM,bn_mp_mul_2d.c
 
-Notes to be revised when code is updated. -- Tom
+The shifting is performed in--place which means the first step (line @24,a != c@) is to copy the input to the 
+destination.  We avoid calling mp\_copy() by making sure the mp\_ints are different.  The destination then
+has to be grown (line @31,grow@) to accomodate the result.
+
+If the shift count $b$ is larger than $lg(\beta)$ then a call to mp\_lshd() is used to handle all of the multiples 
+of $lg(\beta)$.  Leaving only a remaining shift of $lg(\beta) - 1$ or fewer bits left.  Inside the actual shift 
+loop (lines @45,if@ to @76,}@) we make use of pre--computed values $shift$ and $mask$.   These are used to
+extract the carry bit(s) to pass into the next iteration of the loop.  The $r$ and $rr$ variables form a 
+chain between consecutive iterations to propagate the carry.  
 
 \subsection{Division by Power of Two}
 
@@ -2263,7 +2255,8 @@ ignored by passing \textbf{NULL} as the pointer to the mp\_int variable.    The
 result of the remainder operation until the end.  This allows $d$ and $a$ to represent the same mp\_int without modifying $a$ before
 the quotient is obtained.
 
-The remainder of the source code is essentially the same as the source code for mp\_mul\_2d.  (-- Fix this paragraph up later, Tom).
+The remainder of the source code is essentially the same as the source code for mp\_mul\_2d.  The only significant difference is
+the direction of the shifts.
 
 \subsection{Remainder of Division by Power of Two}
 
@@ -2306,7 +2299,13 @@ is copied to $b$, leading digits are removed and the remaining leading digit is
 
 EXAM,bn_mp_mod_2d.c
 
--- Add comments later, Tom.
+We first avoid cases of $b \le 0$ by simply mp\_zero()'ing the destination in such cases.  Next if $2^b$ is larger
+than the input we just mp\_copy() the input and return right away.  After this point we know we must actually
+perform some work to produce the remainder.
+
+Recalling that reducing modulo $2^k$ and a binary ``and'' with $2^k - 1$ are numerically equivalent we can quickly reduce 
+the number.  First we zero any digits above the last digit in $2^b$ (line @41,for@).  Next we reduce the 
+leading digit of both (line @45,&=@) and then mp\_clamp().
 
 \section*{Exercises}
 \begin{tabular}{cl}
@@ -2464,33 +2463,46 @@ exceed the precision requested.
 
 EXAM,bn_s_mp_mul_digs.c
 
-Lines @31,if@ to @35,}@ determine if the Comba method can be used first.  The conditions for using the Comba routine are that min$(a.used, b.used) < \delta$ and
-the number of digits of output is less than \textbf{MP\_WARRAY}.  This new constant is used to control 
-the stack usage in the Comba routines.  By default it is set to $\delta$ but can be reduced when memory is at a premium.
+First we determine (line @30,if@) if the Comba method can be used first since it's faster.  The conditions for 
+sing the Comba routine are that min$(a.used, b.used) < \delta$ and the number of digits of output is less than 
+\textbf{MP\_WARRAY}.  This new constant is used to control the stack usage in the Comba routines.  By default it is 
+set to $\delta$ but can be reduced when memory is at a premium.
 
-Of particular importance is the calculation of the $ix+iy$'th column on lines @64,mp_word@, @65,mp_word@ and @66,mp_word@.  Note how all of the
-variables are cast to the type \textbf{mp\_word}, which is also the type of variable $\hat r$.  That is to ensure that double precision operations 
-are used instead of single precision.  The multiplication on line @65,) * (@ makes use of a specific GCC optimizer behaviour.  On the outset it looks like 
-the compiler will have to use a double precision multiplication to produce the result required.  Such an operation would be horribly slow on most 
-processors and drag this to a crawl.  However, GCC is smart enough to realize that double wide output single precision multipliers can be used.  For 
-example, the instruction ``MUL'' on the x86 processor can multiply two 32-bit values and produce a 64-bit result.  
+If we cannot use the Comba method we proceed to setup the baseline routine.  We allocate the the destination mp\_int
+$t$ (line @36,init@) to the exact size of the output to avoid further re--allocations.  At this point we now 
+begin the $O(n^2)$ loop.
+
+This implementation of multiplication has the caveat that it can be trimmed to only produce a variable number of
+digits as output.  In each iteration of the outer loop the $pb$ variable is set (line @48,MIN@) to the maximum 
+number of inner loop iterations.  
+
+Inside the inner loop we calculate $\hat r$ as the mp\_word product of the two mp\_digits and the addition of the
+carry from the previous iteration.  A particularly important observation is that most modern optimizing 
+C compilers (GCC for instance) can recognize that a $N \times N \rightarrow 2N$ multiplication is all that 
+is required for the product.  In x86 terms for example, this means using the MUL instruction.
+
+Each digit of the product is stored in turn (line @68,tmpt@) and the carry propagated (line @71,>>@) to the 
+next iteration.
 
 \subsection{Faster Multiplication by the ``Comba'' Method}
 MARK,COMBA
 
-One of the huge drawbacks of the ``baseline'' algorithms is that at the $O(n^2)$ level the carry must be computed and propagated upwards.  This
-makes the nested loop very sequential and hard to unroll and implement in parallel.  The ``Comba'' \cite{COMBA} method is named after little known 
-(\textit{in cryptographic venues}) Paul G. Comba who described a method of implementing fast multipliers that do not require nested 
-carry fixup operations.  As an interesting aside it seems that Paul Barrett describes a similar technique in
-his 1986 paper \cite{BARRETT} written five years before.
+One of the huge drawbacks of the ``baseline'' algorithms is that at the $O(n^2)$ level the carry must be 
+computed and propagated upwards.  This makes the nested loop very sequential and hard to unroll and implement 
+in parallel.  The ``Comba'' \cite{COMBA} method is named after little known (\textit{in cryptographic venues}) Paul G. 
+Comba who described a method of implementing fast multipliers that do not require nested carry fixup operations.  As an 
+interesting aside it seems that Paul Barrett describes a similar technique in his 1986 paper \cite{BARRETT} written 
+five years before.
 
-At the heart of the Comba technique is once again the long-hand algorithm.  Except in this case a slight twist is placed on how
-the columns of the result are produced.  In the standard long-hand algorithm rows of products are produced then added together to form the 
-final result.  In the baseline algorithm the columns are added together after each iteration to get the result instantaneously.  
+At the heart of the Comba technique is once again the long-hand algorithm.  Except in this case a slight 
+twist is placed on how the columns of the result are produced.  In the standard long-hand algorithm rows of products 
+are produced then added together to form the final result.  In the baseline algorithm the columns are added together 
+after each iteration to get the result instantaneously.  
 
-In the Comba algorithm the columns of the result are produced entirely independently of each other.  That is at the $O(n^2)$ level a 
-simple multiplication and addition step is performed.  The carries of the columns are propagated after the nested loop to reduce the amount
-of work requiored. Succintly the first step of the algorithm is to compute the product vector $\vec x$ as follows. 
+In the Comba algorithm the columns of the result are produced entirely independently of each other.  That is at 
+the $O(n^2)$ level a simple multiplication and addition step is performed.  The carries of the columns are propagated 
+after the nested loop to reduce the amount of work requiored. Succintly the first step of the algorithm is to compute 
+the product vector $\vec x$ as follows. 
 
 \begin{equation}
 \vec x_n = \sum_{i+j = n} a_ib_j, \forall n \in \lbrace 0, 1, 2, \ldots, i + j \rbrace
@@ -2584,38 +2596,32 @@ $256$ digits would allow for numbers in the range of $0 \le x < 2^{7168}$ which,
 \textbf{Input}.   mp\_int $a$, mp\_int $b$ and an integer $digs$ \\
 \textbf{Output}.  $c \leftarrow \vert a \vert \cdot \vert b \vert \mbox{ (mod }\beta^{digs}\mbox{)}$. \\
 \hline \\
-Place an array of \textbf{MP\_WARRAY} double precision digits named $\hat W$ on the stack. \\
+Place an array of \textbf{MP\_WARRAY} single precision digits named $W$ on the stack. \\
 1.  If $c.alloc < digs$ then grow $c$ to $digs$ digits. (\textit{mp\_grow}) \\
 2.  If step 1 failed return(\textit{MP\_MEM}).\\
 \\
-Zero the temporary array $\hat W$. \\
-3.  for $n$ from $0$ to $digs - 1$ do \\
-\hspace{3mm}3.1  $\hat W_n \leftarrow 0$ \\
+3.  $pa \leftarrow \mbox{MIN}(digs, a.used + b.used)$ \\
 \\
-Compute the columns. \\
-4.  for $ix$ from $0$ to $a.used - 1$ do \\
-\hspace{3mm}4.1  $pb \leftarrow \mbox{min}(b.used, digs - ix)$ \\
-\hspace{3mm}4.2  If $pb < 1$ then goto step 5. \\
-\hspace{3mm}4.3  for $iy$ from $0$ to $pb - 1$ do \\
-\hspace{6mm}4.3.1  $\hat W_{ix+iy} \leftarrow \hat W_{ix+iy} + a_{ix}b_{iy}$ \\
+4.  $\_ \hat W \leftarrow 0$ \\
+5.  for $ix$ from 0 to $pa - 1$ do \\
+\hspace{3mm}5.1  $ty \leftarrow \mbox{MIN}(b.used - 1, ix)$ \\
+\hspace{3mm}5.2  $tx \leftarrow ix - ty$ \\
+\hspace{3mm}5.3  $iy \leftarrow \mbox{MIN}(a.used - tx, ty + 1)$ \\
+\hspace{3mm}5.4  for $iz$ from 0 to $iy - 1$ do \\
+\hspace{6mm}5.4.1  $\_ \hat W \leftarrow \_ \hat W + a_{tx+iy}b_{ty-iy}$ \\
+\hspace{3mm}5.5  $W_{ix} \leftarrow \_ \hat W (\mbox{mod }\beta)$\\
+\hspace{3mm}5.6  $\_ \hat W \leftarrow \lfloor \_ \hat W / \beta \rfloor$ \\
+6.  $W_{pa} \leftarrow \_ \hat W (\mbox{mod }\beta)$ \\
 \\
-Propagate the carries upwards. \\
-5.  $oldused \leftarrow c.used$ \\
-6.  $c.used \leftarrow digs$ \\
-7.  If $digs > 1$ then do \\
-\hspace{3mm}7.1.  for $ix$ from $1$ to $digs - 1$ do \\
-\hspace{6mm}7.1.1  $\hat W_{ix} \leftarrow \hat W_{ix} + \lfloor \hat W_{ix-1} / \beta \rfloor$ \\
-\hspace{6mm}7.1.2  $c_{ix - 1} \leftarrow \hat W_{ix - 1} \mbox{ (mod }\beta\mbox{)}$ \\
-8.  else do \\
-\hspace{3mm}8.1  $ix \leftarrow 0$ \\
-9.  $c_{ix} \leftarrow \hat W_{ix} \mbox{ (mod }\beta\mbox{)}$ \\
+7.  $oldused \leftarrow c.used$ \\
+8.  $c.used \leftarrow digs$ \\
+9.  for $ix$ from $0$ to $pa$ do \\
+\hspace{3mm}9.1  $c_{ix} \leftarrow W_{ix}$ \\
+10.  for $ix$ from $pa + 1$ to $oldused - 1$ do \\
+\hspace{3mm}10.1 $c_{ix} \leftarrow 0$ \\
 \\
-Zero excess digits. \\
-10.  If $digs < oldused$ then do \\
-\hspace{3mm}10.1  for $n$ from $digs$ to $oldused - 1$ do \\
-\hspace{6mm}10.1.1  $c_n \leftarrow 0$ \\
-11.  Clamp excessive digits of $c$.  (\textit{mp\_clamp}) \\
-12.  Return(\textit{MP\_OKAY}). \\
+11.  Clamp $c$. \\
+12.  Return MP\_OKAY. \\
 \hline
 \end{tabular}
 \end{center}
@@ -2625,15 +2631,24 @@ Zero excess digits. \\
 \end{figure}
 
 \textbf{Algorithm fast\_s\_mp\_mul\_digs.}
-This algorithm performs the unsigned multiplication of $a$ and $b$ using the Comba method limited to $digs$ digits of precision.  The algorithm
-essentially peforms the same calculation as algorithm s\_mp\_mul\_digs, just much faster.
+This algorithm performs the unsigned multiplication of $a$ and $b$ using the Comba method limited to $digs$ digits of precision.
 
-The array $\hat W$ is meant to be on the stack when the algorithm is used.  The size of the array does not change which is ideal.  Note also that 
-unlike algorithm s\_mp\_mul\_digs no temporary mp\_int is required since the result is calculated directly in $\hat W$.  
+The outer loop of this algorithm is more complicated than that of the baseline multiplier.  This is because on the inside of the 
+loop we want to produce one column per pass.  This allows the accumulator $\_ \hat W$ to be placed in CPU registers and
+reduce the memory bandwidth to two \textbf{mp\_digit} reads per iteration.
 
-The $O(n^2)$ loop on step four is where the Comba method's advantages begin to show through in comparison to the baseline algorithm.  The lack of
-a carry variable or propagation in this loop allows the loop to be performed with only single precision multiplication and additions.  Now that each
-iteration of the inner loop can be performed independent of the others the inner loop can be performed with a high level of parallelism.
+The $ty$ variable is set to the minimum count of $ix$ or the number of digits in $b$.  That way if $a$ has more digits than
+$b$ this will be limited to $b.used - 1$.  The $tx$ variable is set to the to the distance past $b.used$ the variable
+$ix$ is.  This is used for the immediately subsequent statement where we find $iy$.  
+
+The variable $iy$ is the minimum digits we can read from either $a$ or $b$ before running out.  Computing one column at a time
+means we have to scan one integer upwards and the other downwards.  $a$ starts at $tx$ and $b$ starts at $ty$.  In each
+pass we are producing the $ix$'th output column and we note that $tx + ty = ix$.  As we move $tx$ upwards we have to 
+move $ty$ downards so the equality remains valid.  The $iy$ variable is the number of iterations until 
+$tx \ge a.used$ or $ty < 0$ occurs.
+
+After every inner pass we store the lower half of the accumulator into $W_{ix}$ and then propagate the carry of the accumulator
+into the next round by dividing $\_ \hat W$ by $\beta$.
 
 To measure the benefits of the Comba method over the baseline method consider the number of operations that are required.  If the 
 cost in terms of time of a multiply and addition is $p$ and the cost of a carry propagation is $q$ then a baseline multiplication would require 
@@ -2643,20 +2658,20 @@ and addition operations in the nested loop in parallel.
 
 EXAM,bn_fast_s_mp_mul_digs.c
 
-The memset on line @47,memset@ clears the initial $\hat W$ array to zero in a single step. Like the slower baseline multiplication
-implementation a series of aliases (\textit{lines @67, tmpx@, @70, tmpy@ and @75,_W@}) are used to simplify the inner $O(n^2)$ loop.  
-In this case a new alias $\_\hat W$ has been added which refers to the double precision columns offset by $ix$ in each pass.  
+As per the pseudo--code we first calculate $pa$ (line @47,MIN@) as the number of digits to output.  Next we begin the outer loop
+to produce the individual columns of the product.  We use the two aliases $tmpx$ and $tmpy$ (lines @61,tmpx@, @62,tmpy@) to point
+inside the two multiplicands quickly.  
 
-The inner loop on lines @83,for@, @84,mp_word@ and @85,}@ is where the algorithm will spend the majority of the time, which is why it has been 
-stripped to the bones of any extra baggage\footnote{Hence the pointer aliases.}.  On x86 processors the multiplication and additions amount to at the 
-very least five instructions (\textit{two loads, two additions, one multiply}) while on the ARMv4 processors they amount to only three 
-(\textit{one load, one store, one multiply-add}).   For both of the x86 and ARMv4 processors the GCC compiler performs a good job at unrolling the loop 
-and scheduling the instructions so there are very few dependency stalls.
+The inner loop (lines @70,for@ to @72,}@) of this implementation is where the tradeoff come into play.  Originally this comba 
+implementation was ``row--major'' which means it adds to each of the columns in each pass.  After the outer loop it would then fix 
+the carries.  This was very fast except it had an annoying drawback.  You had to read a mp\_word and two mp\_digits and write 
+one mp\_word per iteration.  On processors such as the Athlon XP and P4 this did not matter much since the cache bandwidth 
+is very high and it can keep the ALU fed with data.  It did, however, matter on older and embedded cpus where cache is often 
+slower and also often doesn't exist.  This new algorithm only performs two reads per iteration under the assumption that the 
+compiler has aliased $\_ \hat W$ to a CPU register.
 
-In theory the difference between the baseline and comba algorithms is a mere $O(qn)$ time difference.  However, in the $O(n^2)$ nested loop of the
-baseline method there are dependency stalls as the algorithm must wait for the multiplier to finish before propagating the carry to the next 
-digit.  As a result fewer of the often multiple execution units\footnote{The AMD Athlon has three execution units and the Intel P4 has four.} can
-be simultaneously used.  
+After the inner loop we store the current accumulator in $W$ and shift $\_ \hat W$ (lines @75,W[ix]@, @78,>>@) to forward it as 
+a carry for the next pass.  After the outer loop we use the final carry (line @82,W[ix]@) as the last digit of the product.  
 
 \subsection{Polynomial Basis Multiplication}
 To break the $O(n^2)$ barrier in multiplication requires a completely different look at integer multiplication.  In the following algorithms
@@ -2976,13 +2991,26 @@ result $a \cdot b$ is produced.
 
 EXAM,bn_mp_toom_mul.c
 
--- Comments to be added during editing phase.
+The first obvious thing to note is that this algorithm is complicated.  The complexity is worth it if you are multiplying very 
+large numbers.  For example, a 10,000 digit multiplication takes approximaly 99,282,205 fewer single precision multiplications with
+Toom--Cook than a Comba or baseline approach (this is a savings of more than 99$\%$).  For most ``crypto'' sized numbers this
+algorithm is not practical as Karatsuba has a much lower cutoff point.
+
+First we split $a$ and $b$ into three roughly equal portions.  This has been accomplished (lines @40,mod@ to @69,rshd@) with 
+combinations of mp\_rshd() and mp\_mod\_2d() function calls.  At this point $a = a2 \cdot \beta^2 + a1 \cdot \beta + a0$ and similiarly
+for $b$.  
+
+Next we compute the five points $w0, w1, w2, w3$ and $w4$.  Recall that $w0$ and $w4$ can be computed directly from the portions so
+we get those out of the way first (lines @72,mul@ and @77,mul@).  Next we compute $w1, w2$ and $w3$ using Horners method.
+
+After this point we solve for the actual values of $w1, w2$ and $w3$ by reducing the $5 \times 5$ system which is relatively
+straight forward.  
 
 \subsection{Signed Multiplication}
 Now that algorithms to handle multiplications of every useful dimensions have been developed, a rather simple finishing touch is required.  So far all
 of the multiplication algorithms have been unsigned multiplications which leaves only a signed multiplication algorithm to be established.  
 
-\newpage\begin{figure}[!here]
+\begin{figure}[!here]
 \begin{small}
 \begin{center}
 \begin{tabular}{l}
@@ -3065,7 +3093,7 @@ Column two of row one is a square and column three is the first unique column.
 The baseline squaring algorithm is meant to be a catch-all squaring algorithm.  It will handle any of the input sizes that the faster routines
 will not handle.  
 
-\newpage\begin{figure}[!here]
+\begin{figure}[!here]
 \begin{small}
 \begin{center}
 \begin{tabular}{l}
@@ -3121,9 +3149,14 @@ results calculated so far.  This involves expensive carry propagation which will
 
 EXAM,bn_s_mp_sqr.c
 
-Inside the outer loop (\textit{see line @32,for@}) the square term is calculated on line @35,r =@.  Line @42,>>@ extracts the carry from the square
-term.  Aliases for $a_{ix}$ and $t_{ix+iy}$ are initialized on lines @45,tmpx@ and @48,tmpt@ respectively.  The doubling is performed using two
-additions (\textit{see line @57,r + r@}) since it is usually faster than shifting,if not at least as fast.  
+Inside the outer loop (line @32,for@) the square term is calculated on line @35,r =@.  The carry (line @42,>>@) has been
+extracted from the mp\_word accumulator using a right shift.  Aliases for $a_{ix}$ and $t_{ix+iy}$ are initialized 
+(lines @45,tmpx@ and @48,tmpt@) to simplify the inner loop.  The doubling is performed using two
+additions (line @57,r + r@) since it is usually faster than shifting, if not at least as fast.  
+
+The important observation is that the inner loop does not begin at $iy = 0$ like for multiplication.  As such the inner loops
+get progressively shorter as the algorithm proceeds.  This is what leads to the savings compared to using a multiplication to
+square a number. 
 
 \subsection{Faster Squaring by the ``Comba'' Method}
 A major drawback to the baseline method is the requirement for single precision shifting inside the $O(n^2)$ nested loop.  Squaring has an additional
@@ -3135,9 +3168,9 @@ propagation operations from the inner loop.  However, the inner product must sti
 that $2a + 2b + 2c = 2(a + b + c)$.  That is the sum of all of the double products is equal to double the sum of all the products.  For example,
 $ab + ba + ac + ca = 2ab + 2ac = 2(ab + ac)$.  
 
-However, we cannot simply double all of the columns, since the squares appear only once per row.  The most practical solution is to have two mp\_word
-arrays.  One array will hold the squares and the other array will hold the double products.  With both arrays the doubling and carry propagation can be 
-moved to a $O(n)$ work level outside the $O(n^2)$ level.  
+However, we cannot simply double all of the columns, since the squares appear only once per row.  The most practical solution is to have two 
+mp\_word arrays.  One array will hold the squares and the other array will hold the double products.  With both arrays the doubling and 
+carry propagation can be moved to a $O(n)$ work level outside the $O(n^2)$ level.  In this case, we have an even simpler solution in mind.
 
 \newpage\begin{figure}[!here]
 \begin{small}
@@ -3147,34 +3180,34 @@ moved to a $O(n)$ work level outside the $O(n^2)$ level.
 \textbf{Input}.   mp\_int $a$ \\
 \textbf{Output}.  $b \leftarrow a^2$ \\
 \hline \\
-Place two arrays of \textbf{MP\_WARRAY} mp\_words named $\hat W$ and $\hat {X}$ on the stack. \\
+Place an array of \textbf{MP\_WARRAY} mp\_digits named $W$ on the stack. \\
 1.  If $b.alloc < 2a.used + 1$ then grow $b$ to $2a.used + 1$ digits.  (\textit{mp\_grow}). \\
 2.  If step 1 failed return(\textit{MP\_MEM}). \\
-3.  for $ix$ from $0$ to $2a.used + 1$ do \\
-\hspace{3mm}3.1  $\hat W_{ix} \leftarrow 0$ \\
-\hspace{3mm}3.2  $\hat {X}_{ix} \leftarrow 0$ \\
-4.  for $ix$ from $0$ to $a.used - 1$ do \\
-\hspace{3mm}Compute the square.\\
-\hspace{3mm}4.1  $\hat {X}_{ix+ix} \leftarrow \left ( a_{ix} \right )^2$ \\
 \\
-\hspace{3mm}Compute the double products.\\
-\hspace{3mm}4.2  for $iy$ from $ix + 1$ to $a.used - 1$ do \\
-\hspace{6mm}4.2.1  $\hat W_{ix+iy} \leftarrow \hat W_{ix+iy} + a_{ix}a_{iy}$ \\
-5.  $oldused \leftarrow b.used$ \\
-6.  $b.used \leftarrow 2a.used + 1$ \\
+3.  $pa \leftarrow 2 \cdot a.used$ \\
+4.  $\hat W1 \leftarrow 0$ \\
+5.  for $ix$ from $0$ to $pa - 1$ do \\
+\hspace{3mm}5.1  $\_ \hat W \leftarrow 0$ \\
+\hspace{3mm}5.2  $ty \leftarrow \mbox{MIN}(a.used - 1, ix)$ \\
+\hspace{3mm}5.3  $tx \leftarrow ix - ty$ \\
+\hspace{3mm}5.4  $iy \leftarrow \mbox{MIN}(a.used - tx, ty + 1)$ \\
+\hspace{3mm}5.5  $iy \leftarrow \mbox{MIN}(iy, \lfloor \left (ty - tx + 1 \right )/2 \rfloor)$ \\
+\hspace{3mm}5.6  for $iz$ from $0$ to $iz - 1$ do \\
+\hspace{6mm}5.6.1  $\_ \hat W \leftarrow \_ \hat W + a_{tx + iz}a_{ty - iz}$ \\
+\hspace{3mm}5.7  $\_ \hat W \leftarrow 2 \cdot \_ \hat W  + \hat W1$ \\
+\hspace{3mm}5.8  if $ix$ is even then \\
+\hspace{6mm}5.8.1  $\_ \hat W \leftarrow \_ \hat W + \left ( a_{\lfloor ix/2 \rfloor}\right )^2$ \\
+\hspace{3mm}5.9  $W_{ix} \leftarrow \_ \hat W (\mbox{mod }\beta)$ \\
+\hspace{3mm}5.10  $\hat W1 \leftarrow \lfloor \_ \hat W / \beta \rfloor$ \\
 \\
-Double the products and propagate the carries simultaneously. \\
-7.  $\hat W_0 \leftarrow 2 \hat W_0 + \hat {X}_0$ \\
-8.  for $ix$ from $1$ to $2a.used$ do \\
-\hspace{3mm}8.1 $\hat W_{ix} \leftarrow 2 \hat W_{ix} + \hat {X}_{ix}$ \\
-\hspace{3mm}8.2 $\hat W_{ix} \leftarrow \hat W_{ix} + \lfloor \hat W_{ix - 1} / \beta \rfloor$ \\
-\hspace{3mm}8.3 $b_{ix-1} \leftarrow W_{ix-1} \mbox{ (mod }\beta\mbox{)}$ \\
-9.  $b_{2a.used} \leftarrow \hat W_{2a.used} \mbox{ (mod }\beta\mbox{)}$ \\
-10.  if $2a.used + 1 < oldused$ then do \\
-\hspace{3mm}10.1  for $ix$ from $2a.used + 1$ to $oldused$ do \\
-\hspace{6mm}10.1.1  $b_{ix} \leftarrow 0$ \\
-11.  Clamp excess digits from $b$.  (\textit{mp\_clamp}) \\
-12.  Return(\textit{MP\_OKAY}). \\ 
+6.  $oldused \leftarrow b.used$ \\
+7.  $b.used \leftarrow 2 \cdot a.used$ \\
+8.  for $ix$ from $0$ to $pa - 1$ do \\
+\hspace{3mm}8.1  $b_{ix} \leftarrow W_{ix}$ \\
+9.  for $ix$ from $pa$ to $oldused - 1$ do \\
+\hspace{3mm}9.1  $b_{ix} \leftarrow 0$ \\
+10.  Clamp excess digits from $b$.  (\textit{mp\_clamp}) \\
+11.  Return(\textit{MP\_OKAY}). \\ 
 \hline
 \end{tabular}
 \end{center}
@@ -3183,24 +3216,24 @@ Double the products and propagate the carries simultaneously. \\
 \end{figure}
 
 \textbf{Algorithm fast\_s\_mp\_sqr.}
-This algorithm computes the square of an input using the Comba technique.  It is designed to be a replacement for algorithm s\_mp\_sqr when
-the number of input digits is less than \textbf{MP\_WARRAY} and less than $\delta \over 2$.  
+This algorithm computes the square of an input using the Comba technique.  It is designed to be a replacement for algorithm 
+s\_mp\_sqr when the number of input digits is less than \textbf{MP\_WARRAY} and less than $\delta \over 2$.  
+This algorithm is very similar to the Comba multiplier except with a few key differences we shall make note of.
 
-This routine requires two arrays of mp\_words to be placed on the stack.  The first array $\hat W$ will hold the double products and the second
-array $\hat X$ will hold the squares.  Though only at most $MP\_WARRAY \over 2$ words of $\hat X$ are used, it has proven faster on most 
-processors to simply make it a full size array.
+First, we have an accumulator and carry variables $\_ \hat W$ and $\hat W1$ respectively.  This is because the inner loop
+products are to be doubled.  If we had added the previous carry in we would be doubling too much.  Next we perform an
+addition MIN condition on $iy$ (step 5.5) to prevent overlapping digits.  For example, $a_3 \cdot a_5$ is equal
+$a_5 \cdot a_3$.  Whereas in the multiplication case we would have $5 < a.used$ and $3 \ge 0$ is maintained since we double the sum
+of the products just outside the inner loop we have to avoid doing this.  This is also a good thing since we perform
+fewer multiplications and the routine ends up being faster.
 
-The loop on step 3 will zero the two arrays to prepare them for the squaring step.  Step 4.1 computes the squares of the product.  Note how 
-it simply assigns the value into the $\hat X$ array.  The nested loop on step 4.2 computes the doubles of the products.  This loop
-computes the sum of the products for each column.  They are not doubled until later.
-
-After the squaring loop, the products stored in $\hat W$ musted be doubled and the carries propagated forwards.  It makes sense to do both
-operations at the same time.  The expression $\hat W_{ix} \leftarrow 2 \hat W_{ix} + \hat {X}_{ix}$ computes the sum of the double product and the
-squares in place.  
+Finally the last difference is the addition of the ``square'' term outside the inner loop (step 5.8).  We add in the square
+only to even outputs and it is the square of the term at the $\lfloor ix / 2 \rfloor$ position.
 
 EXAM,bn_fast_s_mp_sqr.c
 
--- Write something deep and insightful later, Tom.
+This implementation is essentially a copy of Comba multiplication with the appropriate changes added to make it faster for 
+the special case of squaring.  
 
 \subsection{Polynomial Basis Squaring}
 The same algorithm that performs optimal polynomial basis multiplication can be used to perform polynomial basis squaring.  The minor exception
@@ -3312,14 +3345,13 @@ By inlining the copy and shift operations the cutoff point for Karatsuba multipl
 is exactly at the point where Comba squaring can no longer be used (\textit{128 digits}).  On slower processors such as the Intel P4
 it is actually below the Comba limit (\textit{at 110 digits}).
 
-This routine uses the same error trap coding style as mp\_karatsuba\_sqr.  As the temporary variables are initialized errors are redirected to
-the error trap higher up.  If the algorithm completes without error the error code is set to \textbf{MP\_OKAY} and mp\_clears are executed normally.
-
-\textit{Last paragraph sucks.  re-write! -- Tom}
+This routine uses the same error trap coding style as mp\_karatsuba\_sqr.  As the temporary variables are initialized errors are 
+redirected to the error trap higher up.  If the algorithm completes without error the error code is set to \textbf{MP\_OKAY} and 
+mp\_clears are executed normally.
 
 \subsection{Toom-Cook Squaring}
 The Toom-Cook squaring algorithm mp\_toom\_sqr is heavily based on the algorithm mp\_toom\_mul with the exception that squarings are used
-instead of multiplication to find the five relations..  The reader is encouraged to read the description of the latter algorithm and try to 
+instead of multiplication to find the five relations.  The reader is encouraged to read the description of the latter algorithm and try to 
 derive their own Toom-Cook squaring algorithm.  
 
 \subsection{High Level Squaring}
@@ -3362,12 +3394,9 @@ EXAM,bn_mp_sqr.c
 $\left [ 3 \right ] $ & Devise an efficient algorithm for selection of the radix point to handle inputs \\
                       & that have different number of digits in Karatsuba multiplication. \\
                       & \\
-$\left [ 3 \right ] $ & In ~SQUARE~ the fact that every column of a squaring is made up \\
+$\left [ 2 \right ] $ & In ~SQUARE~ the fact that every column of a squaring is made up \\
                       & of double products and at most one square is stated.  Prove this statement. \\
                       & \\                      
-$\left [ 2 \right ] $ & In the Comba squaring algorithm half of the $\hat X$ variables are not used. \\
-                      & Revise algorithm fast\_s\_mp\_sqr to shrink the $\hat X$ array. \\
-                      & \\
 $\left [ 3 \right ] $ & Prove the equation for Karatsuba squaring. \\
                       & \\
 $\left [ 1 \right ] $ & Prove that Karatsuba squaring requires $O \left (n^{lg(3)} \right )$ time. \\
@@ -3375,6 +3404,14 @@ $\left [ 1 \right ] $ & Prove that Karatsuba squaring requires $O \left (n^{lg(3
 $\left [ 2 \right ] $ & Determine the minimal ratio between addition and multiplication clock cycles \\
                       & required for equation $6.7$ to be true.  \\
                       & \\
+$\left [ 3 \right ] $ & Implement a threaded version of Comba multiplication (and squaring) where you \\
+                      & compute subsets of the columns in each thread.  Determine a cutoff point where \\
+                      & it is effective and add the logic to mp\_mul() and mp\_sqr(). \\
+                      &\\
+$\left [ 4 \right ] $ & Same as the previous but also modify the Karatsuba and Toom-Cook.  You must \\
+                      & increase the throughput of mp\_exptmod() for random odd moduli in the range \\
+                      & $512 \ldots 4096$ bits significantly ($> 2x$) to complete this challenge. \\
+                      & \\
 \end{tabular}
 
 \chapter{Modular Reduction}
@@ -3394,7 +3431,7 @@ other forms of residues.
 Modular reductions are normally used to create either finite groups, rings or fields.  The most common usage for performance driven modular reductions 
 is in modular exponentiation algorithms.  That is to compute $d = a^b \mbox{ (mod }c\mbox{)}$ as fast as possible.  This operation is used in the 
 RSA and Diffie-Hellman public key algorithms, for example.  Modular multiplication and squaring also appears as a fundamental operation in 
-Elliptic Curve cryptographic algorithms.  As will be discussed in the subsequent chapter there exist fast algorithms for computing modular 
+elliptic curve cryptographic algorithms.  As will be discussed in the subsequent chapter there exist fast algorithms for computing modular 
 exponentiations without having to perform (\textit{in this example}) $b - 1$ multiplications.  These algorithms will produce partial results in the 
 range $0 \le x < c^2$ which can be taken advantage of to create several efficient algorithms.   They have also been used to create redundancy check 
 algorithms known as CRCs, error correction codes such as Reed-Solomon and solve a variety of number theoeretic problems.  
@@ -3610,7 +3647,7 @@ safe to do so.
 In order to use algorithm mp\_reduce the value of $\mu$ must be calculated in advance.  Ideally this value should be computed once and stored for
 future use so that the Barrett algorithm can be used without delay.  
 
-\begin{figure}[!here]
+\newpage\begin{figure}[!here]
 \begin{small}
 \begin{center}
 \begin{tabular}{l}
@@ -5818,6 +5855,8 @@ To explain the Jacobi Symbol we shall first discuss the Legendre function\footno
 defined.  The Legendre function computes whether or not an integer $a$ is a quadratic residue modulo an odd prime $p$.  Numerically it is
 equivalent to equation \ref{eqn:legendre}.
 
+\textit{-- Tom, don't be an ass, cite your source here...!}
+
 \begin{equation}
 a^{(p-1)/2} \equiv \begin{array}{rl}
                               -1 &  \mbox{if }a\mbox{ is a quadratic non-residue.} \\
diff --git a/tommath.tex b/tommath.tex
index 66b33ab..b016010 100644
--- a/tommath.tex
+++ b/tommath.tex
@@ -49,7 +49,7 @@
 \begin{document}
 \frontmatter
 \pagestyle{empty}
-\title{Implementing Multiple Precision Arithmetic \\ ~ \\ Draft Edition }
+\title{Multi--Precision Math}
 \author{\mbox{
 %\begin{small}
 \begin{tabular}{c}
@@ -66,7 +66,7 @@ QUALCOMM Australia \\
 }
 }
 \maketitle
-This text has been placed in the public domain.  This text corresponds to the v0.34 release of the 
+This text has been placed in the public domain.  This text corresponds to the v0.35 release of the 
 LibTomMath project.
 
 \begin{alltt}
@@ -85,66 +85,32 @@ This text is formatted to the international B5 paper size of 176mm wide by 250mm
 
 \tableofcontents
 \listoffigures
-\chapter*{Prefaces to the Draft Edition}
-I started this text in April 2003 to complement my LibTomMath library.  That is, explain how to implement the functions
-contained in LibTomMath.  The goal is to have a textbook that any Computer Science student can use when implementing their
-own multiple precision arithmetic.  The plan I wanted to follow was flesh out all the
-ideas and concepts I had floating around in my head and then work on it afterwards refining a little bit at a time.  Chance
-would have it that I ended up with my summer off from Algonquin College and I was given four months solid to work on the
-text.  
+\chapter*{Prefaces}
+When I tell people about my LibTom projects and that I release them as public domain they are often puzzled.  
+They ask why I did it and especially why I continue to work on them for free.  The best I can explain it is ``Because I can.''  
+Which seems odd and perhaps too terse for adult conversation. I often qualify it with ``I am able, I am willing.'' which 
+perhaps explains it better.  I am the first to admit there is not anything that special with what I have done.  Perhaps
+others can see that too and then we would have a society to be proud of.  My LibTom projects are what I am doing to give 
+back to society in the form of tools and knowledge that can help others in their endeavours.
 
-Choosing to not waste any time I dove right into the project even before my spring semester was finished.  I wrote a bit
-off and on at first.  The moment my exams were finished I jumped into long 12 to 16 hour days.  The result after only
-a couple of months was a ten chapter, three hundred page draft that I quickly had distributed to anyone who wanted
-to read it.  I had Jean-Luc Cooke print copies for me and I brought them to Crypto'03 in Santa Barbara.  So far I have
-managed to grab a certain level of attention having people from around the world ask me for copies of the text was certain
-rewarding.
+I started writing this book because it was the most logical task to further my goal of open academia.  The LibTomMath source
+code itself was written to be easy to follow and learn from.  There are times, however, where pure C source code does not
+explain the algorithms properly.  Hence this book.  The book literally starts with the foundation of the library and works
+itself outwards to the more complicated algorithms.  The use of both pseudo--code and verbatim source code provides a duality
+of ``theory'' and ``practice'' that the computer science students of the world shall appreciate.  I never deviate too far
+from relatively straightforward algebra and I hope that this book can be a valuable learning asset.
 
-Now we are past December 2003.  By this time I had pictured that I would have at least finished my second draft of the text.  
-Currently I am far off from this goal.  I've done partial re-writes of chapters one, two and three but they are not even
-finished yet.  I haven't given up on the project, only had some setbacks.  First O'Reilly declined to publish the text then
-Addison-Wesley and Greg is tried another which I don't know the name of.  However, at this point I want to focus my energy
-onto finishing the book not securing a contract.
+This book and indeed much of the LibTom projects would not exist in their current form if it was not for a plethora
+of kind people donating their time, resources and kind words to help support my work.  Writing a text of significant
+length (along with the source code) is a tiresome and lengthy process.  Currently the LibTom project is four years old,
+comprises of literally thousands of users and over 100,000 lines of source code, TeX and other material.  People like Mads and Greg 
+were there at the beginning to encourage me to work well.  It is amazing how timely validation from others can boost morale to 
+continue the project. Definitely my parents were there for me by providing room and board during the many months of work in 2003.  
 
-So why am I writing this text?  It seems like a lot of work right?  Most certainly it is a lot of work writing a textbook.  
-Even the simplest introductory material has to be lined with references and figures.  A lot of the text has to be re-written
-from point form to prose form to ensure an easier read.  Why am I doing all this work for free then?  Simple. My philosophy
-is quite simply ``Open Source.  Open Academia.  Open Minds'' which means that to achieve a goal of open minds, that is,
-people willing to accept new ideas and explore the unknown you have to make available material they can access freely 
-without hinderance.  
+To my many friends whom I have met through the years I thank you for the good times and the words of encouragement.  I hope I
+honour your kind gestures with this project.
 
-I've been writing free software since I was about sixteen but only recently have I hit upon software that people have come
-to depend upon.  I started LibTomCrypt in December 2001 and now several major companies use it as integral portions of their
-software.  Several educational institutions use it as a matter of course and many freelance developers use it as
-part of their projects.  To further my contributions I started the LibTomMath project in December 2002 aimed at providing
-multiple precision arithmetic routines that students could learn from.  That is write routines that are not only easy
-to understand and follow but provide quite impressive performance considering they are all in standard portable ISO C.  
-
-The second leg of my philosophy is ``Open Academia'' which is where this textbook comes in.  In the end, when all is
-said and done the text will be useable by educational institutions as a reference on multiple precision arithmetic.  
-
-At this time I feel I should share a little information about myself.  The most common question I was asked at 
-Crypto'03, perhaps just out of professional courtesy, was which school I either taught at or attended.  The unfortunate
-truth is that I neither teach at or attend a school of academic reputation.  I'm currently at Algonquin College which 
-is what I'd like to call ``somewhat academic but mostly vocational'' college.  In otherwords, job training.
-
-I'm a 21 year old computer science student mostly self-taught in the areas I am aware of (which includes a half-dozen
-computer science fields, a few fields of mathematics and some English).  I look forward to teaching someday but I am
-still far off from that goal.  
-
-Now it would be improper for me to not introduce the rest of the texts co-authors.  While they are only contributing 
-corrections and editorial feedback their support has been tremendously helpful in presenting the concepts laid out
-in the text so far.  Greg has always been there for me.  He has tracked my LibTom projects since their inception and even
-sent cheques to help pay tuition from time to time.  His background has provided a wonderful source to bounce ideas off
-of and improve the quality of my writing.  Mads is another fellow who has just ``been there''.  I don't even recall what
-his interest in the LibTom projects is but I'm definitely glad he has been around.  His ability to catch logical errors
-in my written English have saved me on several occasions to say the least.
-
-What to expect next?  Well this is still a rough draft.  I've only had the chance to update a few chapters.  However, I've
-been getting the feeling that people are starting to use my text and I owe them some updated material.  My current tenative
-plan is to edit one chapter every two weeks starting January 4th.  It seems insane but my lower course load at college
-should provide ample time.  By Crypto'04 I plan to have a 2nd draft of the text polished and ready to hand out to as many
-people who will take it.
+Open Source.  Open Academia.  Open Minds.
 
 \begin{flushright} Tom St Denis \end{flushright}
 
@@ -1045,7 +1011,7 @@ assumed to contain undefined values they are initially set to zero.
 \end{alltt}
 \end{small}
 
-A quick optimization is to first determine if a memory re-allocation is required at all.  The if statement (line 23) checks
+A quick optimization is to first determine if a memory re-allocation is required at all.  The if statement (line 24) checks
 if the \textbf{alloc} member of the mp\_int is smaller than the requested digit count.  If the count is not larger than \textbf{alloc}
 the function skips the re-allocation part thus saving time.
 
@@ -1590,14 +1556,20 @@ This algorithm simply resets a mp\_int to the default state.
 \begin{alltt}
 016   
 017   /* set to zero */
-018   void
-019   mp_zero (mp_int * a)
-020   \{
-021     a->sign = MP_ZPOS;
-022     a->used = 0;
-023     memset (a->dp, 0, sizeof (mp_digit) * a->alloc);
-024   \}
-025   #endif
+018   void mp_zero (mp_int * a)
+019   \{
+020     int       n;
+021     mp_digit *tmp;
+022   
+023     a->sign = MP_ZPOS;
+024     a->used = 0;
+025   
+026     tmp = a->dp;
+027     for (n = 0; n < a->alloc; n++) \{
+028        *tmp++ = 0;
+029     \}
+030   \}
+031   #endif
 \end{alltt}
 \end{small}
 
@@ -1609,7 +1581,7 @@ After the function is completed, all of the digits are zeroed, the \textbf{used}
 With the mp\_int representation of an integer, calculating the absolute value is trivial.  The mp\_abs algorithm will compute
 the absolute value of an mp\_int.
 
-\newpage\begin{figure}[here]
+\begin{figure}[here]
 \begin{center}
 \begin{tabular}{l}
 \hline Algorithm \textbf{mp\_abs}. \\
@@ -1662,6 +1634,9 @@ logic to handle it.
 \end{alltt}
 \end{small}
 
+This fairly trivial algorithm first eliminates non--required duplications (line 27) and then sets the
+\textbf{sign} flag to \textbf{MP\_ZPOS}.
+
 \subsection{Integer Negation}
 With the mp\_int representation of an integer, calculating the negation is also trivial.  The mp\_neg algorithm will compute
 the negative of an mp\_int input.
@@ -1702,23 +1677,33 @@ zero as negative.
 018   int mp_neg (mp_int * a, mp_int * b)
 019   \{
 020     int     res;
-021     if ((res = mp_copy (a, b)) != MP_OKAY) \{
-022       return res;
-023     \}
-024     if (mp_iszero(b) != MP_YES) \{
-025        b->sign = (a->sign == MP_ZPOS) ? MP_NEG : MP_ZPOS;
-026     \}
-027     return MP_OKAY;
-028   \}
-029   #endif
+021     if (a != b) \{
+022        if ((res = mp_copy (a, b)) != MP_OKAY) \{
+023           return res;
+024        \}
+025     \}
+026   
+027     if (mp_iszero(b) != MP_YES) \{
+028        b->sign = (a->sign == MP_ZPOS) ? MP_NEG : MP_ZPOS;
+029     \} else \{
+030        b->sign = MP_ZPOS;
+031     \}
+032   
+033     return MP_OKAY;
+034   \}
+035   #endif
 \end{alltt}
 \end{small}
 
+Like mp\_abs() this function avoids non--required duplications (line 21) and then sets the sign.  We
+have to make sure that only non--zero values get a \textbf{sign} of \textbf{MP\_NEG}.  If the mp\_int is zero
+than the \textbf{sign} is hard--coded to \textbf{MP\_ZPOS}.
+
 \section{Small Constants}
 \subsection{Setting Small Constants}
 Often a mp\_int must be set to a relatively small value such as $1$ or $2$.  For these cases the mp\_set algorithm is useful.
 
-\begin{figure}[here]
+\newpage\begin{figure}[here]
 \begin{center}
 \begin{tabular}{l}
 \hline Algorithm \textbf{mp\_set}. \\
@@ -1757,11 +1742,14 @@ single digit is set (\textit{modulo $\beta$}) and the \textbf{used} count is adj
 \end{alltt}
 \end{small}
 
-Line 20 calls mp\_zero() to clear the mp\_int and reset the sign.  Line 21 copies the digit 
-into the least significant location.  Note the usage of a new constant \textbf{MP\_MASK}.  This constant is used to quickly
-reduce an integer modulo $\beta$.  Since $\beta$ is of the form $2^k$ for any suitable $k$ it suffices to perform a binary AND with 
-$MP\_MASK = 2^k - 1$ to perform the reduction.  Finally line 22 will set the \textbf{used} member with respect to the 
-digit actually set. This function will always make the integer positive.
+First we zero (line 20) the mp\_int to make sure that the other members are initialized for a 
+small positive constant.  mp\_zero() ensures that the \textbf{sign} is positive and the \textbf{used} count
+is zero.  Next we set the digit and reduce it modulo $\beta$ (line 21).  After this step we have to 
+check if the resulting digit is zero or not.  If it is not then we set the \textbf{used} count to one, otherwise
+to zero.
+
+We can quickly reduce modulo $\beta$ since it is of the form $2^k$ and a quick binary AND operation with 
+$2^k - 1$ will perform the same operation.
 
 One important limitation of this function is that it will only set one digit.  The size of a digit is not fixed, meaning source that uses 
 this function should take that into account.  Only trivially small constants can be set using this function.
@@ -1936,10 +1924,12 @@ the zero'th digit.  If after all of the digits have been compared, no difference
 \end{alltt}
 \end{small}
 
-The two if statements on lines 24 and 28 compare the number of digits in the two inputs.  These two are performed before all of the digits
-are compared since it is a very cheap test to perform and can potentially save considerable time.  The implementation given is also not valid 
-without those two statements.  $b.alloc$ may be smaller than $a.used$, meaning that undefined values will be read from $b$ past the end of the 
-array of digits.
+The two if statements (lines 24 and 28) compare the number of digits in the two inputs.  These two are 
+performed before all of the digits are compared since it is a very cheap test to perform and can potentially save 
+considerable time.  The implementation given is also not valid without those two statements.  $b.alloc$ may be 
+smaller than $a.used$, meaning that undefined values will be read from $b$ past the end of the array of digits.
+
+
 
 \subsection{Signed Comparisons}
 Comparing with sign considerations is also fairly critical in several routines (\textit{division for example}).  Based on an unsigned magnitude 
@@ -2000,9 +1990,9 @@ $\vert a \vert < \vert b \vert$.  Step number four will compare the two when the
 \end{alltt}
 \end{small}
 
-The two if statements on lines 22 and 23 perform the initial sign comparison.  If the signs are not the equal then which ever
-has the positive sign is larger.   At line 31, the inputs are compared based on magnitudes.  If the signs were both negative then 
-the unsigned comparison is performed in the opposite direction (\textit{line 33}).  Otherwise, the signs are assumed to 
+The two if statements (lines 22 and 23) perform the initial sign comparison.  If the signs are not the equal then which ever
+has the positive sign is larger.   The inputs are compared (line 31) based on magnitudes.  If the signs were both 
+negative then the unsigned comparison is performed in the opposite direction (line 33).  Otherwise, the signs are assumed to 
 be both positive and a forward direction unsigned comparison is performed.
 
 \section*{Exercises}
@@ -2218,19 +2208,21 @@ The final carry is stored in $c_{max}$ and digits above $max$ upto $oldused$ are
 \end{alltt}
 \end{small}
 
-Lines 27 to 35 perform the initial sorting of the inputs and determine the $min$ and $max$ variables.  Note that $x$ is a pointer to a 
-mp\_int assigned to the largest input, in effect it is a local alias.  Lines 37 to 42 ensure that the destination is grown to 
-accomodate the result of the addition. 
+We first sort (lines 27 to 35) the inputs based on magnitude and determine the $min$ and $max$ variables.
+Note that $x$ is a pointer to an mp\_int assigned to the largest input, in effect it is a local alias.  Next we
+grow the destination (37 to 42) ensure that it can accomodate the result of the addition. 
 
 Similar to the implementation of mp\_copy this function uses the braced code and local aliases coding style.  The three aliases that are on 
 lines 55, 58 and 61 represent the two inputs and destination variables respectively.  These aliases are used to ensure the
 compiler does not have to dereference $a$, $b$ or $c$ (respectively) to access the digits of the respective mp\_int.
 
-The initial carry $u$ is cleared on line 64, note that $u$ is of type mp\_digit which ensures type compatibility within the 
-implementation.  The initial addition loop begins on line 65 and ends on line 74.  Similarly the conditional addition loop
-begins on line 80 and ends on line 90.  The addition is finished with the final carry being stored in $tmpc$ on line 93.  
-Note the ``++'' operator on the same line.  After line 93 $tmpc$ will point to the $c.used$'th digit of the mp\_int $c$.  This is useful
-for the next loop on lines 96 to 99 which set any old upper digits to zero.
+The initial carry $u$ will be cleared (line 64), note that $u$ is of type mp\_digit which ensures type 
+compatibility within the implementation.  The initial addition (line 65 to 74) adds digits from
+both inputs until the smallest input runs out of digits.  Similarly the conditional addition loop
+(line 80 to 90) adds the remaining digits from the larger of the two inputs.  The addition is finished 
+with the final carry being stored in $tmpc$ (line 93).  Note the ``++'' operator within the same expression.
+After line 93, $tmpc$ will point to the $c.used$'th digit of the mp\_int $c$.  This is useful
+for the next loop (line 96 to 99) which set any old upper digits to zero.
 
 \subsection{Low Level Subtraction}
 The low level unsigned subtraction algorithm is very similar to the low level unsigned addition algorithm.  The principle difference is that the
@@ -2245,7 +2237,7 @@ this algorithm we will assume that the variable $\gamma$ represents the number o
 mp\_digit (\textit{this implies $2^{\gamma} > \beta$}).  
 
 For example, the default for LibTomMath is to use a ``unsigned long'' for the mp\_digit ``type'' while $\beta = 2^{28}$.  In ISO C an ``unsigned long''
-data type must be able to represent $0 \le x < 2^{32}$ meaning that in this case $\gamma = 32$.
+data type must be able to represent $0 \le x < 2^{32}$ meaning that in this case $\gamma \ge 32$.
 
 \newpage\begin{figure}[!here]
 \begin{center}
@@ -2387,20 +2379,23 @@ If $b$ has a smaller magnitude than $a$ then step 9 will force the carry and cop
 \end{alltt}
 \end{small}
 
-Line 24 and 25 perform the initial hardcoded sorting of the inputs.  In reality the $min$ and $max$ variables are only aliases and are only 
-used to make the source code easier to read.  Again the pointer alias optimization is used within this algorithm.  Lines 41, 42 and 43 initialize the aliases for 
-$a$, $b$ and $c$ respectively.
+Like low level addition we ``sort'' the inputs.  Except in this case the sorting is hardcoded 
+(lines 24 and 25).  In reality the $min$ and $max$ variables are only aliases and are only 
+used to make the source code easier to read.  Again the pointer alias optimization is used 
+within this algorithm.  The aliases $tmpa$, $tmpb$ and $tmpc$ are initialized
+(lines 41, 42 and 43) for $a$, $b$ and $c$ respectively.
 
-The first subtraction loop occurs on lines 46 through 60.  The theory behind the subtraction loop is exactly the same as that for
-the addition loop.  As remarked earlier there is an implementation reason for using the ``awkward'' method of extracting the carry 
-(\textit{see line 56}).  The traditional method for extracting the carry would be to shift by $lg(\beta)$ positions and logically AND 
-the least significant bit.  The AND operation is required because all of the bits above the $\lg(\beta)$'th bit will be set to one after a carry
-occurs from subtraction.  This carry extraction requires two relatively cheap operations to extract the carry.  The other method is to simply 
-shift the most significant bit to the least significant bit thus extracting the carry with a single cheap operation.  This optimization only works on
-twos compliment machines which is a safe assumption to make.
+The first subtraction loop (lines 46 through 60) subtract digits from both inputs until the smaller of
+the two inputs has been exhausted.  As remarked earlier there is an implementation reason for using the ``awkward'' 
+method of extracting the carry (line 56).  The traditional method for extracting the carry would be to shift 
+by $lg(\beta)$ positions and logically AND the least significant bit.  The AND operation is required because all of 
+the bits above the $\lg(\beta)$'th bit will be set to one after a carry occurs from subtraction.  This carry 
+extraction requires two relatively cheap operations to extract the carry.  The other method is to simply shift the 
+most significant bit to the least significant bit thus extracting the carry with a single cheap operation.  This 
+optimization only works on twos compliment machines which is a safe assumption to make.
 
-If $a$ has a larger magnitude than $b$ an additional loop (\textit{see lines 63 through 72}) is required to propagate the carry through
-$a$ and copy the result to $c$.  
+If $a$ has a larger magnitude than $b$ an additional loop (lines 63 through 72) is required to propagate 
+the carry through $a$ and copy the result to $c$.  
 
 \subsection{High Level Addition}
 Now that both lower level addition and subtraction algorithms have been established an effective high level signed addition algorithm can be
@@ -2985,10 +2980,11 @@ step 8 sets the lower $b$ digits to zero.
 \end{alltt}
 \end{small}
 
-The if statement on line 23 ensures that the $b$ variable is greater than zero.  The \textbf{used} count is incremented by $b$ before
-the copy loop begins.  This elminates the need for an additional variable in the for loop.  The variable $top$ on line 41 is an alias
-for the leading digit while $bottom$ on line 44 is an alias for the trailing edge.  The aliases form a window of exactly $b$ digits
-over the input.  
+The if statement (line 23) ensures that the $b$ variable is greater than zero since we do not interpret negative
+shift counts properly.  The \textbf{used} count is incremented by $b$ before the copy loop begins.  This elminates 
+the need for an additional variable in the for loop.  The variable $top$ (line 41) is an alias
+for the leading digit while $bottom$ (line 44) is an alias for the trailing edge.  The aliases form a 
+window of exactly $b$ digits over the input.  
 
 \subsection{Division by $x$}
 
@@ -3095,9 +3091,9 @@ Once the window copy is complete the upper digits must be zeroed and the \textbf
 \end{alltt}
 \end{small}
 
-The only noteworthy element of this routine is the lack of a return type.  
-
--- Will update later to give it a return type...Tom
+The only noteworthy element of this routine is the lack of a return type since it cannot fail.  Like mp\_lshd() we
+form a sliding window except we copy in the other direction.  After the window (line 59) we then zero
+the upper digits of the input to make sure the result is correct.
 
 \section{Powers of Two}
 
@@ -3228,7 +3224,15 @@ complete.  It is possible to optimize this algorithm down to a $O(n)$ algorithm
 \end{alltt}
 \end{small}
 
-Notes to be revised when code is updated. -- Tom
+The shifting is performed in--place which means the first step (line 24) is to copy the input to the 
+destination.  We avoid calling mp\_copy() by making sure the mp\_ints are different.  The destination then
+has to be grown (line 31) to accomodate the result.
+
+If the shift count $b$ is larger than $lg(\beta)$ then a call to mp\_lshd() is used to handle all of the multiples 
+of $lg(\beta)$.  Leaving only a remaining shift of $lg(\beta) - 1$ or fewer bits left.  Inside the actual shift 
+loop (lines 45 to 76) we make use of pre--computed values $shift$ and $mask$.   These are used to
+extract the carry bit(s) to pass into the next iteration of the loop.  The $r$ and $rr$ variables form a 
+chain between consecutive iterations to propagate the carry.  
 
 \subsection{Division by Power of Two}
 
@@ -3361,7 +3365,8 @@ ignored by passing \textbf{NULL} as the pointer to the mp\_int variable.    The
 result of the remainder operation until the end.  This allows $d$ and $a$ to represent the same mp\_int without modifying $a$ before
 the quotient is obtained.
 
-The remainder of the source code is essentially the same as the source code for mp\_mul\_2d.  (-- Fix this paragraph up later, Tom).
+The remainder of the source code is essentially the same as the source code for mp\_mul\_2d.  The only significant difference is
+the direction of the shifts.
 
 \subsection{Remainder of Division by Power of Two}
 
@@ -3446,7 +3451,13 @@ is copied to $b$, leading digits are removed and the remaining leading digit is
 \end{alltt}
 \end{small}
 
--- Add comments later, Tom.
+We first avoid cases of $b \le 0$ by simply mp\_zero()'ing the destination in such cases.  Next if $2^b$ is larger
+than the input we just mp\_copy() the input and return right away.  After this point we know we must actually
+perform some work to produce the remainder.
+
+Recalling that reducing modulo $2^k$ and a binary ``and'' with $2^k - 1$ are numerically equivalent we can quickly reduce 
+the number.  First we zero any digits above the last digit in $2^b$ (line 41).  Next we reduce the 
+leading digit of both (line 45) and then mp\_clamp().
 
 \section*{Exercises}
 \begin{tabular}{cl}
@@ -3611,101 +3622,113 @@ exceed the precision requested.
 018    * HAC pp. 595, Algorithm 14.12  Modified so you can control how 
 019    * many digits of output are created.
 020    */
-021   int
-022   s_mp_mul_digs (mp_int * a, mp_int * b, mp_int * c, int digs)
-023   \{
-024     mp_int  t;
-025     int     res, pa, pb, ix, iy;
-026     mp_digit u;
-027     mp_word r;
-028     mp_digit tmpx, *tmpt, *tmpy;
-029   
-030     /* can we use the fast multiplier? */
-031     if (((digs) < MP_WARRAY) &&
-032         MIN (a->used, b->used) < 
-033             (1 << ((CHAR_BIT * sizeof (mp_word)) - (2 * DIGIT_BIT)))) \{
-034       return fast_s_mp_mul_digs (a, b, c, digs);
-035     \}
-036   
-037     if ((res = mp_init_size (&t, digs)) != MP_OKAY) \{
-038       return res;
-039     \}
-040     t.used = digs;
-041   
-042     /* compute the digits of the product directly */
-043     pa = a->used;
-044     for (ix = 0; ix < pa; ix++) \{
-045       /* set the carry to zero */
-046       u = 0;
-047   
-048       /* limit ourselves to making digs digits of output */
-049       pb = MIN (b->used, digs - ix);
-050   
-051       /* setup some aliases */
-052       /* copy of the digit from a used within the nested loop */
-053       tmpx = a->dp[ix];
-054       
-055       /* an alias for the destination shifted ix places */
-056       tmpt = t.dp + ix;
-057       
-058       /* an alias for the digits of b */
-059       tmpy = b->dp;
-060   
-061       /* compute the columns of the output and propagate the carry */
-062       for (iy = 0; iy < pb; iy++) \{
-063         /* compute the column as a mp_word */
-064         r       = ((mp_word)*tmpt) +
-065                   ((mp_word)tmpx) * ((mp_word)*tmpy++) +
-066                   ((mp_word) u);
-067   
-068         /* the new column is the lower part of the result */
-069         *tmpt++ = (mp_digit) (r & ((mp_word) MP_MASK));
-070   
-071         /* get the carry word from the result */
-072         u       = (mp_digit) (r >> ((mp_word) DIGIT_BIT));
-073       \}
-074       /* set carry if it is placed below digs */
-075       if (ix + iy < digs) \{
-076         *tmpt = u;
-077       \}
-078     \}
-079   
-080     mp_clamp (&t);
-081     mp_exch (&t, c);
-082   
-083     mp_clear (&t);
-084     return MP_OKAY;
-085   \}
-086   #endif
+021   int s_mp_mul_digs (mp_int * a, mp_int * b, mp_int * c, int digs)
+022   \{
+023     mp_int  t;
+024     int     res, pa, pb, ix, iy;
+025     mp_digit u;
+026     mp_word r;
+027     mp_digit tmpx, *tmpt, *tmpy;
+028   
+029     /* can we use the fast multiplier? */
+030     if (((digs) < MP_WARRAY) &&
+031         MIN (a->used, b->used) < 
+032             (1 << ((CHAR_BIT * sizeof (mp_word)) - (2 * DIGIT_BIT)))) \{
+033       return fast_s_mp_mul_digs (a, b, c, digs);
+034     \}
+035   
+036     if ((res = mp_init_size (&t, digs)) != MP_OKAY) \{
+037       return res;
+038     \}
+039     t.used = digs;
+040   
+041     /* compute the digits of the product directly */
+042     pa = a->used;
+043     for (ix = 0; ix < pa; ix++) \{
+044       /* set the carry to zero */
+045       u = 0;
+046   
+047       /* limit ourselves to making digs digits of output */
+048       pb = MIN (b->used, digs - ix);
+049   
+050       /* setup some aliases */
+051       /* copy of the digit from a used within the nested loop */
+052       tmpx = a->dp[ix];
+053       
+054       /* an alias for the destination shifted ix places */
+055       tmpt = t.dp + ix;
+056       
+057       /* an alias for the digits of b */
+058       tmpy = b->dp;
+059   
+060       /* compute the columns of the output and propagate the carry */
+061       for (iy = 0; iy < pb; iy++) \{
+062         /* compute the column as a mp_word */
+063         r       = ((mp_word)*tmpt) +
+064                   ((mp_word)tmpx) * ((mp_word)*tmpy++) +
+065                   ((mp_word) u);
+066   
+067         /* the new column is the lower part of the result */
+068         *tmpt++ = (mp_digit) (r & ((mp_word) MP_MASK));
+069   
+070         /* get the carry word from the result */
+071         u       = (mp_digit) (r >> ((mp_word) DIGIT_BIT));
+072       \}
+073       /* set carry if it is placed below digs */
+074       if (ix + iy < digs) \{
+075         *tmpt = u;
+076       \}
+077     \}
+078   
+079     mp_clamp (&t);
+080     mp_exch (&t, c);
+081   
+082     mp_clear (&t);
+083     return MP_OKAY;
+084   \}
+085   #endif
 \end{alltt}
 \end{small}
 
-Lines 31 to 35 determine if the Comba method can be used first.  The conditions for using the Comba routine are that min$(a.used, b.used) < \delta$ and
-the number of digits of output is less than \textbf{MP\_WARRAY}.  This new constant is used to control 
-the stack usage in the Comba routines.  By default it is set to $\delta$ but can be reduced when memory is at a premium.
+First we determine (line 30) if the Comba method can be used first since it's faster.  The conditions for 
+sing the Comba routine are that min$(a.used, b.used) < \delta$ and the number of digits of output is less than 
+\textbf{MP\_WARRAY}.  This new constant is used to control the stack usage in the Comba routines.  By default it is 
+set to $\delta$ but can be reduced when memory is at a premium.
 
-Of particular importance is the calculation of the $ix+iy$'th column on lines 64, 65 and 66.  Note how all of the
-variables are cast to the type \textbf{mp\_word}, which is also the type of variable $\hat r$.  That is to ensure that double precision operations 
-are used instead of single precision.  The multiplication on line 65 makes use of a specific GCC optimizer behaviour.  On the outset it looks like 
-the compiler will have to use a double precision multiplication to produce the result required.  Such an operation would be horribly slow on most 
-processors and drag this to a crawl.  However, GCC is smart enough to realize that double wide output single precision multipliers can be used.  For 
-example, the instruction ``MUL'' on the x86 processor can multiply two 32-bit values and produce a 64-bit result.  
+If we cannot use the Comba method we proceed to setup the baseline routine.  We allocate the the destination mp\_int
+$t$ (line 36) to the exact size of the output to avoid further re--allocations.  At this point we now 
+begin the $O(n^2)$ loop.
+
+This implementation of multiplication has the caveat that it can be trimmed to only produce a variable number of
+digits as output.  In each iteration of the outer loop the $pb$ variable is set (line 48) to the maximum 
+number of inner loop iterations.  
+
+Inside the inner loop we calculate $\hat r$ as the mp\_word product of the two mp\_digits and the addition of the
+carry from the previous iteration.  A particularly important observation is that most modern optimizing 
+C compilers (GCC for instance) can recognize that a $N \times N \rightarrow 2N$ multiplication is all that 
+is required for the product.  In x86 terms for example, this means using the MUL instruction.
+
+Each digit of the product is stored in turn (line 68) and the carry propagated (line 71) to the 
+next iteration.
 
 \subsection{Faster Multiplication by the ``Comba'' Method}
 
-One of the huge drawbacks of the ``baseline'' algorithms is that at the $O(n^2)$ level the carry must be computed and propagated upwards.  This
-makes the nested loop very sequential and hard to unroll and implement in parallel.  The ``Comba'' \cite{COMBA} method is named after little known 
-(\textit{in cryptographic venues}) Paul G. Comba who described a method of implementing fast multipliers that do not require nested 
-carry fixup operations.  As an interesting aside it seems that Paul Barrett describes a similar technique in
-his 1986 paper \cite{BARRETT} written five years before.
+One of the huge drawbacks of the ``baseline'' algorithms is that at the $O(n^2)$ level the carry must be 
+computed and propagated upwards.  This makes the nested loop very sequential and hard to unroll and implement 
+in parallel.  The ``Comba'' \cite{COMBA} method is named after little known (\textit{in cryptographic venues}) Paul G. 
+Comba who described a method of implementing fast multipliers that do not require nested carry fixup operations.  As an 
+interesting aside it seems that Paul Barrett describes a similar technique in his 1986 paper \cite{BARRETT} written 
+five years before.
 
-At the heart of the Comba technique is once again the long-hand algorithm.  Except in this case a slight twist is placed on how
-the columns of the result are produced.  In the standard long-hand algorithm rows of products are produced then added together to form the 
-final result.  In the baseline algorithm the columns are added together after each iteration to get the result instantaneously.  
+At the heart of the Comba technique is once again the long-hand algorithm.  Except in this case a slight 
+twist is placed on how the columns of the result are produced.  In the standard long-hand algorithm rows of products 
+are produced then added together to form the final result.  In the baseline algorithm the columns are added together 
+after each iteration to get the result instantaneously.  
 
-In the Comba algorithm the columns of the result are produced entirely independently of each other.  That is at the $O(n^2)$ level a 
-simple multiplication and addition step is performed.  The carries of the columns are propagated after the nested loop to reduce the amount
-of work requiored. Succintly the first step of the algorithm is to compute the product vector $\vec x$ as follows. 
+In the Comba algorithm the columns of the result are produced entirely independently of each other.  That is at 
+the $O(n^2)$ level a simple multiplication and addition step is performed.  The carries of the columns are propagated 
+after the nested loop to reduce the amount of work requiored. Succintly the first step of the algorithm is to compute 
+the product vector $\vec x$ as follows. 
 
 \begin{equation}
 \vec x_n = \sum_{i+j = n} a_ib_j, \forall n \in \lbrace 0, 1, 2, \ldots, i + j \rbrace
@@ -3799,38 +3822,32 @@ $256$ digits would allow for numbers in the range of $0 \le x < 2^{7168}$ which,
 \textbf{Input}.   mp\_int $a$, mp\_int $b$ and an integer $digs$ \\
 \textbf{Output}.  $c \leftarrow \vert a \vert \cdot \vert b \vert \mbox{ (mod }\beta^{digs}\mbox{)}$. \\
 \hline \\
-Place an array of \textbf{MP\_WARRAY} double precision digits named $\hat W$ on the stack. \\
+Place an array of \textbf{MP\_WARRAY} single precision digits named $W$ on the stack. \\
 1.  If $c.alloc < digs$ then grow $c$ to $digs$ digits. (\textit{mp\_grow}) \\
 2.  If step 1 failed return(\textit{MP\_MEM}).\\
 \\
-Zero the temporary array $\hat W$. \\
-3.  for $n$ from $0$ to $digs - 1$ do \\
-\hspace{3mm}3.1  $\hat W_n \leftarrow 0$ \\
+3.  $pa \leftarrow \mbox{MIN}(digs, a.used + b.used)$ \\
 \\
-Compute the columns. \\
-4.  for $ix$ from $0$ to $a.used - 1$ do \\
-\hspace{3mm}4.1  $pb \leftarrow \mbox{min}(b.used, digs - ix)$ \\
-\hspace{3mm}4.2  If $pb < 1$ then goto step 5. \\
-\hspace{3mm}4.3  for $iy$ from $0$ to $pb - 1$ do \\
-\hspace{6mm}4.3.1  $\hat W_{ix+iy} \leftarrow \hat W_{ix+iy} + a_{ix}b_{iy}$ \\
+4.  $\_ \hat W \leftarrow 0$ \\
+5.  for $ix$ from 0 to $pa - 1$ do \\
+\hspace{3mm}5.1  $ty \leftarrow \mbox{MIN}(b.used - 1, ix)$ \\
+\hspace{3mm}5.2  $tx \leftarrow ix - ty$ \\
+\hspace{3mm}5.3  $iy \leftarrow \mbox{MIN}(a.used - tx, ty + 1)$ \\
+\hspace{3mm}5.4  for $iz$ from 0 to $iy - 1$ do \\
+\hspace{6mm}5.4.1  $\_ \hat W \leftarrow \_ \hat W + a_{tx+iy}b_{ty-iy}$ \\
+\hspace{3mm}5.5  $W_{ix} \leftarrow \_ \hat W (\mbox{mod }\beta)$\\
+\hspace{3mm}5.6  $\_ \hat W \leftarrow \lfloor \_ \hat W / \beta \rfloor$ \\
+6.  $W_{pa} \leftarrow \_ \hat W (\mbox{mod }\beta)$ \\
 \\
-Propagate the carries upwards. \\
-5.  $oldused \leftarrow c.used$ \\
-6.  $c.used \leftarrow digs$ \\
-7.  If $digs > 1$ then do \\
-\hspace{3mm}7.1.  for $ix$ from $1$ to $digs - 1$ do \\
-\hspace{6mm}7.1.1  $\hat W_{ix} \leftarrow \hat W_{ix} + \lfloor \hat W_{ix-1} / \beta \rfloor$ \\
-\hspace{6mm}7.1.2  $c_{ix - 1} \leftarrow \hat W_{ix - 1} \mbox{ (mod }\beta\mbox{)}$ \\
-8.  else do \\
-\hspace{3mm}8.1  $ix \leftarrow 0$ \\
-9.  $c_{ix} \leftarrow \hat W_{ix} \mbox{ (mod }\beta\mbox{)}$ \\
+7.  $oldused \leftarrow c.used$ \\
+8.  $c.used \leftarrow digs$ \\
+9.  for $ix$ from $0$ to $pa$ do \\
+\hspace{3mm}9.1  $c_{ix} \leftarrow W_{ix}$ \\
+10.  for $ix$ from $pa + 1$ to $oldused - 1$ do \\
+\hspace{3mm}10.1 $c_{ix} \leftarrow 0$ \\
 \\
-Zero excess digits. \\
-10.  If $digs < oldused$ then do \\
-\hspace{3mm}10.1  for $n$ from $digs$ to $oldused - 1$ do \\
-\hspace{6mm}10.1.1  $c_n \leftarrow 0$ \\
-11.  Clamp excessive digits of $c$.  (\textit{mp\_clamp}) \\
-12.  Return(\textit{MP\_OKAY}). \\
+11.  Clamp $c$. \\
+12.  Return MP\_OKAY. \\
 \hline
 \end{tabular}
 \end{center}
@@ -3840,15 +3857,24 @@ Zero excess digits. \\
 \end{figure}
 
 \textbf{Algorithm fast\_s\_mp\_mul\_digs.}
-This algorithm performs the unsigned multiplication of $a$ and $b$ using the Comba method limited to $digs$ digits of precision.  The algorithm
-essentially peforms the same calculation as algorithm s\_mp\_mul\_digs, just much faster.
+This algorithm performs the unsigned multiplication of $a$ and $b$ using the Comba method limited to $digs$ digits of precision.
 
-The array $\hat W$ is meant to be on the stack when the algorithm is used.  The size of the array does not change which is ideal.  Note also that 
-unlike algorithm s\_mp\_mul\_digs no temporary mp\_int is required since the result is calculated directly in $\hat W$.  
+The outer loop of this algorithm is more complicated than that of the baseline multiplier.  This is because on the inside of the 
+loop we want to produce one column per pass.  This allows the accumulator $\_ \hat W$ to be placed in CPU registers and
+reduce the memory bandwidth to two \textbf{mp\_digit} reads per iteration.
 
-The $O(n^2)$ loop on step four is where the Comba method's advantages begin to show through in comparison to the baseline algorithm.  The lack of
-a carry variable or propagation in this loop allows the loop to be performed with only single precision multiplication and additions.  Now that each
-iteration of the inner loop can be performed independent of the others the inner loop can be performed with a high level of parallelism.
+The $ty$ variable is set to the minimum count of $ix$ or the number of digits in $b$.  That way if $a$ has more digits than
+$b$ this will be limited to $b.used - 1$.  The $tx$ variable is set to the to the distance past $b.used$ the variable
+$ix$ is.  This is used for the immediately subsequent statement where we find $iy$.  
+
+The variable $iy$ is the minimum digits we can read from either $a$ or $b$ before running out.  Computing one column at a time
+means we have to scan one integer upwards and the other downwards.  $a$ starts at $tx$ and $b$ starts at $ty$.  In each
+pass we are producing the $ix$'th output column and we note that $tx + ty = ix$.  As we move $tx$ upwards we have to 
+move $ty$ downards so the equality remains valid.  The $iy$ variable is the number of iterations until 
+$tx \ge a.used$ or $ty < 0$ occurs.
+
+After every inner pass we store the lower half of the accumulator into $W_{ix}$ and then propagate the carry of the accumulator
+into the next round by dividing $\_ \hat W$ by $\beta$.
 
 To measure the benefits of the Comba method over the baseline method consider the number of operations that are required.  If the 
 cost in terms of time of a multiply and addition is $p$ and the cost of a carry propagation is $q$ then a baseline multiplication would require 
@@ -3908,8 +3934,7 @@ and addition operations in the nested loop in parallel.
 061         tmpx = a->dp + tx;
 062         tmpy = b->dp + ty;
 063   
-064         /* this is the number of times the loop will iterrate, essentially its
-       
+064         /* this is the number of times the loop will iterrate, essentially 
 065            while (tx++ < a->used && ty-- >= 0) \{ ... \}
 066          */
 067         iy = MIN(a->used-tx, ty+1);
@@ -3927,16 +3952,16 @@ and addition operations in the nested loop in parallel.
 079     \}
 080   
 081     /* store final carry */
-082     W[ix] = _W & MP_MASK;
+082     W[ix] = (mp_digit)(_W & MP_MASK);
 083   
 084     /* setup dest */
 085     olduse  = c->used;
-086     c->used = digs;
+086     c->used = pa;
 087   
 088     \{
 089       register mp_digit *tmpc;
 090       tmpc = c->dp;
-091       for (ix = 0; ix < digs; ix++) \{
+091       for (ix = 0; ix < pa+1; ix++) \{
 092         /* now extract the previous digit [below the carry] */
 093         *tmpc++ = W[ix];
 094       \}
@@ -3953,20 +3978,20 @@ and addition operations in the nested loop in parallel.
 \end{alltt}
 \end{small}
 
-The memset on line @47,memset@ clears the initial $\hat W$ array to zero in a single step. Like the slower baseline multiplication
-implementation a series of aliases (\textit{lines 61, 62 and 75}) are used to simplify the inner $O(n^2)$ loop.  
-In this case a new alias $\_\hat W$ has been added which refers to the double precision columns offset by $ix$ in each pass.  
+As per the pseudo--code we first calculate $pa$ (line 47) as the number of digits to output.  Next we begin the outer loop
+to produce the individual columns of the product.  We use the two aliases $tmpx$ and $tmpy$ (lines 61, 62) to point
+inside the two multiplicands quickly.  
 
-The inner loop on lines 91, 78 and 79 is where the algorithm will spend the majority of the time, which is why it has been 
-stripped to the bones of any extra baggage\footnote{Hence the pointer aliases.}.  On x86 processors the multiplication and additions amount to at the 
-very least five instructions (\textit{two loads, two additions, one multiply}) while on the ARMv4 processors they amount to only three 
-(\textit{one load, one store, one multiply-add}).   For both of the x86 and ARMv4 processors the GCC compiler performs a good job at unrolling the loop 
-and scheduling the instructions so there are very few dependency stalls.
+The inner loop (lines 70 to 72) of this implementation is where the tradeoff come into play.  Originally this comba 
+implementation was ``row--major'' which means it adds to each of the columns in each pass.  After the outer loop it would then fix 
+the carries.  This was very fast except it had an annoying drawback.  You had to read a mp\_word and two mp\_digits and write 
+one mp\_word per iteration.  On processors such as the Athlon XP and P4 this did not matter much since the cache bandwidth 
+is very high and it can keep the ALU fed with data.  It did, however, matter on older and embedded cpus where cache is often 
+slower and also often doesn't exist.  This new algorithm only performs two reads per iteration under the assumption that the 
+compiler has aliased $\_ \hat W$ to a CPU register.
 
-In theory the difference between the baseline and comba algorithms is a mere $O(qn)$ time difference.  However, in the $O(n^2)$ nested loop of the
-baseline method there are dependency stalls as the algorithm must wait for the multiplier to finish before propagating the carry to the next 
-digit.  As a result fewer of the often multiple execution units\footnote{The AMD Athlon has three execution units and the Intel P4 has four.} can
-be simultaneously used.  
+After the inner loop we store the current accumulator in $W$ and shift $\_ \hat W$ (lines 75, 78) to forward it as 
+a carry for the next pass.  After the outer loop we use the final carry (line 82) as the last digit of the product.  
 
 \subsection{Polynomial Basis Multiplication}
 To break the $O(n^2)$ barrier in multiplication requires a completely different look at integer multiplication.  In the following algorithms
@@ -4443,277 +4468,290 @@ result $a \cdot b$ is produced.
 016   
 017   /* multiplication using the Toom-Cook 3-way algorithm 
 018    *
-019    * Much more complicated than Karatsuba but has a lower asymptotic running t
-      ime of 
-020    * O(N**1.464).  This algorithm is only particularly useful on VERY large
-021    * inputs (we're talking 1000s of digits here...).
-022   */
-023   int mp_toom_mul(mp_int *a, mp_int *b, mp_int *c)
-024   \{
-025       mp_int w0, w1, w2, w3, w4, tmp1, tmp2, a0, a1, a2, b0, b1, b2;
-026       int res, B;
-027           
-028       /* init temps */
-029       if ((res = mp_init_multi(&w0, &w1, &w2, &w3, &w4, 
-030                                &a0, &a1, &a2, &b0, &b1, 
-031                                &b2, &tmp1, &tmp2, NULL)) != MP_OKAY) \{
-032          return res;
-033       \}
-034       
-035       /* B */
-036       B = MIN(a->used, b->used) / 3;
-037       
-038       /* a = a2 * B**2 + a1 * B + a0 */
-039       if ((res = mp_mod_2d(a, DIGIT_BIT * B, &a0)) != MP_OKAY) \{
-040          goto ERR;
-041       \}
-042   
-043       if ((res = mp_copy(a, &a1)) != MP_OKAY) \{
-044          goto ERR;
-045       \}
-046       mp_rshd(&a1, B);
-047       mp_mod_2d(&a1, DIGIT_BIT * B, &a1);
-048   
-049       if ((res = mp_copy(a, &a2)) != MP_OKAY) \{
-050          goto ERR;
-051       \}
-052       mp_rshd(&a2, B*2);
-053       
-054       /* b = b2 * B**2 + b1 * B + b0 */
-055       if ((res = mp_mod_2d(b, DIGIT_BIT * B, &b0)) != MP_OKAY) \{
-056          goto ERR;
-057       \}
-058   
-059       if ((res = mp_copy(b, &b1)) != MP_OKAY) \{
-060          goto ERR;
-061       \}
-062       mp_rshd(&b1, B);
-063       mp_mod_2d(&b1, DIGIT_BIT * B, &b1);
-064   
-065       if ((res = mp_copy(b, &b2)) != MP_OKAY) \{
-066          goto ERR;
-067       \}
-068       mp_rshd(&b2, B*2);
-069       
-070       /* w0 = a0*b0 */
-071       if ((res = mp_mul(&a0, &b0, &w0)) != MP_OKAY) \{
-072          goto ERR;
-073       \}
-074       
-075       /* w4 = a2 * b2 */
-076       if ((res = mp_mul(&a2, &b2, &w4)) != MP_OKAY) \{
-077          goto ERR;
-078       \}
-079       
-080       /* w1 = (a2 + 2(a1 + 2a0))(b2 + 2(b1 + 2b0)) */
-081       if ((res = mp_mul_2(&a0, &tmp1)) != MP_OKAY) \{
-082          goto ERR;
-083       \}
-084       if ((res = mp_add(&tmp1, &a1, &tmp1)) != MP_OKAY) \{
-085          goto ERR;
-086       \}
-087       if ((res = mp_mul_2(&tmp1, &tmp1)) != MP_OKAY) \{
-088          goto ERR;
-089       \}
-090       if ((res = mp_add(&tmp1, &a2, &tmp1)) != MP_OKAY) \{
-091          goto ERR;
-092       \}
-093       
-094       if ((res = mp_mul_2(&b0, &tmp2)) != MP_OKAY) \{
-095          goto ERR;
-096       \}
-097       if ((res = mp_add(&tmp2, &b1, &tmp2)) != MP_OKAY) \{
-098          goto ERR;
-099       \}
-100       if ((res = mp_mul_2(&tmp2, &tmp2)) != MP_OKAY) \{
-101          goto ERR;
-102       \}
-103       if ((res = mp_add(&tmp2, &b2, &tmp2)) != MP_OKAY) \{
-104          goto ERR;
-105       \}
-106       
-107       if ((res = mp_mul(&tmp1, &tmp2, &w1)) != MP_OKAY) \{
-108          goto ERR;
-109       \}
-110       
-111       /* w3 = (a0 + 2(a1 + 2a2))(b0 + 2(b1 + 2b2)) */
-112       if ((res = mp_mul_2(&a2, &tmp1)) != MP_OKAY) \{
-113          goto ERR;
-114       \}
-115       if ((res = mp_add(&tmp1, &a1, &tmp1)) != MP_OKAY) \{
-116          goto ERR;
-117       \}
-118       if ((res = mp_mul_2(&tmp1, &tmp1)) != MP_OKAY) \{
-119          goto ERR;
-120       \}
-121       if ((res = mp_add(&tmp1, &a0, &tmp1)) != MP_OKAY) \{
-122          goto ERR;
-123       \}
-124       
-125       if ((res = mp_mul_2(&b2, &tmp2)) != MP_OKAY) \{
-126          goto ERR;
-127       \}
-128       if ((res = mp_add(&tmp2, &b1, &tmp2)) != MP_OKAY) \{
-129          goto ERR;
-130       \}
-131       if ((res = mp_mul_2(&tmp2, &tmp2)) != MP_OKAY) \{
-132          goto ERR;
-133       \}
-134       if ((res = mp_add(&tmp2, &b0, &tmp2)) != MP_OKAY) \{
-135          goto ERR;
-136       \}
-137       
-138       if ((res = mp_mul(&tmp1, &tmp2, &w3)) != MP_OKAY) \{
-139          goto ERR;
-140       \}
-141       
-142   
-143       /* w2 = (a2 + a1 + a0)(b2 + b1 + b0) */
-144       if ((res = mp_add(&a2, &a1, &tmp1)) != MP_OKAY) \{
-145          goto ERR;
-146       \}
-147       if ((res = mp_add(&tmp1, &a0, &tmp1)) != MP_OKAY) \{
-148          goto ERR;
-149       \}
-150       if ((res = mp_add(&b2, &b1, &tmp2)) != MP_OKAY) \{
-151          goto ERR;
-152       \}
-153       if ((res = mp_add(&tmp2, &b0, &tmp2)) != MP_OKAY) \{
-154          goto ERR;
-155       \}
-156       if ((res = mp_mul(&tmp1, &tmp2, &w2)) != MP_OKAY) \{
-157          goto ERR;
-158       \}
-159       
-160       /* now solve the matrix 
-161       
-162          0  0  0  0  1
-163          1  2  4  8  16
-164          1  1  1  1  1
-165          16 8  4  2  1
-166          1  0  0  0  0
-167          
-168          using 12 subtractions, 4 shifts, 
-169                 2 small divisions and 1 small multiplication 
-170        */
-171        
-172        /* r1 - r4 */
-173        if ((res = mp_sub(&w1, &w4, &w1)) != MP_OKAY) \{
-174           goto ERR;
-175        \}
-176        /* r3 - r0 */
-177        if ((res = mp_sub(&w3, &w0, &w3)) != MP_OKAY) \{
-178           goto ERR;
-179        \}
-180        /* r1/2 */
-181        if ((res = mp_div_2(&w1, &w1)) != MP_OKAY) \{
-182           goto ERR;
-183        \}
-184        /* r3/2 */
-185        if ((res = mp_div_2(&w3, &w3)) != MP_OKAY) \{
-186           goto ERR;
-187        \}
-188        /* r2 - r0 - r4 */
-189        if ((res = mp_sub(&w2, &w0, &w2)) != MP_OKAY) \{
-190           goto ERR;
-191        \}
-192        if ((res = mp_sub(&w2, &w4, &w2)) != MP_OKAY) \{
-193           goto ERR;
-194        \}
-195        /* r1 - r2 */
-196        if ((res = mp_sub(&w1, &w2, &w1)) != MP_OKAY) \{
-197           goto ERR;
-198        \}
-199        /* r3 - r2 */
-200        if ((res = mp_sub(&w3, &w2, &w3)) != MP_OKAY) \{
-201           goto ERR;
-202        \}
-203        /* r1 - 8r0 */
-204        if ((res = mp_mul_2d(&w0, 3, &tmp1)) != MP_OKAY) \{
-205           goto ERR;
-206        \}
-207        if ((res = mp_sub(&w1, &tmp1, &w1)) != MP_OKAY) \{
-208           goto ERR;
-209        \}
-210        /* r3 - 8r4 */
-211        if ((res = mp_mul_2d(&w4, 3, &tmp1)) != MP_OKAY) \{
-212           goto ERR;
-213        \}
-214        if ((res = mp_sub(&w3, &tmp1, &w3)) != MP_OKAY) \{
-215           goto ERR;
-216        \}
-217        /* 3r2 - r1 - r3 */
-218        if ((res = mp_mul_d(&w2, 3, &w2)) != MP_OKAY) \{
-219           goto ERR;
-220        \}
-221        if ((res = mp_sub(&w2, &w1, &w2)) != MP_OKAY) \{
-222           goto ERR;
-223        \}
-224        if ((res = mp_sub(&w2, &w3, &w2)) != MP_OKAY) \{
-225           goto ERR;
-226        \}
-227        /* r1 - r2 */
-228        if ((res = mp_sub(&w1, &w2, &w1)) != MP_OKAY) \{
-229           goto ERR;
-230        \}
-231        /* r3 - r2 */
-232        if ((res = mp_sub(&w3, &w2, &w3)) != MP_OKAY) \{
-233           goto ERR;
-234        \}
-235        /* r1/3 */
-236        if ((res = mp_div_3(&w1, &w1, NULL)) != MP_OKAY) \{
-237           goto ERR;
-238        \}
-239        /* r3/3 */
-240        if ((res = mp_div_3(&w3, &w3, NULL)) != MP_OKAY) \{
-241           goto ERR;
-242        \}
-243        
-244        /* at this point shift W[n] by B*n */
-245        if ((res = mp_lshd(&w1, 1*B)) != MP_OKAY) \{
-246           goto ERR;
-247        \}
-248        if ((res = mp_lshd(&w2, 2*B)) != MP_OKAY) \{
-249           goto ERR;
-250        \}
-251        if ((res = mp_lshd(&w3, 3*B)) != MP_OKAY) \{
-252           goto ERR;
-253        \}
-254        if ((res = mp_lshd(&w4, 4*B)) != MP_OKAY) \{
-255           goto ERR;
-256        \}     
-257        
-258        if ((res = mp_add(&w0, &w1, c)) != MP_OKAY) \{
-259           goto ERR;
-260        \}
-261        if ((res = mp_add(&w2, &w3, &tmp1)) != MP_OKAY) \{
-262           goto ERR;
-263        \}
-264        if ((res = mp_add(&w4, &tmp1, &tmp1)) != MP_OKAY) \{
-265           goto ERR;
-266        \}
-267        if ((res = mp_add(&tmp1, c, c)) != MP_OKAY) \{
-268           goto ERR;
-269        \}     
-270        
-271   ERR:
-272        mp_clear_multi(&w0, &w1, &w2, &w3, &w4, 
-273                       &a0, &a1, &a2, &b0, &b1, 
-274                       &b2, &tmp1, &tmp2, NULL);
-275        return res;
-276   \}     
-277        
-278   #endif
+019    * Much more complicated than Karatsuba but has a lower 
+020    * asymptotic running time of O(N**1.464).  This algorithm is 
+021    * only particularly useful on VERY large inputs 
+022    * (we're talking 1000s of digits here...).
+023   */
+024   int mp_toom_mul(mp_int *a, mp_int *b, mp_int *c)
+025   \{
+026       mp_int w0, w1, w2, w3, w4, tmp1, tmp2, a0, a1, a2, b0, b1, b2;
+027       int res, B;
+028           
+029       /* init temps */
+030       if ((res = mp_init_multi(&w0, &w1, &w2, &w3, &w4, 
+031                                &a0, &a1, &a2, &b0, &b1, 
+032                                &b2, &tmp1, &tmp2, NULL)) != MP_OKAY) \{
+033          return res;
+034       \}
+035       
+036       /* B */
+037       B = MIN(a->used, b->used) / 3;
+038       
+039       /* a = a2 * B**2 + a1 * B + a0 */
+040       if ((res = mp_mod_2d(a, DIGIT_BIT * B, &a0)) != MP_OKAY) \{
+041          goto ERR;
+042       \}
+043   
+044       if ((res = mp_copy(a, &a1)) != MP_OKAY) \{
+045          goto ERR;
+046       \}
+047       mp_rshd(&a1, B);
+048       mp_mod_2d(&a1, DIGIT_BIT * B, &a1);
+049   
+050       if ((res = mp_copy(a, &a2)) != MP_OKAY) \{
+051          goto ERR;
+052       \}
+053       mp_rshd(&a2, B*2);
+054       
+055       /* b = b2 * B**2 + b1 * B + b0 */
+056       if ((res = mp_mod_2d(b, DIGIT_BIT * B, &b0)) != MP_OKAY) \{
+057          goto ERR;
+058       \}
+059   
+060       if ((res = mp_copy(b, &b1)) != MP_OKAY) \{
+061          goto ERR;
+062       \}
+063       mp_rshd(&b1, B);
+064       mp_mod_2d(&b1, DIGIT_BIT * B, &b1);
+065   
+066       if ((res = mp_copy(b, &b2)) != MP_OKAY) \{
+067          goto ERR;
+068       \}
+069       mp_rshd(&b2, B*2);
+070       
+071       /* w0 = a0*b0 */
+072       if ((res = mp_mul(&a0, &b0, &w0)) != MP_OKAY) \{
+073          goto ERR;
+074       \}
+075       
+076       /* w4 = a2 * b2 */
+077       if ((res = mp_mul(&a2, &b2, &w4)) != MP_OKAY) \{
+078          goto ERR;
+079       \}
+080       
+081       /* w1 = (a2 + 2(a1 + 2a0))(b2 + 2(b1 + 2b0)) */
+082       if ((res = mp_mul_2(&a0, &tmp1)) != MP_OKAY) \{
+083          goto ERR;
+084       \}
+085       if ((res = mp_add(&tmp1, &a1, &tmp1)) != MP_OKAY) \{
+086          goto ERR;
+087       \}
+088       if ((res = mp_mul_2(&tmp1, &tmp1)) != MP_OKAY) \{
+089          goto ERR;
+090       \}
+091       if ((res = mp_add(&tmp1, &a2, &tmp1)) != MP_OKAY) \{
+092          goto ERR;
+093       \}
+094       
+095       if ((res = mp_mul_2(&b0, &tmp2)) != MP_OKAY) \{
+096          goto ERR;
+097       \}
+098       if ((res = mp_add(&tmp2, &b1, &tmp2)) != MP_OKAY) \{
+099          goto ERR;
+100       \}
+101       if ((res = mp_mul_2(&tmp2, &tmp2)) != MP_OKAY) \{
+102          goto ERR;
+103       \}
+104       if ((res = mp_add(&tmp2, &b2, &tmp2)) != MP_OKAY) \{
+105          goto ERR;
+106       \}
+107       
+108       if ((res = mp_mul(&tmp1, &tmp2, &w1)) != MP_OKAY) \{
+109          goto ERR;
+110       \}
+111       
+112       /* w3 = (a0 + 2(a1 + 2a2))(b0 + 2(b1 + 2b2)) */
+113       if ((res = mp_mul_2(&a2, &tmp1)) != MP_OKAY) \{
+114          goto ERR;
+115       \}
+116       if ((res = mp_add(&tmp1, &a1, &tmp1)) != MP_OKAY) \{
+117          goto ERR;
+118       \}
+119       if ((res = mp_mul_2(&tmp1, &tmp1)) != MP_OKAY) \{
+120          goto ERR;
+121       \}
+122       if ((res = mp_add(&tmp1, &a0, &tmp1)) != MP_OKAY) \{
+123          goto ERR;
+124       \}
+125       
+126       if ((res = mp_mul_2(&b2, &tmp2)) != MP_OKAY) \{
+127          goto ERR;
+128       \}
+129       if ((res = mp_add(&tmp2, &b1, &tmp2)) != MP_OKAY) \{
+130          goto ERR;
+131       \}
+132       if ((res = mp_mul_2(&tmp2, &tmp2)) != MP_OKAY) \{
+133          goto ERR;
+134       \}
+135       if ((res = mp_add(&tmp2, &b0, &tmp2)) != MP_OKAY) \{
+136          goto ERR;
+137       \}
+138       
+139       if ((res = mp_mul(&tmp1, &tmp2, &w3)) != MP_OKAY) \{
+140          goto ERR;
+141       \}
+142       
+143   
+144       /* w2 = (a2 + a1 + a0)(b2 + b1 + b0) */
+145       if ((res = mp_add(&a2, &a1, &tmp1)) != MP_OKAY) \{
+146          goto ERR;
+147       \}
+148       if ((res = mp_add(&tmp1, &a0, &tmp1)) != MP_OKAY) \{
+149          goto ERR;
+150       \}
+151       if ((res = mp_add(&b2, &b1, &tmp2)) != MP_OKAY) \{
+152          goto ERR;
+153       \}
+154       if ((res = mp_add(&tmp2, &b0, &tmp2)) != MP_OKAY) \{
+155          goto ERR;
+156       \}
+157       if ((res = mp_mul(&tmp1, &tmp2, &w2)) != MP_OKAY) \{
+158          goto ERR;
+159       \}
+160       
+161       /* now solve the matrix 
+162       
+163          0  0  0  0  1
+164          1  2  4  8  16
+165          1  1  1  1  1
+166          16 8  4  2  1
+167          1  0  0  0  0
+168          
+169          using 12 subtractions, 4 shifts, 
+170                 2 small divisions and 1 small multiplication 
+171        */
+172        
+173        /* r1 - r4 */
+174        if ((res = mp_sub(&w1, &w4, &w1)) != MP_OKAY) \{
+175           goto ERR;
+176        \}
+177        /* r3 - r0 */
+178        if ((res = mp_sub(&w3, &w0, &w3)) != MP_OKAY) \{
+179           goto ERR;
+180        \}
+181        /* r1/2 */
+182        if ((res = mp_div_2(&w1, &w1)) != MP_OKAY) \{
+183           goto ERR;
+184        \}
+185        /* r3/2 */
+186        if ((res = mp_div_2(&w3, &w3)) != MP_OKAY) \{
+187           goto ERR;
+188        \}
+189        /* r2 - r0 - r4 */
+190        if ((res = mp_sub(&w2, &w0, &w2)) != MP_OKAY) \{
+191           goto ERR;
+192        \}
+193        if ((res = mp_sub(&w2, &w4, &w2)) != MP_OKAY) \{
+194           goto ERR;
+195        \}
+196        /* r1 - r2 */
+197        if ((res = mp_sub(&w1, &w2, &w1)) != MP_OKAY) \{
+198           goto ERR;
+199        \}
+200        /* r3 - r2 */
+201        if ((res = mp_sub(&w3, &w2, &w3)) != MP_OKAY) \{
+202           goto ERR;
+203        \}
+204        /* r1 - 8r0 */
+205        if ((res = mp_mul_2d(&w0, 3, &tmp1)) != MP_OKAY) \{
+206           goto ERR;
+207        \}
+208        if ((res = mp_sub(&w1, &tmp1, &w1)) != MP_OKAY) \{
+209           goto ERR;
+210        \}
+211        /* r3 - 8r4 */
+212        if ((res = mp_mul_2d(&w4, 3, &tmp1)) != MP_OKAY) \{
+213           goto ERR;
+214        \}
+215        if ((res = mp_sub(&w3, &tmp1, &w3)) != MP_OKAY) \{
+216           goto ERR;
+217        \}
+218        /* 3r2 - r1 - r3 */
+219        if ((res = mp_mul_d(&w2, 3, &w2)) != MP_OKAY) \{
+220           goto ERR;
+221        \}
+222        if ((res = mp_sub(&w2, &w1, &w2)) != MP_OKAY) \{
+223           goto ERR;
+224        \}
+225        if ((res = mp_sub(&w2, &w3, &w2)) != MP_OKAY) \{
+226           goto ERR;
+227        \}
+228        /* r1 - r2 */
+229        if ((res = mp_sub(&w1, &w2, &w1)) != MP_OKAY) \{
+230           goto ERR;
+231        \}
+232        /* r3 - r2 */
+233        if ((res = mp_sub(&w3, &w2, &w3)) != MP_OKAY) \{
+234           goto ERR;
+235        \}
+236        /* r1/3 */
+237        if ((res = mp_div_3(&w1, &w1, NULL)) != MP_OKAY) \{
+238           goto ERR;
+239        \}
+240        /* r3/3 */
+241        if ((res = mp_div_3(&w3, &w3, NULL)) != MP_OKAY) \{
+242           goto ERR;
+243        \}
+244        
+245        /* at this point shift W[n] by B*n */
+246        if ((res = mp_lshd(&w1, 1*B)) != MP_OKAY) \{
+247           goto ERR;
+248        \}
+249        if ((res = mp_lshd(&w2, 2*B)) != MP_OKAY) \{
+250           goto ERR;
+251        \}
+252        if ((res = mp_lshd(&w3, 3*B)) != MP_OKAY) \{
+253           goto ERR;
+254        \}
+255        if ((res = mp_lshd(&w4, 4*B)) != MP_OKAY) \{
+256           goto ERR;
+257        \}     
+258        
+259        if ((res = mp_add(&w0, &w1, c)) != MP_OKAY) \{
+260           goto ERR;
+261        \}
+262        if ((res = mp_add(&w2, &w3, &tmp1)) != MP_OKAY) \{
+263           goto ERR;
+264        \}
+265        if ((res = mp_add(&w4, &tmp1, &tmp1)) != MP_OKAY) \{
+266           goto ERR;
+267        \}
+268        if ((res = mp_add(&tmp1, c, c)) != MP_OKAY) \{
+269           goto ERR;
+270        \}     
+271        
+272   ERR:
+273        mp_clear_multi(&w0, &w1, &w2, &w3, &w4, 
+274                       &a0, &a1, &a2, &b0, &b1, 
+275                       &b2, &tmp1, &tmp2, NULL);
+276        return res;
+277   \}     
+278        
+279   #endif
 \end{alltt}
 \end{small}
 
--- Comments to be added during editing phase.
+The first obvious thing to note is that this algorithm is complicated.  The complexity is worth it if you are multiplying very 
+large numbers.  For example, a 10,000 digit multiplication takes approximaly 99,282,205 fewer single precision multiplications with
+Toom--Cook than a Comba or baseline approach (this is a savings of more than 99$\%$).  For most ``crypto'' sized numbers this
+algorithm is not practical as Karatsuba has a much lower cutoff point.
+
+First we split $a$ and $b$ into three roughly equal portions.  This has been accomplished (lines 40 to 69) with 
+combinations of mp\_rshd() and mp\_mod\_2d() function calls.  At this point $a = a2 \cdot \beta^2 + a1 \cdot \beta + a0$ and similiarly
+for $b$.  
+
+Next we compute the five points $w0, w1, w2, w3$ and $w4$.  Recall that $w0$ and $w4$ can be computed directly from the portions so
+we get those out of the way first (lines 72 and 77).  Next we compute $w1, w2$ and $w3$ using Horners method.
+
+After this point we solve for the actual values of $w1, w2$ and $w3$ by reducing the $5 \times 5$ system which is relatively
+straight forward.  
 
 \subsection{Signed Multiplication}
 Now that algorithms to handle multiplications of every useful dimensions have been developed, a rather simple finishing touch is required.  So far all
 of the multiplication algorithms have been unsigned multiplications which leaves only a signed multiplication algorithm to be established.  
 
-\newpage\begin{figure}[!here]
+\begin{figure}[!here]
 \begin{small}
 \begin{center}
 \begin{tabular}{l}
@@ -4846,7 +4884,7 @@ Column two of row one is a square and column three is the first unique column.
 The baseline squaring algorithm is meant to be a catch-all squaring algorithm.  It will handle any of the input sizes that the faster routines
 will not handle.  
 
-\newpage\begin{figure}[!here]
+\begin{figure}[!here]
 \begin{small}
 \begin{center}
 \begin{tabular}{l}
@@ -4906,75 +4944,79 @@ results calculated so far.  This involves expensive carry propagation which will
 \begin{alltt}
 016   
 017   /* low level squaring, b = a*a, HAC pp.596-597, Algorithm 14.16 */
-018   int
-019   s_mp_sqr (mp_int * a, mp_int * b)
-020   \{
-021     mp_int  t;
-022     int     res, ix, iy, pa;
-023     mp_word r;
-024     mp_digit u, tmpx, *tmpt;
-025   
-026     pa = a->used;
-027     if ((res = mp_init_size (&t, 2*pa + 1)) != MP_OKAY) \{
-028       return res;
-029     \}
-030   
-031     /* default used is maximum possible size */
-032     t.used = 2*pa + 1;
-033   
-034     for (ix = 0; ix < pa; ix++) \{
-035       /* first calculate the digit at 2*ix */
-036       /* calculate double precision result */
-037       r = ((mp_word) t.dp[2*ix]) +
-038           ((mp_word)a->dp[ix])*((mp_word)a->dp[ix]);
-039   
-040       /* store lower part in result */
-041       t.dp[ix+ix] = (mp_digit) (r & ((mp_word) MP_MASK));
-042   
-043       /* get the carry */
-044       u           = (mp_digit)(r >> ((mp_word) DIGIT_BIT));
-045   
-046       /* left hand side of A[ix] * A[iy] */
-047       tmpx        = a->dp[ix];
-048   
-049       /* alias for where to store the results */
-050       tmpt        = t.dp + (2*ix + 1);
-051       
-052       for (iy = ix + 1; iy < pa; iy++) \{
-053         /* first calculate the product */
-054         r       = ((mp_word)tmpx) * ((mp_word)a->dp[iy]);
-055   
-056         /* now calculate the double precision result, note we use
-057          * addition instead of *2 since it's easier to optimize
-058          */
-059         r       = ((mp_word) *tmpt) + r + r + ((mp_word) u);
-060   
-061         /* store lower part */
-062         *tmpt++ = (mp_digit) (r & ((mp_word) MP_MASK));
-063   
-064         /* get carry */
-065         u       = (mp_digit)(r >> ((mp_word) DIGIT_BIT));
-066       \}
-067       /* propagate upwards */
-068       while (u != ((mp_digit) 0)) \{
-069         r       = ((mp_word) *tmpt) + ((mp_word) u);
-070         *tmpt++ = (mp_digit) (r & ((mp_word) MP_MASK));
-071         u       = (mp_digit)(r >> ((mp_word) DIGIT_BIT));
-072       \}
-073     \}
-074   
-075     mp_clamp (&t);
-076     mp_exch (&t, b);
-077     mp_clear (&t);
-078     return MP_OKAY;
-079   \}
-080   #endif
+018   int s_mp_sqr (mp_int * a, mp_int * b)
+019   \{
+020     mp_int  t;
+021     int     res, ix, iy, pa;
+022     mp_word r;
+023     mp_digit u, tmpx, *tmpt;
+024   
+025     pa = a->used;
+026     if ((res = mp_init_size (&t, 2*pa + 1)) != MP_OKAY) \{
+027       return res;
+028     \}
+029   
+030     /* default used is maximum possible size */
+031     t.used = 2*pa + 1;
+032   
+033     for (ix = 0; ix < pa; ix++) \{
+034       /* first calculate the digit at 2*ix */
+035       /* calculate double precision result */
+036       r = ((mp_word) t.dp[2*ix]) +
+037           ((mp_word)a->dp[ix])*((mp_word)a->dp[ix]);
+038   
+039       /* store lower part in result */
+040       t.dp[ix+ix] = (mp_digit) (r & ((mp_word) MP_MASK));
+041   
+042       /* get the carry */
+043       u           = (mp_digit)(r >> ((mp_word) DIGIT_BIT));
+044   
+045       /* left hand side of A[ix] * A[iy] */
+046       tmpx        = a->dp[ix];
+047   
+048       /* alias for where to store the results */
+049       tmpt        = t.dp + (2*ix + 1);
+050       
+051       for (iy = ix + 1; iy < pa; iy++) \{
+052         /* first calculate the product */
+053         r       = ((mp_word)tmpx) * ((mp_word)a->dp[iy]);
+054   
+055         /* now calculate the double precision result, note we use
+056          * addition instead of *2 since it's easier to optimize
+057          */
+058         r       = ((mp_word) *tmpt) + r + r + ((mp_word) u);
+059   
+060         /* store lower part */
+061         *tmpt++ = (mp_digit) (r & ((mp_word) MP_MASK));
+062   
+063         /* get carry */
+064         u       = (mp_digit)(r >> ((mp_word) DIGIT_BIT));
+065       \}
+066       /* propagate upwards */
+067       while (u != ((mp_digit) 0)) \{
+068         r       = ((mp_word) *tmpt) + ((mp_word) u);
+069         *tmpt++ = (mp_digit) (r & ((mp_word) MP_MASK));
+070         u       = (mp_digit)(r >> ((mp_word) DIGIT_BIT));
+071       \}
+072     \}
+073   
+074     mp_clamp (&t);
+075     mp_exch (&t, b);
+076     mp_clear (&t);
+077     return MP_OKAY;
+078   \}
+079   #endif
 \end{alltt}
 \end{small}
 
-Inside the outer loop (\textit{see line 34}) the square term is calculated on line 37.  Line 44 extracts the carry from the square
-term.  Aliases for $a_{ix}$ and $t_{ix+iy}$ are initialized on lines 47 and 50 respectively.  The doubling is performed using two
-additions (\textit{see line 59}) since it is usually faster than shifting,if not at least as fast.  
+Inside the outer loop (line 33) the square term is calculated on line 36.  The carry (line 43) has been
+extracted from the mp\_word accumulator using a right shift.  Aliases for $a_{ix}$ and $t_{ix+iy}$ are initialized 
+(lines 46 and 49) to simplify the inner loop.  The doubling is performed using two
+additions (line 58) since it is usually faster than shifting, if not at least as fast.  
+
+The important observation is that the inner loop does not begin at $iy = 0$ like for multiplication.  As such the inner loops
+get progressively shorter as the algorithm proceeds.  This is what leads to the savings compared to using a multiplication to
+square a number. 
 
 \subsection{Faster Squaring by the ``Comba'' Method}
 A major drawback to the baseline method is the requirement for single precision shifting inside the $O(n^2)$ nested loop.  Squaring has an additional
@@ -4986,9 +5028,9 @@ propagation operations from the inner loop.  However, the inner product must sti
 that $2a + 2b + 2c = 2(a + b + c)$.  That is the sum of all of the double products is equal to double the sum of all the products.  For example,
 $ab + ba + ac + ca = 2ab + 2ac = 2(ab + ac)$.  
 
-However, we cannot simply double all of the columns, since the squares appear only once per row.  The most practical solution is to have two mp\_word
-arrays.  One array will hold the squares and the other array will hold the double products.  With both arrays the doubling and carry propagation can be 
-moved to a $O(n)$ work level outside the $O(n^2)$ level.  
+However, we cannot simply double all of the columns, since the squares appear only once per row.  The most practical solution is to have two 
+mp\_word arrays.  One array will hold the squares and the other array will hold the double products.  With both arrays the doubling and 
+carry propagation can be moved to a $O(n)$ work level outside the $O(n^2)$ level.  In this case, we have an even simpler solution in mind.
 
 \newpage\begin{figure}[!here]
 \begin{small}
@@ -4998,34 +5040,34 @@ moved to a $O(n)$ work level outside the $O(n^2)$ level.
 \textbf{Input}.   mp\_int $a$ \\
 \textbf{Output}.  $b \leftarrow a^2$ \\
 \hline \\
-Place two arrays of \textbf{MP\_WARRAY} mp\_words named $\hat W$ and $\hat {X}$ on the stack. \\
+Place an array of \textbf{MP\_WARRAY} mp\_digits named $W$ on the stack. \\
 1.  If $b.alloc < 2a.used + 1$ then grow $b$ to $2a.used + 1$ digits.  (\textit{mp\_grow}). \\
 2.  If step 1 failed return(\textit{MP\_MEM}). \\
-3.  for $ix$ from $0$ to $2a.used + 1$ do \\
-\hspace{3mm}3.1  $\hat W_{ix} \leftarrow 0$ \\
-\hspace{3mm}3.2  $\hat {X}_{ix} \leftarrow 0$ \\
-4.  for $ix$ from $0$ to $a.used - 1$ do \\
-\hspace{3mm}Compute the square.\\
-\hspace{3mm}4.1  $\hat {X}_{ix+ix} \leftarrow \left ( a_{ix} \right )^2$ \\
 \\
-\hspace{3mm}Compute the double products.\\
-\hspace{3mm}4.2  for $iy$ from $ix + 1$ to $a.used - 1$ do \\
-\hspace{6mm}4.2.1  $\hat W_{ix+iy} \leftarrow \hat W_{ix+iy} + a_{ix}a_{iy}$ \\
-5.  $oldused \leftarrow b.used$ \\
-6.  $b.used \leftarrow 2a.used + 1$ \\
+3.  $pa \leftarrow 2 \cdot a.used$ \\
+4.  $\hat W1 \leftarrow 0$ \\
+5.  for $ix$ from $0$ to $pa - 1$ do \\
+\hspace{3mm}5.1  $\_ \hat W \leftarrow 0$ \\
+\hspace{3mm}5.2  $ty \leftarrow \mbox{MIN}(a.used - 1, ix)$ \\
+\hspace{3mm}5.3  $tx \leftarrow ix - ty$ \\
+\hspace{3mm}5.4  $iy \leftarrow \mbox{MIN}(a.used - tx, ty + 1)$ \\
+\hspace{3mm}5.5  $iy \leftarrow \mbox{MIN}(iy, \lfloor \left (ty - tx + 1 \right )/2 \rfloor)$ \\
+\hspace{3mm}5.6  for $iz$ from $0$ to $iz - 1$ do \\
+\hspace{6mm}5.6.1  $\_ \hat W \leftarrow \_ \hat W + a_{tx + iz}a_{ty - iz}$ \\
+\hspace{3mm}5.7  $\_ \hat W \leftarrow 2 \cdot \_ \hat W  + \hat W1$ \\
+\hspace{3mm}5.8  if $ix$ is even then \\
+\hspace{6mm}5.8.1  $\_ \hat W \leftarrow \_ \hat W + \left ( a_{\lfloor ix/2 \rfloor}\right )^2$ \\
+\hspace{3mm}5.9  $W_{ix} \leftarrow \_ \hat W (\mbox{mod }\beta)$ \\
+\hspace{3mm}5.10  $\hat W1 \leftarrow \lfloor \_ \hat W / \beta \rfloor$ \\
 \\
-Double the products and propagate the carries simultaneously. \\
-7.  $\hat W_0 \leftarrow 2 \hat W_0 + \hat {X}_0$ \\
-8.  for $ix$ from $1$ to $2a.used$ do \\
-\hspace{3mm}8.1 $\hat W_{ix} \leftarrow 2 \hat W_{ix} + \hat {X}_{ix}$ \\
-\hspace{3mm}8.2 $\hat W_{ix} \leftarrow \hat W_{ix} + \lfloor \hat W_{ix - 1} / \beta \rfloor$ \\
-\hspace{3mm}8.3 $b_{ix-1} \leftarrow W_{ix-1} \mbox{ (mod }\beta\mbox{)}$ \\
-9.  $b_{2a.used} \leftarrow \hat W_{2a.used} \mbox{ (mod }\beta\mbox{)}$ \\
-10.  if $2a.used + 1 < oldused$ then do \\
-\hspace{3mm}10.1  for $ix$ from $2a.used + 1$ to $oldused$ do \\
-\hspace{6mm}10.1.1  $b_{ix} \leftarrow 0$ \\
-11.  Clamp excess digits from $b$.  (\textit{mp\_clamp}) \\
-12.  Return(\textit{MP\_OKAY}). \\ 
+6.  $oldused \leftarrow b.used$ \\
+7.  $b.used \leftarrow 2 \cdot a.used$ \\
+8.  for $ix$ from $0$ to $pa - 1$ do \\
+\hspace{3mm}8.1  $b_{ix} \leftarrow W_{ix}$ \\
+9.  for $ix$ from $pa$ to $oldused - 1$ do \\
+\hspace{3mm}9.1  $b_{ix} \leftarrow 0$ \\
+10.  Clamp excess digits from $b$.  (\textit{mp\_clamp}) \\
+11.  Return(\textit{MP\_OKAY}). \\ 
 \hline
 \end{tabular}
 \end{center}
@@ -5034,146 +5076,123 @@ Double the products and propagate the carries simultaneously. \\
 \end{figure}
 
 \textbf{Algorithm fast\_s\_mp\_sqr.}
-This algorithm computes the square of an input using the Comba technique.  It is designed to be a replacement for algorithm s\_mp\_sqr when
-the number of input digits is less than \textbf{MP\_WARRAY} and less than $\delta \over 2$.  
+This algorithm computes the square of an input using the Comba technique.  It is designed to be a replacement for algorithm 
+s\_mp\_sqr when the number of input digits is less than \textbf{MP\_WARRAY} and less than $\delta \over 2$.  
+This algorithm is very similar to the Comba multiplier except with a few key differences we shall make note of.
 
-This routine requires two arrays of mp\_words to be placed on the stack.  The first array $\hat W$ will hold the double products and the second
-array $\hat X$ will hold the squares.  Though only at most $MP\_WARRAY \over 2$ words of $\hat X$ are used, it has proven faster on most 
-processors to simply make it a full size array.
+First, we have an accumulator and carry variables $\_ \hat W$ and $\hat W1$ respectively.  This is because the inner loop
+products are to be doubled.  If we had added the previous carry in we would be doubling too much.  Next we perform an
+addition MIN condition on $iy$ (step 5.5) to prevent overlapping digits.  For example, $a_3 \cdot a_5$ is equal
+$a_5 \cdot a_3$.  Whereas in the multiplication case we would have $5 < a.used$ and $3 \ge 0$ is maintained since we double the sum
+of the products just outside the inner loop we have to avoid doing this.  This is also a good thing since we perform
+fewer multiplications and the routine ends up being faster.
 
-The loop on step 3 will zero the two arrays to prepare them for the squaring step.  Step 4.1 computes the squares of the product.  Note how 
-it simply assigns the value into the $\hat X$ array.  The nested loop on step 4.2 computes the doubles of the products.  This loop
-computes the sum of the products for each column.  They are not doubled until later.
-
-After the squaring loop, the products stored in $\hat W$ musted be doubled and the carries propagated forwards.  It makes sense to do both
-operations at the same time.  The expression $\hat W_{ix} \leftarrow 2 \hat W_{ix} + \hat {X}_{ix}$ computes the sum of the double product and the
-squares in place.  
+Finally the last difference is the addition of the ``square'' term outside the inner loop (step 5.8).  We add in the square
+only to even outputs and it is the square of the term at the $\lfloor ix / 2 \rfloor$ position.
 
 \vspace{+3mm}\begin{small}
 \hspace{-5.1mm}{\bf File}: bn\_fast\_s\_mp\_sqr.c
 \vspace{-3mm}
 \begin{alltt}
 016   
-017   /* fast squaring
-018    *
-019    * This is the comba method where the columns of the product
-020    * are computed first then the carries are computed.  This
-021    * has the effect of making a very simple inner loop that
-022    * is executed the most
-023    *
-024    * W2 represents the outer products and W the inner.
-025    *
-026    * A further optimizations is made because the inner
-027    * products are of the form "A * B * 2".  The *2 part does
-028    * not need to be computed until the end which is good
-029    * because 64-bit shifts are slow!
-030    *
-031    * Based on Algorithm 14.16 on pp.597 of HAC.
-032    *
-033    */
-034   /* the jist of squaring...
-035   
-036   you do like mult except the offset of the tmpx [one that starts closer to ze
-      ro]
-037   can't equal the offset of tmpy.  So basically you set up iy like before then
-       you min it with
-038   (ty-tx) so that it never happens.  You double all those you add in the inner
-       loop
-039   
-040   After that loop you do the squares and add them in.
-041   
-042   Remove W2 and don't memset W
-043   
-044   */
-045   
-046   int fast_s_mp_sqr (mp_int * a, mp_int * b)
-047   \{
-048     int       olduse, res, pa, ix, iz;
-049     mp_digit   W[MP_WARRAY], *tmpx;
-050     mp_word   W1;
-051   
-052     /* grow the destination as required */
-053     pa = a->used + a->used;
-054     if (b->alloc < pa) \{
-055       if ((res = mp_grow (b, pa)) != MP_OKAY) \{
-056         return res;
-057       \}
-058     \}
-059   
-060     /* number of output digits to produce */
-061     W1 = 0;
-062     for (ix = 0; ix < pa; ix++) \{ 
-063         int      tx, ty, iy;
-064         mp_word  _W;
-065         mp_digit *tmpy;
-066   
-067         /* clear counter */
-068         _W = 0;
+017   /* the jist of squaring...
+018    * you do like mult except the offset of the tmpx [one that 
+019    * starts closer to zero] can't equal the offset of tmpy.  
+020    * So basically you set up iy like before then you min it with
+021    * (ty-tx) so that it never happens.  You double all those 
+022    * you add in the inner loop
+023   
+024   After that loop you do the squares and add them in.
+025   */
+026   
+027   int fast_s_mp_sqr (mp_int * a, mp_int * b)
+028   \{
+029     int       olduse, res, pa, ix, iz;
+030     mp_digit   W[MP_WARRAY], *tmpx;
+031     mp_word   W1;
+032   
+033     /* grow the destination as required */
+034     pa = a->used + a->used;
+035     if (b->alloc < pa) \{
+036       if ((res = mp_grow (b, pa)) != MP_OKAY) \{
+037         return res;
+038       \}
+039     \}
+040   
+041     /* number of output digits to produce */
+042     W1 = 0;
+043     for (ix = 0; ix < pa; ix++) \{ 
+044         int      tx, ty, iy;
+045         mp_word  _W;
+046         mp_digit *tmpy;
+047   
+048         /* clear counter */
+049         _W = 0;
+050   
+051         /* get offsets into the two bignums */
+052         ty = MIN(a->used-1, ix);
+053         tx = ix - ty;
+054   
+055         /* setup temp aliases */
+056         tmpx = a->dp + tx;
+057         tmpy = a->dp + ty;
+058   
+059         /* this is the number of times the loop will iterrate, essentially
+060            while (tx++ < a->used && ty-- >= 0) \{ ... \}
+061          */
+062         iy = MIN(a->used-tx, ty+1);
+063   
+064         /* now for squaring tx can never equal ty 
+065          * we halve the distance since they approach at a rate of 2x
+066          * and we have to round because odd cases need to be executed
+067          */
+068         iy = MIN(iy, (ty-tx+1)>>1);
 069   
-070         /* get offsets into the two bignums */
-071         ty = MIN(a->used-1, ix);
-072         tx = ix - ty;
-073   
-074         /* setup temp aliases */
-075         tmpx = a->dp + tx;
-076         tmpy = a->dp + ty;
+070         /* execute loop */
+071         for (iz = 0; iz < iy; iz++) \{
+072            _W += ((mp_word)*tmpx++)*((mp_word)*tmpy--);
+073         \}
+074   
+075         /* double the inner product and add carry */
+076         _W = _W + _W + W1;
 077   
-078         /* this is the number of times the loop will iterrate, essentially its
-       
-079            while (tx++ < a->used && ty-- >= 0) \{ ... \}
-080          */
-081         iy = MIN(a->used-tx, ty+1);
+078         /* even columns have the square term in them */
+079         if ((ix&1) == 0) \{
+080            _W += ((mp_word)a->dp[ix>>1])*((mp_word)a->dp[ix>>1]);
+081         \}
 082   
-083         /* now for squaring tx can never equal ty 
-084          * we halve the distance since they approach at a rate of 2x
-085          * and we have to round because odd cases need to be executed
-086          */
-087         iy = MIN(iy, (ty-tx+1)>>1);
-088   
-089         /* execute loop */
-090         for (iz = 0; iz < iy; iz++) \{
-091            _W += ((mp_word)*tmpx++)*((mp_word)*tmpy--);
-092         \}
+083         /* store it */
+084         W[ix] = (mp_digit)(_W & MP_MASK);
+085   
+086         /* make next carry */
+087         W1 = _W >> ((mp_word)DIGIT_BIT);
+088     \}
+089   
+090     /* setup dest */
+091     olduse  = b->used;
+092     b->used = a->used+a->used;
 093   
-094         /* double the inner product and add carry */
-095         _W = _W + _W + W1;
-096   
-097         /* even columns have the square term in them */
-098         if ((ix&1) == 0) \{
-099            _W += ((mp_word)a->dp[ix>>1])*((mp_word)a->dp[ix>>1]);
-100         \}
-101   
-102         /* store it */
-103         W[ix] = _W & MP_MASK;
-104   
-105         /* make next carry */
-106         W1 = _W >> ((mp_word)DIGIT_BIT);
-107     \}
-108   
-109     /* setup dest */
-110     olduse  = b->used;
-111     b->used = a->used+a->used;
-112   
-113     \{
-114       mp_digit *tmpb;
-115       tmpb = b->dp;
-116       for (ix = 0; ix < pa; ix++) \{
-117         *tmpb++ = W[ix] & MP_MASK;
-118       \}
-119   
-120       /* clear unused digits [that existed in the old copy of c] */
-121       for (; ix < olduse; ix++) \{
-122         *tmpb++ = 0;
-123       \}
-124     \}
-125     mp_clamp (b);
-126     return MP_OKAY;
-127   \}
-128   #endif
+094     \{
+095       mp_digit *tmpb;
+096       tmpb = b->dp;
+097       for (ix = 0; ix < pa; ix++) \{
+098         *tmpb++ = W[ix] & MP_MASK;
+099       \}
+100   
+101       /* clear unused digits [that existed in the old copy of c] */
+102       for (; ix < olduse; ix++) \{
+103         *tmpb++ = 0;
+104       \}
+105     \}
+106     mp_clamp (b);
+107     return MP_OKAY;
+108   \}
+109   #endif
 \end{alltt}
 \end{small}
 
--- Write something deep and insightful later, Tom.
+This implementation is essentially a copy of Comba multiplication with the appropriate changes added to make it faster for 
+the special case of squaring.  
 
 \subsection{Polynomial Basis Squaring}
 The same algorithm that performs optimal polynomial basis multiplication can be used to perform polynomial basis squaring.  The minor exception
@@ -5391,14 +5410,13 @@ By inlining the copy and shift operations the cutoff point for Karatsuba multipl
 is exactly at the point where Comba squaring can no longer be used (\textit{128 digits}).  On slower processors such as the Intel P4
 it is actually below the Comba limit (\textit{at 110 digits}).
 
-This routine uses the same error trap coding style as mp\_karatsuba\_sqr.  As the temporary variables are initialized errors are redirected to
-the error trap higher up.  If the algorithm completes without error the error code is set to \textbf{MP\_OKAY} and mp\_clears are executed normally.
-
-\textit{Last paragraph sucks.  re-write! -- Tom}
+This routine uses the same error trap coding style as mp\_karatsuba\_sqr.  As the temporary variables are initialized errors are 
+redirected to the error trap higher up.  If the algorithm completes without error the error code is set to \textbf{MP\_OKAY} and 
+mp\_clears are executed normally.
 
 \subsection{Toom-Cook Squaring}
 The Toom-Cook squaring algorithm mp\_toom\_sqr is heavily based on the algorithm mp\_toom\_mul with the exception that squarings are used
-instead of multiplication to find the five relations..  The reader is encouraged to read the description of the latter algorithm and try to 
+instead of multiplication to find the five relations.  The reader is encouraged to read the description of the latter algorithm and try to 
 derive their own Toom-Cook squaring algorithm.  
 
 \subsection{High Level Squaring}
@@ -5484,12 +5502,9 @@ neither of the polynomial basis algorithms should be used then either the Comba
 $\left [ 3 \right ] $ & Devise an efficient algorithm for selection of the radix point to handle inputs \\
                       & that have different number of digits in Karatsuba multiplication. \\
                       & \\
-$\left [ 3 \right ] $ & In section 5.3 the fact that every column of a squaring is made up \\
+$\left [ 2 \right ] $ & In section 5.3 the fact that every column of a squaring is made up \\
                       & of double products and at most one square is stated.  Prove this statement. \\
                       & \\                      
-$\left [ 2 \right ] $ & In the Comba squaring algorithm half of the $\hat X$ variables are not used. \\
-                      & Revise algorithm fast\_s\_mp\_sqr to shrink the $\hat X$ array. \\
-                      & \\
 $\left [ 3 \right ] $ & Prove the equation for Karatsuba squaring. \\
                       & \\
 $\left [ 1 \right ] $ & Prove that Karatsuba squaring requires $O \left (n^{lg(3)} \right )$ time. \\
@@ -5497,6 +5512,14 @@ $\left [ 1 \right ] $ & Prove that Karatsuba squaring requires $O \left (n^{lg(3
 $\left [ 2 \right ] $ & Determine the minimal ratio between addition and multiplication clock cycles \\
                       & required for equation $6.7$ to be true.  \\
                       & \\
+$\left [ 3 \right ] $ & Implement a threaded version of Comba multiplication (and squaring) where you \\
+                      & compute subsets of the columns in each thread.  Determine a cutoff point where \\
+                      & it is effective and add the logic to mp\_mul() and mp\_sqr(). \\
+                      &\\
+$\left [ 4 \right ] $ & Same as the previous but also modify the Karatsuba and Toom-Cook.  You must \\
+                      & increase the throughput of mp\_exptmod() for random odd moduli in the range \\
+                      & $512 \ldots 4096$ bits significantly ($> 2x$) to complete this challenge. \\
+                      & \\
 \end{tabular}
 
 \chapter{Modular Reduction}
@@ -5515,7 +5538,7 @@ other forms of residues.
 Modular reductions are normally used to create either finite groups, rings or fields.  The most common usage for performance driven modular reductions 
 is in modular exponentiation algorithms.  That is to compute $d = a^b \mbox{ (mod }c\mbox{)}$ as fast as possible.  This operation is used in the 
 RSA and Diffie-Hellman public key algorithms, for example.  Modular multiplication and squaring also appears as a fundamental operation in 
-Elliptic Curve cryptographic algorithms.  As will be discussed in the subsequent chapter there exist fast algorithms for computing modular 
+elliptic curve cryptographic algorithms.  As will be discussed in the subsequent chapter there exist fast algorithms for computing modular 
 exponentiations without having to perform (\textit{in this example}) $b - 1$ multiplications.  These algorithms will produce partial results in the 
 range $0 \le x < c^2$ which can be taken advantage of to create several efficient algorithms.   They have also been used to create redundancy check 
 algorithms known as CRCs, error correction codes such as Reed-Solomon and solve a variety of number theoeretic problems.  
@@ -5749,11 +5772,11 @@ performed at most twice, and on average once. However, if $a \ge b^2$ than it wi
 038       \}
 039     \} else \{
 040   #ifdef BN_S_MP_MUL_HIGH_DIGS_C
-041       if ((res = s_mp_mul_high_digs (&q, mu, &q, um - 1)) != MP_OKAY) \{
+041       if ((res = s_mp_mul_high_digs (&q, mu, &q, um)) != MP_OKAY) \{
 042         goto CLEANUP;
 043       \}
 044   #elif defined(BN_FAST_S_MP_MUL_HIGH_DIGS_C)
-045       if ((res = fast_s_mp_mul_high_digs (&q, mu, &q, um - 1)) != MP_OKAY) \{
+045       if ((res = fast_s_mp_mul_high_digs (&q, mu, &q, um)) != MP_OKAY) \{
 046         goto CLEANUP;
 047       \}
 048   #else 
@@ -5816,7 +5839,7 @@ safe to do so.
 In order to use algorithm mp\_reduce the value of $\mu$ must be calculated in advance.  Ideally this value should be computed once and stored for
 future use so that the Barrett algorithm can be used without delay.  
 
-\begin{figure}[!here]
+\newpage\begin{figure}[!here]
 \begin{small}
 \begin{center}
 \begin{tabular}{l}
@@ -9293,14 +9316,14 @@ the integers from $0$ to $\beta - 1$.
 028   
 029     /* first place a random non-zero digit */
 030     do \{
-031       d = ((mp_digit) abs (rand ()));
+031       d = ((mp_digit) abs (rand ())) & MP_MASK;
 032     \} while (d == 0);
 033   
 034     if ((res = mp_add_d (a, d, a)) != MP_OKAY) \{
 035       return res;
 036     \}
 037   
-038     while (digits-- > 0) \{
+038     while (--digits > 0) \{
 039       if ((res = mp_lshd (a, 1)) != MP_OKAY) \{
 040         return res;
 041       \}
@@ -9959,6 +9982,8 @@ To explain the Jacobi Symbol we shall first discuss the Legendre function\footno
 defined.  The Legendre function computes whether or not an integer $a$ is a quadratic residue modulo an odd prime $p$.  Numerically it is
 equivalent to equation \ref{eqn:legendre}.
 
+\textit{-- Tom, don't be an ass, cite your source here...!}
+
 \begin{equation}
 a^{(p-1)/2} \equiv \begin{array}{rl}
                               -1 &  \mbox{if }a\mbox{ is a quadratic non-residue.} \\
diff --git a/tommath_class.h b/tommath_class.h
index 0ea8212..6d05b7b 100644
--- a/tommath_class.h
+++ b/tommath_class.h
@@ -137,7 +137,7 @@
    #define BN_MP_ISEVEN_C
    #define BN_MP_INIT_MULTI_C
    #define BN_MP_COPY_C
-   #define BN_MP_ABS_C
+   #define BN_MP_MOD_C
    #define BN_MP_SET_C
    #define BN_MP_DIV_2_C
    #define BN_MP_ISODD_C
@@ -366,6 +366,7 @@
    #define BN_MP_DIV_C
    #define BN_MP_MUL_C
    #define BN_MP_SUB_C
+   #define BN_MP_NEG_C
    #define BN_MP_EXCH_C
    #define BN_MP_CLEAR_MULTI_C
 #endif
@@ -440,6 +441,7 @@
 #if defined(BN_MP_INVMOD_SLOW_C)
    #define BN_MP_ISZERO_C
    #define BN_MP_INIT_MULTI_C
+   #define BN_MP_MOD_C
    #define BN_MP_COPY_C
    #define BN_MP_ISEVEN_C
    #define BN_MP_SET_C