added libtommath-0.13

2003-02-28 16:09:08 +00:00 · 2003-02-28 16:09:08 +00:00 · b66471f74f
commit b66471f74f
parent 57354e11ac
86 changed files with 754 additions and 521 deletions
--- a/bn.pdf
+++ b/bn.pdf
--- a/bn.tex
+++ b/bn.tex
@ -1,7 +1,7 @@
 \documentclass{article}
 \begin{document}

-\title{LibTomMath v0.12 \\ A Free Multiple Precision Integer Library}
+\title{LibTomMath v0.13 \\ A Free Multiple Precision Integer Library}
 \author{Tom St Denis \\ tomstdenis@iahu.ca}
 \maketitle
 \newpage
@ -409,7 +409,6 @@ of $c$ is the maximum length of the two inputs.
 Computes $c = a \land b$, pseudo-extends with zeroes whichever of $a$ or $b$ is shorter such that the length
 of $c$ is the maximum length of the two inputs.

-
 \subsection{Basic Arithmetic}

 \subsubsection{mp\_cmp(mp\_int *a, mp\_int *b)}
@ -440,19 +439,18 @@ This function requires no additional memory and $O(N)$ time.
 Computes $c = a \cdot b$ using signed arithmetic.  Handles the sign of the numbers correctly which means it will 
 correct the sign of the product as required, e.g. $a \cdot -b$ turns into $-ab$.

-For relatively small inputs, that is less than 80 digits a standard baseline or comba-baseline multiplier is used.  It
-requires no additional memory and $O(N^2)$ time.  The comba-baseline multiplier is only used if it can safely be used
-without losing carry digits.  The comba method is faster than the baseline method but cannot always be used which is why
-both are provided.  The code will automatically determine when it can be used.  If the digit count is higher
-than 80 for the inputs than a Karatsuba multiplier is used which requires approximately $O(6 \cdot N)$ memory and 
-$O(N^{lg(3)})$ time.  
+This function requires $O(N^2)$ time for small inputs and $O(N^{1.584})$ time for relatively large 
+inputs (\textit{above the }KARATSUBA\_MUL\_CUTOFF \textit{value defined in bncore.c.}).  There is 
+considerable overhead in the Karatsuba method which only pays off when the digit count is fairly high
+(\textit{typically around 80}).  For small inputs the function requires $O(2N)$ memory, otherwise it
+requires $O(6 \cdot \mbox{lg}(N) \cdot N)$ memory.
+

 \subsubsection{mp\_sqr(mp\_int *a, mp\_int *b)}
-Computes $b = a^2$. 
-For relatively small inputs, that is less than 80 digits a modified squaring or comba-squaring algorithm is used.  It
-requires no additional memory and $O((N^2 + N)/2)$ time.  The comba-squaring method is used only if it can be safely used
-without losing carry digits.  After 80 digits a Karatsuba squaring algorithm is used whcih requires approximately 
-$O(4 \cdot N)$ memory and $O(N^{lg(3)})$ time.
+Computes $b = a^2$ and fixes the sign of $b$ to be positive.
+
+This function has a running time and memory requirement profile very similar to that of the 
+mp\_mul function.  It is always faster and uses less memory for the larger inputs.

 \subsubsection{mp\_div(mp\_int *a, mp\_int *b, mp\_int *c, mp\_int *d)}
 Computes $c = \lfloor a/b \rfloor$ and $d \equiv a \mbox{ (mod }b\mbox{)}$.  The division is signed which means the sign
@ -482,7 +480,8 @@ Also note that these functions use mp\_mod which means the result are guaranteed

 \subsubsection{mp\_invmod(mp\_int *a, mp\_int *b, mp\_int *c)}
 This function will find $c = 1/a \mbox{ (mod }b\mbox{)}$ for any value of $a$ such that $(a, b) = 1$ and $b > 0$.  When
-$b$ is odd a ``fast'' variant is used which finds the inverse twice as fast.  
+$b$ is odd a ``fast'' variant is used which finds the inverse twice as fast.  If no inverse is found (e.g. $(a, b) \ne 1$) then
+the function returns \textbf{MP\_VAL} and the result in $c$ is undefined.

 \subsubsection{mp\_gcd(mp\_int *a, mp\_int *b, mp\_int *c)}
 Finds the greatest common divisor of both $a$ and $b$ and places the result in $c$.  Will work with either positive
@ -497,13 +496,13 @@ both.

 Functions requires no additional memory and approximately $O(4 \cdot N^2)$ time.

-\subsubsection{mp\_n\_root(mp\_int *a, mp\_digit b, mp\_int c)}
+\subsubsection{mp\_n\_root(mp\_int *a, mp\_digit b, mp\_int *c)}
 Finds the $b$'th root of $a$ and stores it in $b$.  The roots are found such that $\vert c \vert^b \le \vert a \vert$.  
 Uses the Newton approximation approach which means it converges in $O(log \beta^N)$ time to a final result.  Each iteration
 requires $b$ multiplications and one division for a total work of $O(6N^2 \cdot log \beta^N) = O(6N^3 \cdot log \beta)$.

-If the input $a$ is negative and $b$ is even the function returns an error.  Otherwise the function will return a root
-that has a sign that agrees with the sign of $a$.
+If the input $a$ is negative and $b$ is even the function returns \textbf{MP\_VAL}.  Otherwise the function will 
+return a root that has a sign that agrees with the sign of $a$.

 \subsubsection{mp\_jacobi(mp\_int *a, mp\_int *n, int *c)}
 Computes $c = \left ( {a \over n} \right )$ or the Jacobi function of $(a, n)$ and stores the result in an integer addressed
--- a/bn_fast_mp_invmod.c
+++ b/bn_fast_mp_invmod.c
@ -14,13 +14,17 @@
 */
 #include <tommath.h>

-/* computes the modular inverse via binary extended euclidean algorithm, that is c = 1/a mod b */
+/* computes the modular inverse via binary extended euclidean algorithm, 
+ * that is c = 1/a mod b 
+ *
+ * Based on mp_invmod except this is optimized for the case where b is 
+ * odd as per HAC Note 14.64 on pp. 610
+ */
 int
 fast_mp_invmod (mp_int * a, mp_int * b, mp_int * c)
 {
-  mp_int    x, y, u, v, B, D;
-  int       res, neg;
-
+  mp_int  x, y, u, v, B, D;
+  int     res, neg;

  if ((res = mp_init (&x)) != MP_OKAY) {
    goto __ERR;
@ -58,7 +62,10 @@ fast_mp_invmod (mp_int * a, mp_int * b, mp_int * c)
    goto __D;
  }

-  /* 2. [modified] if x,y are both even then return an error! */
+  /* 2. [modified] if x,y are both even then return an error! 
+   * 
+   * That is if gcd(x,y) = 2 * k then obviously there is no inverse.
+   */
  if (mp_iseven (&x) == 1 && mp_iseven (&y) == 1) {
    res = MP_VAL;
    goto __D;
@ -135,8 +142,9 @@ top:
  }

  /* if not zero goto step 4 */
-  if (mp_iszero (&u) == 0)
+  if (mp_iszero (&u) == 0) {
    goto top;
+  }

  /* now a = C, b = D, gcd == g*v */

--- a/bn_fast_mp_montgomery_reduce.c
+++ b/bn_fast_mp_montgomery_reduce.c
@ -14,12 +14,19 @@
 */
 #include <tommath.h>

-/* computes xR^-1 == x (mod N) via Montgomery Reduction (comba) */
+/* computes xR^-1 == x (mod N) via Montgomery Reduction 
+ * 
+ * This is an optimized implementation of mp_montgomery_reduce 
+ * which uses the comba method to quickly calculate the columns of the
+ * reduction.  
+ *
+ * Based on Algorithm 14.32 on pp.601 of HAC.
+*/
 int
 fast_mp_montgomery_reduce (mp_int * a, mp_int * m, mp_digit mp)
 {
-  int       ix, res, olduse;
-  mp_word   W[512];
+  int     ix, res, olduse;
+  mp_word W[512];

  /* get old used count */
  olduse = a->used;
@ -31,14 +38,22 @@ fast_mp_montgomery_reduce (mp_int * a, mp_int * m, mp_digit mp)
    }
  }

-  /* copy the digits of a */
-  for (ix = 0; ix < a->used; ix++) {
-    W[ix] = a->dp[ix];
-  }
+  {
+    register mp_word *_W;
+    register mp_digit *tmpa;

-  /* zero the high words */
-  for (; ix < m->used * 2 + 1; ix++) {
-    W[ix] = 0;
+    _W = W;
+    tmpa = a->dp;
+
+    /* copy the digits of a */
+    for (ix = 0; ix < a->used; ix++) {
+      *_W++ = *tmpa++;
+    }
+
+    /* zero the high words */
+    for (; ix < m->used * 2 + 1; ix++) {
+      *_W++ = 0;
+    }
  }

  for (ix = 0; ix < m->used; ix++) {
@ -69,8 +84,10 @@ fast_mp_montgomery_reduce (mp_int * a, mp_int * m, mp_digit mp)
      register mp_digit *tmpx;
      register mp_word *_W;

-      /* aliases */
+      /* alias for the digits of the modulus */
      tmpx = m->dp;
+
+      /* Alias for the columns set by an offset of ix */
      _W = W + ix;

      /* inner loop */
@ -88,24 +105,32 @@ fast_mp_montgomery_reduce (mp_int * a, mp_int * m, mp_digit mp)
    W[ix] += (W[ix - 1] >> ((mp_word) DIGIT_BIT));
  }

-  /* copy out, A = A/b^n
-   *
-   * The result is A/b^n but instead of converting from an array of mp_word
-   * to mp_digit than calling mp_rshd we just copy them in the right
-   * order
-   */
-  for (ix = 0; ix < m->used + 1; ix++) {
-    a->dp[ix] = W[ix + m->used] & ((mp_word) MP_MASK);
+  {
+    register mp_digit *tmpa;
+    register mp_word *_W;
+
+    /* copy out, A = A/b^n
+     *
+     * The result is A/b^n but instead of converting from an array of mp_word
+     * to mp_digit than calling mp_rshd we just copy them in the right
+     * order
+     */
+    tmpa = a->dp;
+    _W = W + m->used;
+
+    for (ix = 0; ix < m->used + 1; ix++) {
+      *tmpa++ = *_W++ & ((mp_word) MP_MASK);
+    }
+
+    /* zero oldused digits, if the input a was larger than
+     * m->used+1 we'll have to clear the digits */
+    for (; ix < olduse; ix++) {
+      *tmpa++ = 0;
+    }
  }

-  /* set the max used */
+  /* set the max used and clamp */
  a->used = m->used + 1;
-
-  /* zero oldused digits, if the input a was larger than
-   * m->used+1 we'll have to clear the digits */
-  for (; ix < olduse; ix++) {
-    a->dp[ix] = 0;
-  }
  mp_clamp (a);

  /* if A >= m then A = A - m */
--- a/bn_fast_s_mp_mul_digs.c
+++ b/bn_fast_s_mp_mul_digs.c
@ -21,13 +21,20 @@
 * has the effect of making the nested loops that compute the columns very
 * simple and schedulable on super-scalar processors.
 *
+ * This has been modified to produce a variable number of digits of output so
+ * if say only a half-product is required you don't have to compute the upper half
+ * (a feature required for fast Barrett reduction).
+ *
+ * Based on Algorithm 14.12 on pp.595 of HAC.
+ *
 */
 int
 fast_s_mp_mul_digs (mp_int * a, mp_int * b, mp_int * c, int digs)
 {
-  int       olduse, res, pa, ix;
-  mp_word   W[512];
+  int     olduse, res, pa, ix;
+  mp_word W[512];

+  /* grow the destination as required */
  if (c->alloc < digs) {
    if ((res = mp_grow (c, digs)) != MP_OKAY) {
      return res;
@ -43,11 +50,9 @@ fast_s_mp_mul_digs (mp_int * a, mp_int * b, mp_int * c, int digs)

    /* this multiplier has been modified to allow you to control how many digits 
     * of output are produced.  So at most we want to make upto "digs" digits
-     * of output
-     */
-
-
-    /* this adds products to distinct columns (at ix+iy) of W
+     * of output.
+     *
+     * this adds products to distinct columns (at ix+iy) of W
     * note that each step through the loop is not dependent on
     * the previous which means the compiler can easily unroll
     * the loop without scheduling problems
@ -85,27 +90,30 @@ fast_s_mp_mul_digs (mp_int * a, mp_int * b, mp_int * c, int digs)
  olduse = c->used;
  c->used = digs;

+  {
+    register mp_digit *tmpc;

-  /* At this point W[] contains the sums of each column.  To get the
-   * correct result we must take the extra bits from each column and
-   * carry them down
-   *
-   * Note that while this adds extra code to the multiplier it saves time
-   * since the carry propagation is removed from the above nested loop.
-   * This has the effect of reducing the work from N*(N+N*c)==N^2 + c*N^2 to
-   * N^2 + N*c where c is the cost of the shifting.  On very small numbers
-   * this is slower but on most cryptographic size numbers it is faster.
-   */
+    /* At this point W[] contains the sums of each column.  To get the
+     * correct result we must take the extra bits from each column and
+     * carry them down
+     *
+     * Note that while this adds extra code to the multiplier it saves time
+     * since the carry propagation is removed from the above nested loop.
+     * This has the effect of reducing the work from N*(N+N*c)==N^2 + c*N^2 to
+     * N^2 + N*c where c is the cost of the shifting.  On very small numbers
+     * this is slower but on most cryptographic size numbers it is faster.
+     */
+    tmpc = c->dp;
+    for (ix = 1; ix < digs; ix++) {
+      W[ix] += (W[ix - 1] >> ((mp_word) DIGIT_BIT));
+      *tmpc++ = (mp_digit) (W[ix - 1] & ((mp_word) MP_MASK));
+    }
+    *tmpc++ = (mp_digit) (W[digs - 1] & ((mp_word) MP_MASK));

-  for (ix = 1; ix < digs; ix++) {
-    W[ix] += (W[ix - 1] >> ((mp_word) DIGIT_BIT));
-    c->dp[ix - 1] = (mp_digit) (W[ix - 1] & ((mp_word) MP_MASK));
-  }
-  c->dp[digs - 1] = (mp_digit) (W[digs - 1] & ((mp_word) MP_MASK));
-
-  /* clear unused */
-  for (; ix < olduse; ix++) {
-    c->dp[ix] = 0;
+    /* clear unused */
+    for (; ix < olduse; ix++) {
+      *tmpc++ = 0;
+    }
  }

  mp_clamp (c);
--- a/bn_fast_s_mp_mul_high_digs.c
+++ b/bn_fast_s_mp_mul_high_digs.c
@ -20,14 +20,16 @@
 *
 * This is used in the Barrett reduction since for one of the multiplications
 * only the higher digits were needed.  This essentially halves the work.
+ *
+ * Based on Algorithm 14.12 on pp.595 of HAC.
 */
 int
 fast_s_mp_mul_high_digs (mp_int * a, mp_int * b, mp_int * c, int digs)
 {
-  int       oldused, newused, res, pa, pb, ix;
-  mp_word   W[512];
-
+  int     oldused, newused, res, pa, pb, ix;
+  mp_word W[512];

+  /* calculate size of product and allocate more space if required */
  newused = a->used + b->used + 1;
  if (c->alloc < newused) {
    if ((res = mp_grow (c, newused)) != MP_OKAY) {
@ -38,7 +40,7 @@ fast_s_mp_mul_high_digs (mp_int * a, mp_int * b, mp_int * c, int digs)
  /* like the other comba method we compute the columns first */
  pa = a->used;
  pb = b->used;
-  memset (&W[digs], 0, (pa + pb + 1 - digs) * sizeof (mp_word));
+  memset (W + digs, 0, (pa + pb + 1 - digs) * sizeof (mp_word));
  for (ix = 0; ix < pa; ix++) {
    {
      register mp_digit tmpx, *tmpy;
@ -75,8 +77,7 @@ fast_s_mp_mul_high_digs (mp_int * a, mp_int * b, mp_int * c, int digs)
    W[ix] += (W[ix - 1] >> ((mp_word) DIGIT_BIT));
    c->dp[ix - 1] = (mp_digit) (W[ix - 1] & ((mp_word) MP_MASK));
  }
-  c->dp[(pa + pb + 1) - 1] =
-    (mp_digit) (W[(pa + pb + 1) - 1] & ((mp_word) MP_MASK));
+  c->dp[(pa + pb + 1) - 1] = (mp_digit) (W[(pa + pb + 1) - 1] & ((mp_word) MP_MASK));

  for (; ix < oldused; ix++) {
    c->dp[ix] = 0;
--- a/bn_fast_s_mp_sqr.c
+++ b/bn_fast_s_mp_sqr.c
@ -26,14 +26,16 @@
 * "A * B * 2".  The *2 part does not need to be computed until the end which is
 * good because 64-bit shifts are slow!
 *
+ * Based on Algorithm 14.16 on pp.597 of HAC.
 *
 */
 int
 fast_s_mp_sqr (mp_int * a, mp_int * b)
 {
-  int       olduse, newused, res, ix, pa;
-  mp_word   W2[512], W[512];
+  int     olduse, newused, res, ix, pa;
+  mp_word W2[512], W[512];

+  /* calculate size of product and allocate as required */
  pa = a->used;
  newused = pa + pa + 1;
  if (b->alloc < newused) {
@ -51,15 +53,31 @@ fast_s_mp_sqr (mp_int * a, mp_int * b)
   * the inner product can be doubled using n doublings instead of
   * n^2
   */
-  memset (W, 0,  newused * sizeof (mp_word));
+  memset (W, 0, newused * sizeof (mp_word));
  memset (W2, 0, newused * sizeof (mp_word));

+/* note optimization
+ * values in W2 are only written in even locations which means
+ * we can collapse the array to 256 words [and fixup the memset above]
+ * provided we also fix up the summations below.  Ideally
+ * the fixup loop should be unrolled twice to handle the even/odd 
+ * cases, and then a final step to handle odd cases [e.g. newused == odd]
+ *
+ * This will not only save ~8*256 = 2KB of stack but lower the number of
+ * operations required to finally fix up the columns
+ */
+
  /* This computes the inner product.  To simplify the inner N^2 loop
   * the multiplication by two is done afterwards in the N loop.
   */
  for (ix = 0; ix < pa; ix++) {
-     /* compute the outer product */
-    W2[ix + ix] += ((mp_word) a->dp[ix]) * ((mp_word) a->dp[ix]);
+    /* compute the outer product 
+     *
+     * Note that every outer product is computed 
+     * for a particular column only once which means that 
+     * there is no need todo a double precision addition
+     */
+    W2[ix + ix] = ((mp_word) a->dp[ix]) * ((mp_word) a->dp[ix]);

    {
      register mp_digit tmpx, *tmpy;
@ -90,22 +108,25 @@ fast_s_mp_sqr (mp_int * a, mp_int * b)
  W[0] += W[0] + W2[0];

  /* now compute digits */
-  for (ix = 1; ix < newused; ix++) {
-    /* double/add next digit */
-    W[ix] += W[ix] + W2[ix];
+  {
+    register mp_digit *tmpb;

-    W[ix] = W[ix] + (W[ix - 1] >> ((mp_word) DIGIT_BIT));
-    b->dp[ix - 1] = (mp_digit) (W[ix - 1] & ((mp_word) MP_MASK));
+    tmpb = b->dp;
+
+    for (ix = 1; ix < newused; ix++) {
+      /* double/add next digit */
+      W[ix] += W[ix] + W2[ix];
+
+      W[ix] = W[ix] + (W[ix - 1] >> ((mp_word) DIGIT_BIT));
+      *tmpb++ = (mp_digit) (W[ix - 1] & ((mp_word) MP_MASK));
+    }
+    *tmpb++ = (mp_digit) (W[(newused) - 1] & ((mp_word) MP_MASK));
+
+    /* clear high */
+    for (; ix < olduse; ix++) {
+      *tmpb++ = 0;
+    }
  }
-  b->dp[(newused) - 1] = (mp_digit) (W[(newused) - 1] & ((mp_word) MP_MASK));
-
-  /* clear high */
-  for (; ix < olduse; ix++) {
-    b->dp[ix] = 0;
-  }
-
-  /* fix the sign (since we no longer make a fresh temp) */
-  b->sign = MP_ZPOS;

  mp_clamp (b);
  return MP_OKAY;
--- a/bn_mp_2expt.c
+++ b/bn_mp_2expt.c
@ -14,11 +14,15 @@
 */
 #include <tommath.h>

-/* computes a = 2^b */
+/* computes a = 2^b 
+ *
+ * Simple algorithm which zeroes the int, grows it then just sets one bit
+ * as required.
+ */
 int
 mp_2expt (mp_int * a, int b)
 {
-  int       res;
+  int     res;

  mp_zero (a);
  if ((res = mp_grow (a, b / DIGIT_BIT + 1)) != MP_OKAY) {
--- a/bn_mp_abs.c
+++ b/bn_mp_abs.c
@ -14,11 +14,14 @@
 */
 #include <tommath.h>

-/* b = |a| */
+/* b = |a| 
+ *
+ * Simple function copies the input and fixes the sign to positive
+ */
 int
 mp_abs (mp_int * a, mp_int * b)
 {
-  int       res;
+  int     res;
  if ((res = mp_copy (a, b)) != MP_OKAY) {
    return res;
  }
--- a/bn_mp_add.c
+++ b/bn_mp_add.c
@ -18,9 +18,9 @@
 int
 mp_add (mp_int * a, mp_int * b, mp_int * c)
 {
-  int       sa, sb, res;
-
+  int     sa, sb, res;

+  /* get sign of both inputs */
  sa = a->sign;
  sb = b->sign;

--- a/bn_mp_add_d.c
+++ b/bn_mp_add_d.c
@ -18,9 +18,8 @@
 int
 mp_add_d (mp_int * a, mp_digit b, mp_int * c)
 {
-  mp_int    t;
-  int       res;
-
+  mp_int  t;
+  int     res;

  if ((res = mp_init (&t)) != MP_OKAY) {
    return res;
--- a/bn_mp_addmod.c
+++ b/bn_mp_addmod.c
@ -18,9 +18,8 @@
 int
 mp_addmod (mp_int * a, mp_int * b, mp_int * c, mp_int * d)
 {
-  int       res;
-  mp_int    t;
-
+  int     res;
+  mp_int  t;

  if ((res = mp_init (&t)) != MP_OKAY) {
    return res;
--- a/bn_mp_and.c
+++ b/bn_mp_and.c
@ -18,8 +18,8 @@
 int
 mp_and (mp_int * a, mp_int * b, mp_int * c)
 {
-  int       res, ix, px;
-  mp_int    t, *x;
+  int     res, ix, px;
+  mp_int  t, *x;

  if (a->used > b->used) {
    if ((res = mp_init_copy (&t, a)) != MP_OKAY) {
--- a/bn_mp_clamp.c
+++ b/bn_mp_clamp.c
@ -14,7 +14,13 @@
 */
 #include <tommath.h>

-/* trim unused digits */
+/* trim unused digits 
+ *
+ * This is used to ensure that leading zero digits are
+ * trimed and the leading "used" digit will be non-zero
+ * Typically very fast.  Also fixes the sign if there
+ * are no more leading digits
+ */
 void
 mp_clamp (mp_int * a)
 {
--- a/bn_mp_cmp.c
+++ b/bn_mp_cmp.c
@ -18,13 +18,11 @@
 int
 mp_cmp (mp_int * a, mp_int * b)
 {
-  int       res;
  /* compare based on sign */
  if (a->sign == MP_NEG && b->sign == MP_ZPOS) {
    return MP_LT;
  } else if (a->sign == MP_ZPOS && b->sign == MP_NEG) {
    return MP_GT;
  }
-  res = mp_cmp_mag (a, b);
-  return res;
+  return mp_cmp_mag (a, b);
 }
--- a/bn_mp_cmp_mag.c
+++ b/bn_mp_cmp_mag.c
@ -18,8 +18,7 @@
 int
 mp_cmp_mag (mp_int * a, mp_int * b)
 {
-  int       n;
-
+  int     n;

  /* compare based on # of non-zero digits */
  if (a->used > b->used) {
--- a/bn_mp_copy.c
+++ b/bn_mp_copy.c
@ -18,8 +18,7 @@
 int
 mp_copy (mp_int * a, mp_int * b)
 {
-  int       res, n;
-
+  int     res, n;

  /* if dst == src do nothing */
  if (a == b || a->dp == b->dp) {
@ -35,14 +34,21 @@ mp_copy (mp_int * a, mp_int * b)
  b->used = a->used;
  b->sign = a->sign;

-  /* copy all the digits */
-  for (n = 0; n < a->used; n++) {
-    b->dp[n] = a->dp[n];
-  }
+  {
+    register mp_digit *tmpa, *tmpb;

-  /* clear high digits */
-  for (n = b->used; n < b->alloc; n++) {
-    b->dp[n] = 0;
+    tmpa = a->dp;
+    tmpb = b->dp;
+
+    /* copy all the digits */
+    for (n = 0; n < a->used; n++) {
+      *tmpb++ = *tmpa++;
+    }
+
+    /* clear high digits */
+    for (; n < b->alloc; n++) {
+      *tmpb++ = 0;
+    }
  }
  return MP_OKAY;
 }
--- a/bn_mp_count_bits.c
+++ b/bn_mp_count_bits.c
@ -18,8 +18,8 @@
 int
 mp_count_bits (mp_int * a)
 {
-  int       r;
-  mp_digit  q;
+  int     r;
+  mp_digit q;

  if (a->used == 0) {
    return 0;
--- a/bn_mp_div.c
+++ b/bn_mp_div.c
@ -26,8 +26,8 @@
 int
 mp_div (mp_int * a, mp_int * b, mp_int * c, mp_int * d)
 {
-  mp_int    q, x, y, t1, t2;
-  int       res, n, t, i, norm, neg;
+  mp_int  q, x, y, t1, t2;
+  int     res, n, t, i, norm, neg;


  /* is divisor zero ? */
@ -75,13 +75,12 @@ mp_div (mp_int * a, mp_int * b, mp_int * c, mp_int * d)

  /* normalize both x and y, ensure that y >= b/2, [b == 2^DIGIT_BIT] */
  norm = 0;
-  while ((y.dp[y.used - 1] & (((mp_digit) 1) << (DIGIT_BIT - 1))) ==
-	 ((mp_digit) 0)) {
+  while ((y.dp[y.used - 1] & (((mp_digit) 1) << (DIGIT_BIT - 1))) == ((mp_digit) 0)) {
    ++norm;
-    if ((res = mp_mul_2d (&x, 1, &x)) != MP_OKAY) {
+    if ((res = mp_mul_2 (&x, &x)) != MP_OKAY) {
      goto __Y;
    }
-    if ((res = mp_mul_2d (&y, 1, &y)) != MP_OKAY) {
+    if ((res = mp_mul_2 (&y, &y)) != MP_OKAY) {
      goto __Y;
    }
  }
@ -114,7 +113,7 @@ mp_div (mp_int * a, mp_int * b, mp_int * c, mp_int * d)
    if (x.dp[i] == y.dp[t]) {
      q.dp[i - t - 1] = ((1UL << DIGIT_BIT) - 1UL);
    } else {
-      mp_word   tmp;
+      mp_word tmp;
      tmp = ((mp_word) x.dp[i]) << ((mp_word) DIGIT_BIT);
      tmp |= ((mp_word) x.dp[i - 1]);
      tmp /= ((mp_word) y.dp[t]);
@ -142,8 +141,7 @@ mp_div (mp_int * a, mp_int * b, mp_int * c, mp_int * d)
      t2.dp[1] = (i - 1 < 0) ? 0 : x.dp[i - 1];
      t2.dp[2] = x.dp[i];
      t2.used = 3;
-    }
-    while (mp_cmp (&t1, &t2) == MP_GT);
+    } while (mp_cmp (&t1, &t2) == MP_GT);

    /* step 3.3 x = x - q{i-t-1} * y * b^{i-t-1} */
    if ((res = mp_mul_d (&y, q.dp[i - t - 1], &t1)) != MP_OKAY) {
--- a/bn_mp_div_2.c
+++ b/bn_mp_div_2.c
@ -18,20 +18,33 @@
 int
 mp_div_2 (mp_int * a, mp_int * b)
 {
-  mp_digit  r, rr;
-  int       x, res;
-
+  int     x, res, oldused;

  /* copy */
-  if ((res = mp_copy (a, b)) != MP_OKAY) {
-    return res;
+  if (b->alloc < a->used) {
+    if ((res = mp_grow (b, a->used)) != MP_OKAY) {
+      return res;
+    }
  }

-  r = 0;
-  for (x = b->used - 1; x >= 0; x--) {
-    rr = b->dp[x] & 1;
-    b->dp[x] = (b->dp[x] >> 1) | (r << (DIGIT_BIT - 1));
-    r = rr;
+  oldused = b->used;
+  b->used = a->used;
+  {
+    register mp_digit r, rr, *tmpa, *tmpb;
+
+    tmpa = a->dp + b->used - 1;
+    tmpb = b->dp + b->used - 1;
+    r = 0;
+    for (x = b->used - 1; x >= 0; x--) {
+      rr = *tmpa & 1;
+      *tmpb-- = (*tmpa-- >> 1) | (r << (DIGIT_BIT - 1));
+      r = rr;
+    }
+
+    tmpb = b->dp + b->used;
+    for (x = b->used; x < oldused; x++) {
+      *tmpb++ = 0;
+    }
  }
  mp_clamp (b);
  return MP_OKAY;
--- a/bn_mp_div_2d.c
+++ b/bn_mp_div_2d.c
@ -18,9 +18,9 @@
 int
 mp_div_2d (mp_int * a, int b, mp_int * c, mp_int * d)
 {
-  mp_digit  D, r, rr;
-  int       x, res;
-  mp_int    t;
+  mp_digit D, r, rr;
+  int     x, res;
+  mp_int  t;


  /* if the shift count is <= 0 then we do no work */
--- a/bn_mp_div_d.c
+++ b/bn_mp_div_d.c
@ -18,8 +18,8 @@
 int
 mp_div_d (mp_int * a, mp_digit b, mp_int * c, mp_digit * d)
 {
-  mp_int    t, t2;
-  int       res;
+  mp_int  t, t2;
+  int     res;


  if ((res = mp_init (&t)) != MP_OKAY) {
--- a/bn_mp_exch.c
+++ b/bn_mp_exch.c
@ -17,7 +17,7 @@
 void
 mp_exch (mp_int * a, mp_int * b)
 {
-  mp_int    t;
+  mp_int  t;

  t = *a;
  *a = *b;
--- a/bn_mp_expt_d.c
+++ b/bn_mp_expt_d.c
@ -17,8 +17,8 @@
 int
 mp_expt_d (mp_int * a, mp_digit b, mp_int * c)
 {
-  int       res, x;
-  mp_int    g;
+  int     res, x;
+  mp_int  g;


  if ((res = mp_init_copy (&g, a)) != MP_OKAY) {
--- a/bn_mp_exptmod.c
+++ b/bn_mp_exptmod.c
@ -14,19 +14,30 @@
 */
 #include <tommath.h>

+static int f_mp_exptmod (mp_int * G, mp_int * X, mp_int * P, mp_int * Y);
+
+/* this is a shell function that calls either the normal or Montgomery
+ * exptmod functions.  Originally the call to the montgomery code was 
+ * embedded in the normal function but that wasted alot of stack space
+ * for nothing (since 99% of the time the Montgomery code would be called)
+ */
 int
 mp_exptmod (mp_int * G, mp_int * X, mp_int * P, mp_int * Y)
 {
-  mp_int    M[256], res, mu;
-  mp_digit  buf;
-  int       err, bitbuf, bitcpy, bitcnt, mode, digidx, x, y, winsize;
-
-
  /* if the modulus is odd use the fast method */
  if (mp_isodd (P) == 1 && P->used > 4 && P->used < MONTGOMERY_EXPT_CUTOFF) {
-    err = mp_exptmod_fast (G, X, P, Y);
-    return err;
+    return mp_exptmod_fast (G, X, P, Y);
+  } else {
+    return f_mp_exptmod (G, X, P, Y);
  }
+}
+
+static int
+f_mp_exptmod (mp_int * G, mp_int * X, mp_int * P, mp_int * Y)
+{
+  mp_int  M[256], res, mu;
+  mp_digit buf;
+  int     err, bitbuf, bitcpy, bitcnt, mode, digidx, x, y, winsize;

  /* find window size */
  x = mp_count_bits (X);
@ -80,9 +91,7 @@ mp_exptmod (mp_int * G, mp_int * X, mp_int * P, mp_int * Y)
  }

  for (x = 0; x < (winsize - 1); x++) {
-    if ((err =
-	 mp_sqr (&M[1 << (winsize - 1)],
-		 &M[1 << (winsize - 1)])) != MP_OKAY) {
+    if ((err = mp_sqr (&M[1 << (winsize - 1)], &M[1 << (winsize - 1)])) != MP_OKAY) {
      goto __MU;
    }
    if ((err = mp_reduce (&M[1 << (winsize - 1)], P, &mu)) != MP_OKAY) {
--- a/bn_mp_exptmod_fast.c
+++ b/bn_mp_exptmod_fast.c
@ -24,9 +24,9 @@
 int
 mp_exptmod_fast (mp_int * G, mp_int * X, mp_int * P, mp_int * Y)
 {
-  mp_int    M[256], res;
-  mp_digit  buf, mp;
-  int       err, bitbuf, bitcpy, bitcnt, mode, digidx, x, y, winsize;
+  mp_int  M[256], res;
+  mp_digit buf, mp;
+  int     err, bitbuf, bitcpy, bitcnt, mode, digidx, x, y, winsize;

  /* find window size */
  x = mp_count_bits (X);
@ -48,7 +48,7 @@ mp_exptmod_fast (mp_int * G, mp_int * X, mp_int * P, mp_int * Y)

  /* init G array */
  for (x = 0; x < (1 << winsize); x++) {
-    if ((err = mp_init_size (&M[x], 1)) != MP_OKAY) {
+    if ((err = mp_init (&M[x])) != MP_OKAY) {
      for (y = 0; y < x; y++) {
 	mp_clear (&M[y]);
      }
@ -66,44 +66,32 @@ mp_exptmod_fast (mp_int * G, mp_int * X, mp_int * P, mp_int * Y)
    goto __RES;
  }

-  /* now we need R mod m */
-  if ((err = mp_2expt (&res, P->used * DIGIT_BIT)) != MP_OKAY) {
-    goto __RES;
-  }
-
-  /* res = R mod m (can use modified double/subtract ...) */
-  if ((err = mp_mod (&res, P, &res)) != MP_OKAY) {
-    goto __RES;
-  }
-
  /* create M table
   *
   * The M table contains powers of the input base, e.g. M[x] = G^x mod P
   *
   * The first half of the table is not computed though accept for M[0] and M[1]
   */
-  if ((err = mp_mod (G, P, &M[1])) != MP_OKAY) {
+
+  /* now we need R mod m */
+  if ((err = mp_montgomery_calc_normalization (&res, P)) != MP_OKAY) {
    goto __RES;
  }

  /* now set M[1] to G * R mod m */
-  if ((err = mp_mulmod (&M[1], &res, P, &M[1])) != MP_OKAY) {
+  if ((err = mp_mulmod (G, &res, P, &M[1])) != MP_OKAY) {
    goto __RES;
  }
-
  /* compute the value at M[1<<(winsize-1)] by squaring M[1] (winsize-1) times */
  if ((err = mp_copy (&M[1], &M[1 << (winsize - 1)])) != MP_OKAY) {
    goto __RES;
  }

  for (x = 0; x < (winsize - 1); x++) {
-    if ((err =
-	 mp_sqr (&M[1 << (winsize - 1)],
-		 &M[1 << (winsize - 1)])) != MP_OKAY) {
+    if ((err = mp_sqr (&M[1 << (winsize - 1)], &M[1 << (winsize - 1)])) != MP_OKAY) {
      goto __RES;
    }
-    if ((err =
-	 mp_montgomery_reduce (&M[1 << (winsize - 1)], P, mp)) != MP_OKAY) {
+    if ((err = mp_montgomery_reduce (&M[1 << (winsize - 1)], P, mp)) != MP_OKAY) {
      goto __RES;
    }
  }
--- a/bn_mp_gcd.c
+++ b/bn_mp_gcd.c
@ -19,8 +19,8 @@
 int
 mp_gcd (mp_int * a, mp_int * b, mp_int * c)
 {
-  mp_int    u, v, t;
-  int       k, res, neg;
+  mp_int  u, v, t;
+  int     k, res, neg;


  /* either zero than gcd is the largest */
@ -57,10 +57,10 @@ mp_gcd (mp_int * a, mp_int * b, mp_int * c)
  k = 0;
  while ((u.dp[0] & 1) == 0 && (v.dp[0] & 1) == 0) {
    ++k;
-    if ((res = mp_div_2d (&u, 1, &u, NULL)) != MP_OKAY) {
+    if ((res = mp_div_2 (&u, &u)) != MP_OKAY) {
      goto __T;
    }
-    if ((res = mp_div_2d (&v, 1, &v, NULL)) != MP_OKAY) {
+    if ((res = mp_div_2 (&v, &v)) != MP_OKAY) {
      goto __T;
    }
  }
@ -80,7 +80,7 @@ mp_gcd (mp_int * a, mp_int * b, mp_int * c)
  do {
    /* B3 (and B4).  Halve t, if even */
    while (t.used != 0 && (t.dp[0] & 1) == 0) {
-      if ((res = mp_div_2d (&t, 1, &t, NULL)) != MP_OKAY) {
+      if ((res = mp_div_2 (&t, &t)) != MP_OKAY) {
 	goto __T;
      }
    }
--- a/bn_mp_grow.c
+++ b/bn_mp_grow.c
@ -18,8 +18,7 @@
 int
 mp_grow (mp_int * a, int size)
 {
-  int       i, n;
-
+  int     i, n;

  /* if the alloc size is smaller alloc more ram */
  if (a->alloc < size) {
--- a/bn_mp_init_copy.c
+++ b/bn_mp_init_copy.c
@ -18,11 +18,10 @@
 int
 mp_init_copy (mp_int * a, mp_int * b)
 {
-  int       res;
+  int     res;

  if ((res = mp_init (a)) != MP_OKAY) {
    return res;
  }
-  res = mp_copy (b, a);
-  return res;
+  return mp_copy (b, a);
 }
--- a/bn_mp_invmod.c
+++ b/bn_mp_invmod.c
@ -17,9 +17,8 @@
 int
 mp_invmod (mp_int * a, mp_int * b, mp_int * c)
 {
-  mp_int    x, y, u, v, A, B, C, D;
-  int       res;
-
+  mp_int  x, y, u, v, A, B, C, D;
+  int     res;

  /* b cannot be negative */
  if (b->sign == MP_NEG) {
@ -28,8 +27,7 @@ mp_invmod (mp_int * a, mp_int * b, mp_int * c)

  /* if the modulus is odd we can use a faster routine instead */
  if (mp_iseven (b) == 0) {
-    res = fast_mp_invmod (a, b, c);
-    return res;
+    return fast_mp_invmod (a, b, c);
  }

  if ((res = mp_init (&x)) != MP_OKAY) {
--- a/bn_mp_jacobi.c
+++ b/bn_mp_jacobi.c
@ -20,9 +20,9 @@
 int
 mp_jacobi (mp_int * a, mp_int * n, int *c)
 {
-  mp_int    a1, n1, e;
-  int       s, r, res;
-  mp_digit  residue;
+  mp_int  a1, n1, e;
+  int     s, r, res;
+  mp_digit residue;

  /* step 1.  if a == 0, return 0 */
  if (mp_iszero (a) == 1) {
--- a/bn_mp_karatsuba_mul.c
+++ b/bn_mp_karatsuba_mul.c
@ -36,8 +36,8 @@
 int
 mp_karatsuba_mul (mp_int * a, mp_int * b, mp_int * c)
 {
-  mp_int    x0, x1, y0, y1, t1, t2, x0y0, x1y1;
-  int       B, err, x;
+  mp_int  x0, x1, y0, y1, t1, t2, x0y0, x1y1;
+  int     B, err, x;


  err = MP_MEM;
--- a/bn_mp_karatsuba_sqr.c
+++ b/bn_mp_karatsuba_sqr.c
@ -22,8 +22,8 @@
 int
 mp_karatsuba_sqr (mp_int * a, mp_int * b)
 {
-  mp_int    x0, x1, t1, t2, x0x0, x1x1;
-  int       B, err, x;
+  mp_int  x0, x1, t1, t2, x0x0, x1x1;
+  int     B, err, x;


  err = MP_MEM;
--- a/bn_mp_lcm.c
+++ b/bn_mp_lcm.c
@ -18,8 +18,8 @@
 int
 mp_lcm (mp_int * a, mp_int * b, mp_int * c)
 {
-  int       res;
-  mp_int    t;
+  int     res;
+  mp_int  t;


  if ((res = mp_init (&t)) != MP_OKAY) {
--- a/bn_mp_lshd.c
+++ b/bn_mp_lshd.c
@ -18,7 +18,7 @@
 int
 mp_lshd (mp_int * a, int b)
 {
-  int       x, res;
+  int     x, res;


  /* if its less than zero return */
--- a/bn_mp_mod.c
+++ b/bn_mp_mod.c
@ -18,8 +18,8 @@
 int
 mp_mod (mp_int * a, mp_int * b, mp_int * c)
 {
-  mp_int    t;
-  int       res;
+  mp_int  t;
+  int     res;


  if ((res = mp_init (&t)) != MP_OKAY) {
--- a/bn_mp_mod_2d.c
+++ b/bn_mp_mod_2d.c
@ -18,7 +18,7 @@
 int
 mp_mod_2d (mp_int * a, int b, mp_int * c)
 {
-  int       x, res;
+  int     x, res;


  /* if b is <= 0 then zero the int */
@ -44,8 +44,7 @@ mp_mod_2d (mp_int * a, int b, mp_int * c)
  }
  /* clear the digit that is not completely outside/inside the modulus */
  c->dp[b / DIGIT_BIT] &=
-    (mp_digit) ((((mp_digit) 1) << (((mp_digit) b) % DIGIT_BIT)) -
-		((mp_digit) 1));
+    (mp_digit) ((((mp_digit) 1) << (((mp_digit) b) % DIGIT_BIT)) - ((mp_digit) 1));
  mp_clamp (c);
  return MP_OKAY;
 }
--- a/bn_mp_mod_d.c
+++ b/bn_mp_mod_d.c
@ -17,8 +17,8 @@
 int
 mp_mod_d (mp_int * a, mp_digit b, mp_digit * c)
 {
-  mp_int    t, t2;
-  int       res;
+  mp_int  t, t2;
+  int     res;


  if ((res = mp_init (&t)) != MP_OKAY) {
--- a/bn_mp_montgomery_calc_normalization.c
+++ b/bn_mp_montgomery_calc_normalization.c
@ -0,0 +1,53 @@
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is library that provides for multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library is designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://libtommath.iahu.ca
+ */
+#include <tommath.h>
+
+/* calculates a = B^n mod b for Montgomery reduction
+ * Where B is the base [e.g. 2^DIGIT_BIT].  
+ * B^n mod b is computed by first computing
+ * A = B^(n-1) which doesn't require a reduction but a simple OR.
+ * then C = A * B = B^n is computed by performing upto DIGIT_BIT 
+ * shifts with subtractions when the result is greater than b.
+ *
+ * The method is slightly modified to shift B unconditionally upto just under
+ * the leading bit of b.  This saves alot of multiple precision shifting.
+ */
+int
+mp_montgomery_calc_normalization (mp_int * a, mp_int * b)
+{
+  int     x, bits, res;
+
+  /* how many bits of last digit does b use */
+  bits = mp_count_bits (b) % DIGIT_BIT;
+
+  /* compute A = B^(n-1) * 2^(bits-1) */
+  if ((res = mp_2expt (a, (b->used - 1) * DIGIT_BIT + bits - 1)) != MP_OKAY) {
+    return res;
+  }
+
+  /* now compute C = A * B mod b */
+  for (x = bits - 1; x < DIGIT_BIT; x++) {
+    if ((res = mp_mul_2 (a, a)) != MP_OKAY) {
+      return res;
+    }
+    if (mp_cmp_mag (a, b) != MP_LT) {
+      if ((res = s_mp_sub (a, b, a)) != MP_OKAY) {
+	return res;
+      }
+    }
+  }
+
+  return MP_OKAY;
+}
--- a/bn_mp_montgomery_reduce.c
+++ b/bn_mp_montgomery_reduce.c
@ -18,14 +18,13 @@
 int
 mp_montgomery_reduce (mp_int * a, mp_int * m, mp_digit mp)
 {
-  int       ix, res, digs;
-  mp_digit  ui;
+  int     ix, res, digs;
+  mp_digit ui;

  digs = m->used * 2 + 1;
  if ((digs < 512)
      && digs < (1 << ((CHAR_BIT * sizeof (mp_word)) - (2 * DIGIT_BIT)))) {
-    res = fast_mp_montgomery_reduce (a, m, mp);
-    return res;
+    return fast_mp_montgomery_reduce (a, m, mp);
  }

  if (a->alloc < m->used * 2 + 1) {
@ -51,9 +50,7 @@ mp_montgomery_reduce (mp_int * a, mp_int * m, mp_digit mp)

      mu = 0;
      for (iy = 0; iy < m->used; iy++) {
-	r =
-	  ((mp_word) ui) * ((mp_word) * tmpx++) + ((mp_word) mu) +
-	  ((mp_word) * tmpy);
+	r = ((mp_word) ui) * ((mp_word) * tmpx++) + ((mp_word) mu) + ((mp_word) * tmpy);
 	mu = (r >> ((mp_word) DIGIT_BIT));
 	*tmpy++ = (r & ((mp_word) MP_MASK));
      }
@ -71,9 +68,7 @@ mp_montgomery_reduce (mp_int * a, mp_int * m, mp_digit mp)

  /* if A >= m then A = A - m */
  if (mp_cmp_mag (a, m) != MP_LT) {
-    if ((res = s_mp_sub (a, m, a)) != MP_OKAY) {
-      return res;
-    }
+    return s_mp_sub (a, m, a);
  }

  return MP_OKAY;
--- a/bn_mp_montgomery_setup.c
+++ b/bn_mp_montgomery_setup.c
@ -18,8 +18,8 @@
 int
 mp_montgomery_setup (mp_int * a, mp_digit * mp)
 {
-  mp_int    t, tt;
-  int       res;
+  mp_int  t, tt;
+  int     res;

  if ((res = mp_init (&t)) != MP_OKAY) {
    return res;
--- a/bn_mp_mul.c
+++ b/bn_mp_mul.c
@ -18,12 +18,26 @@
 int
 mp_mul (mp_int * a, mp_int * b, mp_int * c)
 {
-  int       res, neg;
+  int     res, neg;
  neg = (a->sign == b->sign) ? MP_ZPOS : MP_NEG;
  if (MIN (a->used, b->used) > KARATSUBA_MUL_CUTOFF) {
    res = mp_karatsuba_mul (a, b, c);
  } else {
-    res = s_mp_mul (a, b, c);
+
+    /* can we use the fast multiplier? 
+     *
+     * The fast multiplier can be used if the output will have less than 
+     * 512 digits and the number of digits won't affect carry propagation
+     */
+    int     digs = a->used + b->used + 1;
+
+    if ((digs < 512)
+	&& digs < (1 << ((CHAR_BIT * sizeof (mp_word)) - (2 * DIGIT_BIT)))) {
+      res = fast_s_mp_mul_digs (a, b, c, digs);
+    } else {
+      res = s_mp_mul (a, b, c);
+    }
+
  }
  c->sign = neg;
  return res;
--- a/bn_mp_mul_2.c
+++ b/bn_mp_mul_2.c
@ -18,27 +18,48 @@
 int
 mp_mul_2 (mp_int * a, mp_int * b)
 {
-  mp_digit  r, rr;
-  int       x, res;
+  int     x, res, oldused;

+  /* Optimization: should copy and shift at the same time */

-  /* copy */
-  if ((res = mp_copy (a, b)) != MP_OKAY) {
-    return res;
+  if (b->alloc < a->used) {
+    if ((res = mp_grow (b, a->used)) != MP_OKAY) {
+      return res;
+    }
  }

-  if ((res = mp_grow (b, b->used + 1)) != MP_OKAY) {
-    return res;
-  }
-  ++b->used;
+  oldused = b->used;
+  b->used = a->used;

  /* shift any bit count < DIGIT_BIT */
-  r = 0;
-  for (x = 0; x < b->used; x++) {
-    rr = (b->dp[x] >> (DIGIT_BIT - 1)) & 1;
-    b->dp[x] = ((b->dp[x] << 1) | r) & MP_MASK;
-    r = rr;
+  {
+    register mp_digit r, rr, *tmpa, *tmpb;
+
+    r = 0;
+    tmpa = a->dp;
+    tmpb = b->dp;
+    for (x = 0; x < b->used; x++) {
+      rr = *tmpa >> (DIGIT_BIT - 1);
+      *tmpb++ = ((*tmpa++ << 1) | r) & MP_MASK;
+      r = rr;
+    }
+
+    /* new leading digit? */
+    if (r != 0) {
+      if (b->alloc == b->used) {
+	if ((res = mp_grow (b, b->used + 1)) != MP_OKAY) {
+	  return res;
+	}
+      }
+      /* add a MSB of 1 */
+      *tmpb = 1;
+      ++b->used;
+    }
+
+    tmpb = b->dp + b->used;
+    for (x = b->used; x < oldused; x++) {
+      *tmpb++ = 0;
+    }
  }
-  mp_clamp (b);
  return MP_OKAY;
 }
--- a/bn_mp_mul_2d.c
+++ b/bn_mp_mul_2d.c
@ -18,8 +18,8 @@
 int
 mp_mul_2d (mp_int * a, int b, mp_int * c)
 {
-  mp_digit  d, r, rr;
-  int       x, res;
+  mp_digit d, r, rr;
+  int     x, res;


  /* copy */
--- a/bn_mp_mul_d.c
+++ b/bn_mp_mul_d.c
@ -18,29 +18,40 @@
 int
 mp_mul_d (mp_int * a, mp_digit b, mp_int * c)
 {
-  int       res, pa, ix;
-  mp_word   r;
-  mp_digit  u;
-  mp_int    t;
-
+  int     res, pa, olduse;

  pa = a->used;
-  if ((res = mp_init_size (&t, pa + 2)) != MP_OKAY) {
-    return res;
+  if (c->alloc < pa + 1) {
+    if ((res = mp_grow (c, pa + 1)) != MP_OKAY) {
+      return res;
+    }
  }
-  t.used = pa + 2;

-  u = 0;
-  for (ix = 0; ix < pa; ix++) {
-    r = ((mp_word) u) + ((mp_word) a->dp[ix]) * ((mp_word) b);
-    t.dp[ix] = (mp_digit) (r & ((mp_word) MP_MASK));
-    u = (mp_digit) (r >> ((mp_word) DIGIT_BIT));
+  olduse = c->used;
+  c->used = pa + 1;
+
+  {
+    register mp_digit u, *tmpa, *tmpc;
+    register mp_word r;
+    register int ix;
+
+    tmpc = c->dp + c->used;
+    for (ix = c->used; ix < olduse; ix++) {
+      *tmpc++ = 0;
+    }
+
+    tmpa = a->dp;
+    tmpc = c->dp;
+
+    u = 0;
+    for (ix = 0; ix < pa; ix++) {
+      r = ((mp_word) u) + ((mp_word) * tmpa++) * ((mp_word) b);
+      *tmpc++ = (mp_digit) (r & ((mp_word) MP_MASK));
+      u = (mp_digit) (r >> ((mp_word) DIGIT_BIT));
+    }
+    *tmpc = u;
  }
-  t.dp[ix] = u;

-  t.sign = a->sign;
-  mp_clamp (&t);
-  mp_exch (&t, c);
-  mp_clear (&t);
+  mp_clamp (c);
  return MP_OKAY;
 }
--- a/bn_mp_mulmod.c
+++ b/bn_mp_mulmod.c
@ -18,8 +18,8 @@
 int
 mp_mulmod (mp_int * a, mp_int * b, mp_int * c, mp_int * d)
 {
-  int       res;
-  mp_int    t;
+  int     res;
+  mp_int  t;


  if ((res = mp_init (&t)) != MP_OKAY) {
--- a/bn_mp_n_root.c
+++ b/bn_mp_n_root.c
@ -17,12 +17,16 @@
 /* find the n'th root of an integer 
 *
 * Result found such that (c)^b <= a and (c+1)^b > a 
+ *
+ * This algorithm uses Newton's approximation x[i+1] = x[i] - f(x[i])/f'(x[i]) 
+ * which will find the root in log(N) time where each step involves a fair bit.  This
+ * is not meant to find huge roots [square and cube at most].
 */
 int
 mp_n_root (mp_int * a, mp_digit b, mp_int * c)
 {
-  mp_int    t1, t2, t3;
-  int       res, neg;
+  mp_int  t1, t2, t3;
+  int     res, neg;

  /* input must be positive if b is even */
  if ((b & 1) == 0 && a->sign == MP_NEG) {
--- a/bn_mp_neg.c
+++ b/bn_mp_neg.c
@ -18,7 +18,7 @@
 int
 mp_neg (mp_int * a, mp_int * b)
 {
-  int       res;
+  int     res;
  if ((res = mp_copy (a, b)) != MP_OKAY) {
    return res;
  }
--- a/bn_mp_or.c
+++ b/bn_mp_or.c
@ -18,8 +18,8 @@
 int
 mp_or (mp_int * a, mp_int * b, mp_int * c)
 {
-  int res, ix, px;
-  mp_int t, *x;
+  int     res, ix, px;
+  mp_int  t, *x;

  if (a->used > b->used) {
    if ((res = mp_init_copy (&t, a)) != MP_OKAY) {
--- a/bn_mp_rand.c
+++ b/bn_mp_rand.c
@ -18,8 +18,8 @@
 int
 mp_rand (mp_int * a, int digits)
 {
-  int       res;
-  mp_digit  d;
+  int     res;
+  mp_digit d;

  mp_zero (a);
  if (digits <= 0) {
@ -27,19 +27,20 @@ mp_rand (mp_int * a, int digits)
  }

  /* first place a random non-zero digit */
-  d = ((mp_digit) abs (rand ()));
-  d = d == 0 ? 1 : d;
+  do {
+    d = ((mp_digit) abs (rand ()));
+  } while (d == 0);

  if ((res = mp_add_d (a, d, a)) != MP_OKAY) {
    return res;
  }

-
  while (digits-- > 0) {
-    if ((res = mp_add_d (a, ((mp_digit) abs (rand ())), a)) != MP_OKAY) {
+    if ((res = mp_lshd (a, 1)) != MP_OKAY) {
      return res;
    }
-    if ((res = mp_lshd (a, 1)) != MP_OKAY) {
+
+    if ((res = mp_add_d (a, ((mp_digit) abs (rand ())), a)) != MP_OKAY) {
      return res;
    }
  }
--- a/bn_mp_read_signed_bin.c
+++ b/bn_mp_read_signed_bin.c
@ -18,7 +18,7 @@
 int
 mp_read_signed_bin (mp_int * a, unsigned char *b, int c)
 {
-  int       res;
+  int     res;

  if ((res = mp_read_unsigned_bin (a, b + 1, c - 1)) != MP_OKAY) {
    return res;
--- a/bn_mp_read_unsigned_bin.c
+++ b/bn_mp_read_unsigned_bin.c
@ -18,7 +18,7 @@
 int
 mp_read_unsigned_bin (mp_int * a, unsigned char *b, int c)
 {
-  int       res;
+  int     res;
  mp_zero (a);
  while (c-- > 0) {
    if ((res = mp_mul_2d (a, 8, a)) != MP_OKAY) {
--- a/bn_mp_reduce.c
+++ b/bn_mp_reduce.c
@ -20,7 +20,7 @@
 int
 mp_reduce_setup (mp_int * a, mp_int * b)
 {
-  int       res;
+  int     res;


  if ((res = mp_2expt (a, b->used * 2 * DIGIT_BIT)) != MP_OKAY) {
@ -36,8 +36,8 @@ mp_reduce_setup (mp_int * a, mp_int * b)
 int
 mp_reduce (mp_int * x, mp_int * m, mp_int * mu)
 {
-  mp_int    q;
-  int       res, um = m->used;
+  mp_int  q;
+  int     res, um = m->used;


  if ((res = mp_init_copy (&q, x)) != MP_OKAY) {
--- a/bn_mp_rshd.c
+++ b/bn_mp_rshd.c
@ -18,7 +18,7 @@
 void
 mp_rshd (mp_int * a, int b)
 {
-  int       x;
+  int     x;


  /* if b <= 0 then ignore it */
--- a/bn_mp_set_int.c
+++ b/bn_mp_set_int.c
@ -18,7 +18,7 @@
 int
 mp_set_int (mp_int * a, unsigned long b)
 {
-  int       x, res;
+  int     x, res;

  mp_zero (a);

--- a/bn_mp_sqr.c
+++ b/bn_mp_sqr.c
@ -18,11 +18,18 @@
 int
 mp_sqr (mp_int * a, mp_int * b)
 {
-  int       res;
+  int     res;
  if (a->used > KARATSUBA_SQR_CUTOFF) {
    res = mp_karatsuba_sqr (a, b);
  } else {
-    res = s_mp_sqr (a, b);
+
+    /* can we use the fast multiplier? */
+    if (((a->used * 2 + 1) < 512)
+	&& a->used < (1 << ((CHAR_BIT * sizeof (mp_word)) - (2 * DIGIT_BIT) - 1))) {
+      res = fast_s_mp_sqr (a, b);
+    } else {
+      res = s_mp_sqr (a, b);
+    }
  }
  b->sign = MP_ZPOS;
  return res;
--- a/bn_mp_sqrmod.c
+++ b/bn_mp_sqrmod.c
@ -18,8 +18,8 @@
 int
 mp_sqrmod (mp_int * a, mp_int * b, mp_int * c)
 {
-  int       res;
-  mp_int    t;
+  int     res;
+  mp_int  t;


  if ((res = mp_init (&t)) != MP_OKAY) {
--- a/bn_mp_sub.c
+++ b/bn_mp_sub.c
@ -18,7 +18,7 @@
 int
 mp_sub (mp_int * a, mp_int * b, mp_int * c)
 {
-  int       sa, sb, res;
+  int     sa, sb, res;


  sa = a->sign;
--- a/bn_mp_sub_d.c
+++ b/bn_mp_sub_d.c
@ -18,8 +18,8 @@
 int
 mp_sub_d (mp_int * a, mp_digit b, mp_int * c)
 {
-  mp_int    t;
-  int       res;
+  mp_int  t;
+  int     res;


  if ((res = mp_init (&t)) != MP_OKAY) {
--- a/bn_mp_submod.c
+++ b/bn_mp_submod.c
@ -18,8 +18,8 @@
 int
 mp_submod (mp_int * a, mp_int * b, mp_int * c, mp_int * d)
 {
-  int       res;
-  mp_int    t;
+  int     res;
+  mp_int  t;


  if ((res = mp_init (&t)) != MP_OKAY) {
--- a/bn_mp_to_signed_bin.c
+++ b/bn_mp_to_signed_bin.c
@ -18,7 +18,7 @@
 int
 mp_to_signed_bin (mp_int * a, unsigned char *b)
 {
-  int       res;
+  int     res;

  if ((res = mp_to_unsigned_bin (a, b + 1)) != MP_OKAY) {
    return res;
--- a/bn_mp_to_unsigned_bin.c
+++ b/bn_mp_to_unsigned_bin.c
@ -18,8 +18,8 @@
 int
 mp_to_unsigned_bin (mp_int * a, unsigned char *b)
 {
-  int       x, res;
-  mp_int    t;
+  int     x, res;
+  mp_int  t;

  if ((res = mp_init_copy (&t, a)) != MP_OKAY) {
    return res;
--- a/bn_mp_unsigned_bin_size.c
+++ b/bn_mp_unsigned_bin_size.c
@ -18,6 +18,6 @@
 int
 mp_unsigned_bin_size (mp_int * a)
 {
-  int       size = mp_count_bits (a);
+  int     size = mp_count_bits (a);
  return (size / 8 + ((size & 7) != 0 ? 1 : 0));
 }
--- a/bn_mp_xor.c
+++ b/bn_mp_xor.c
@ -18,8 +18,8 @@
 int
 mp_xor (mp_int * a, mp_int * b, mp_int * c)
 {
-  int       res, ix, px;
-  mp_int    t, *x;
+  int     res, ix, px;
+  mp_int  t, *x;

  if (a->used > b->used) {
    if ((res = mp_init_copy (&t, a)) != MP_OKAY) {
--- a/bn_radix.c
+++ b/bn_radix.c
@ -15,16 +15,15 @@
 #include <tommath.h>

 /* chars used in radix conversions */
-static const char *s_rmap =
-  "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz+/";
+static const char *s_rmap = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz+/";


 /* read a string [ASCII] in a given radix */
 int
 mp_read_radix (mp_int * a, char *str, int radix)
 {
-  int       y, res, neg;
-  char      ch;
+  int     y, res, neg;
+  char    ch;

  if (radix < 2 || radix > 64) {
    return MP_VAL;
@ -66,10 +65,10 @@ mp_read_radix (mp_int * a, char *str, int radix)
 int
 mp_toradix (mp_int * a, char *str, int radix)
 {
-  int       res, digs;
-  mp_int    t;
-  mp_digit  d;
-  char     *_s = str;
+  int     res, digs;
+  mp_int  t;
+  mp_digit d;
+  char   *_s = str;

  if (radix < 2 || radix > 64) {
    return MP_VAL;
@ -104,9 +103,9 @@ mp_toradix (mp_int * a, char *str, int radix)
 int
 mp_radix_size (mp_int * a, int radix)
 {
-  int       res, digs;
-  mp_int    t;
-  mp_digit  d;
+  int     res, digs;
+  mp_int  t;
+  mp_digit d;

  /* special case for binary */
  if (radix == 2) {
--- a/bn_reverse.c
+++ b/bn_reverse.c
@ -18,7 +18,7 @@
 void
 bn_reverse (unsigned char *s, int len)
 {
-  int       ix, iy;
+  int     ix, iy;
  unsigned char t;

  ix = 0;
--- a/bn_s_mp_add.c
+++ b/bn_s_mp_add.c
@ -18,10 +18,8 @@
 int
 s_mp_add (mp_int * a, mp_int * b, mp_int * c)
 {
-  mp_int   *x;
-  int       olduse, res, min, max, i;
-  mp_digit  u;
-
+  mp_int *x;
+  int     olduse, res, min, max;

  /* find sizes, we let |a| <= |b| which means we have to sort
   * them.  "x" will point to the input with the most digits
@ -52,38 +50,48 @@ s_mp_add (mp_int * a, mp_int * b, mp_int * c)
  /* add digits from lower part */

  /* set the carry to zero */
-  u = 0;
-  for (i = 0; i < min; i++) {
-    /* Compute the sum at one digit, T[i] = A[i] + B[i] + U */
-    c->dp[i] = a->dp[i] + b->dp[i] + u;
+  {
+    register mp_digit u, *tmpa, *tmpb, *tmpc;
+    register int i;

-    /* U = carry bit of T[i] */
-    u = (c->dp[i] >> DIGIT_BIT) & 1;
+    /* alias for digit pointers */
+    tmpa = a->dp;
+    tmpb = b->dp;
+    tmpc = c->dp;

-    /* take away carry bit from T[i] */
-    c->dp[i] &= MP_MASK;
-  }
-
-  /* now copy higher words if any, that is in A+B if A or B has more digits add those in */
-  if (min != max) {
-    for (; i < max; i++) {
-      /* T[i] = X[i] + U */
-      c->dp[i] = x->dp[i] + u;
+    u = 0;
+    for (i = 0; i < min; i++) {
+      /* Compute the sum at one digit, T[i] = A[i] + B[i] + U */
+      *tmpc = *tmpa++ + *tmpb++ + u;

      /* U = carry bit of T[i] */
-      u = (c->dp[i] >> DIGIT_BIT) & 1;
+      u = *tmpc >> DIGIT_BIT;

      /* take away carry bit from T[i] */
-      c->dp[i] &= MP_MASK;
+      *tmpc++ &= MP_MASK;
    }
-  }

-  /* add carry */
-  c->dp[i] = u;
+    /* now copy higher words if any, that is in A+B if A or B has more digits add those in */
+    if (min != max) {
+      for (; i < max; i++) {
+	/* T[i] = X[i] + U */
+	*tmpc = x->dp[i] + u;

-  /* clear digits above used (since we may not have grown result above) */
-  for (i = c->used; i < olduse; i++) {
-    c->dp[i] = 0;
+	/* U = carry bit of T[i] */
+	u = *tmpc >> DIGIT_BIT;
+
+	/* take away carry bit from T[i] */
+	*tmpc++ &= MP_MASK;
+      }
+    }
+
+    /* add carry */
+    *tmpc++ = u;
+
+    /* clear digits above used (since we may not have grown result above) */
+    for (i = c->used; i < olduse; i++) {
+      *tmpc++ = 0;
+    }
  }

  mp_clamp (c);
--- a/bn_s_mp_mul_digs.c
+++ b/bn_s_mp_mul_digs.c
@ -21,23 +21,11 @@
 int
 s_mp_mul_digs (mp_int * a, mp_int * b, mp_int * c, int digs)
 {
-  mp_int    t;
-  int       res, pa, pb, ix, iy;
-  mp_digit  u;
-  mp_word   r;
-  mp_digit  tmpx, *tmpt, *tmpy;
-
-
-  /* can we use the fast multiplier? 
-   *
-   * The fast multiplier can be used if the output will have less than 
-   * 512 digits and the number of digits won't affect carry propagation
-   */
-  if ((digs < 512)
-      && digs < (1 << ((CHAR_BIT * sizeof (mp_word)) - (2 * DIGIT_BIT)))) {
-    res = fast_s_mp_mul_digs (a, b, c, digs);
-    return res;
-  }
+  mp_int  t;
+  int     res, pa, pb, ix, iy;
+  mp_digit u;
+  mp_word r;
+  mp_digit tmpx, *tmpt, *tmpy;

  if ((res = mp_init_size (&t, digs)) != MP_OKAY) {
    return res;
@ -61,9 +49,7 @@ s_mp_mul_digs (mp_int * a, mp_int * b, mp_int * c, int digs)
    /* compute the columns of the output and propagate the carry */
    for (iy = 0; iy < pb; iy++) {
      /* compute the column as a mp_word */
-      r =
-	((mp_word) * tmpt) + ((mp_word) tmpx) * ((mp_word) * tmpy++) +
-	((mp_word) u);
+      r = ((mp_word) * tmpt) + ((mp_word) tmpx) * ((mp_word) * tmpy++) + ((mp_word) u);

      /* the new column is the lower part of the result */
      *tmpt++ = (mp_digit) (r & ((mp_word) MP_MASK));
--- a/bn_s_mp_mul_high_digs.c
+++ b/bn_s_mp_mul_high_digs.c
@ -20,20 +20,17 @@
 int
 s_mp_mul_high_digs (mp_int * a, mp_int * b, mp_int * c, int digs)
 {
-  mp_int    t;
-  int       res, pa, pb, ix, iy;
-  mp_digit  u;
-  mp_word   r;
-  mp_digit  tmpx, *tmpt, *tmpy;
+  mp_int  t;
+  int     res, pa, pb, ix, iy;
+  mp_digit u;
+  mp_word r;
+  mp_digit tmpx, *tmpt, *tmpy;


  /* can we use the fast multiplier? */
  if (((a->used + b->used + 1) < 512)
-      && MAX (a->used,
-	      b->used) <
-      (1 << ((CHAR_BIT * sizeof (mp_word)) - (2 * DIGIT_BIT)))) {
-    res = fast_s_mp_mul_high_digs (a, b, c, digs);
-    return res;
+      && MAX (a->used, b->used) < (1 << ((CHAR_BIT * sizeof (mp_word)) - (2 * DIGIT_BIT)))) {
+    return fast_s_mp_mul_high_digs (a, b, c, digs);
  }

  if ((res = mp_init_size (&t, a->used + b->used + 1)) != MP_OKAY) {
@ -58,9 +55,7 @@ s_mp_mul_high_digs (mp_int * a, mp_int * b, mp_int * c, int digs)

    for (iy = digs - ix; iy < pb; iy++) {
      /* calculate the double precision result */
-      r =
-	((mp_word) * tmpt) + ((mp_word) tmpx) * ((mp_word) * tmpy++) +
-	((mp_word) u);
+      r = ((mp_word) * tmpt) + ((mp_word) tmpx) * ((mp_word) * tmpy++) + ((mp_word) u);

      /* get the lower part */
      *tmpt++ = (mp_digit) (r & ((mp_word) MP_MASK));
--- a/bn_s_mp_sqr.c
+++ b/bn_s_mp_sqr.c
@ -18,18 +18,10 @@
 int
 s_mp_sqr (mp_int * a, mp_int * b)
 {
-  mp_int    t;
-  int       res, ix, iy, pa;
-  mp_word   r, u;
-  mp_digit  tmpx, *tmpt;
-
-  /* can we use the fast multiplier? */
-  if (((a->used * 2 + 1) < 512)
-      && a->used <
-      (1 << ((CHAR_BIT * sizeof (mp_word)) - (2 * DIGIT_BIT) - 1))) {
-    res = fast_s_mp_sqr (a, b);
-    return res;
-  }
+  mp_int  t;
+  int     res, ix, iy, pa;
+  mp_word r, u;
+  mp_digit tmpx, *tmpt;

  pa = a->used;
  if ((res = mp_init_size (&t, pa + pa + 1)) != MP_OKAY) {
@ -40,9 +32,7 @@ s_mp_sqr (mp_int * a, mp_int * b)
  for (ix = 0; ix < pa; ix++) {
    /* first calculate the digit at 2*ix */
    /* calculate double precision result */
-    r =
-      ((mp_word) t.dp[ix + ix]) +
-      ((mp_word) a->dp[ix]) * ((mp_word) a->dp[ix]);
+    r = ((mp_word) t.dp[ix + ix]) + ((mp_word) a->dp[ix]) * ((mp_word) a->dp[ix]);

    /* store lower part in result */
    t.dp[ix + ix] = (mp_digit) (r & ((mp_word) MP_MASK));
--- a/bn_s_mp_sub.c
+++ b/bn_s_mp_sub.c
@ -18,9 +18,7 @@
 int
 s_mp_sub (mp_int * a, mp_int * b, mp_int * c)
 {
-  int       olduse, res, min, max, i;
-  mp_digit  u;
-
+  int     olduse, res, min, max;

  /* find sizes */
  min = b->used;
@ -37,36 +35,48 @@ s_mp_sub (mp_int * a, mp_int * b, mp_int * c)

  /* sub digits from lower part */

-  /* set carry to zero */
-  u = 0;
-  for (i = 0; i < min; i++) {
-    /* T[i] = A[i] - B[i] - U */
-    c->dp[i] = a->dp[i] - (b->dp[i] + u);
+  {
+    register mp_digit u, *tmpa, *tmpb, *tmpc;
+    register int i;

-    /* U = carry bit of T[i] */
-    u = (c->dp[i] >> DIGIT_BIT) & 1;
+    /* alias for digit pointers */
+    tmpa = a->dp;
+    tmpb = b->dp;
+    tmpc = c->dp;

-    /* Clear carry from T[i] */
-    c->dp[i] &= MP_MASK;
-  }
+    /* set carry to zero */
+    u = 0;
+    for (i = 0; i < min; i++) {
+      /* T[i] = A[i] - B[i] - U */
+      *tmpc = *tmpa++ - *tmpb++ - u;

-  /* now copy higher words if any, e.g. if A has more digits than B  */
-  if (min != max) {
-    for (; i < max; i++) {
-      /* T[i] = A[i] - U */
-      c->dp[i] = a->dp[i] - u;
-
-      /* U = carry bit of T[i] */
-      u = (c->dp[i] >> DIGIT_BIT) & 1;
+      /* U = carry bit of T[i] 
+       * Note this saves performing an AND operation since 
+       * if a carry does occur it will propagate all the way to the
+       * MSB.  As a result a single shift is required to get the carry
+       */
+      u = *tmpc >> (CHAR_BIT * sizeof (mp_digit) - 1);

      /* Clear carry from T[i] */
-      c->dp[i] &= MP_MASK;
+      *tmpc++ &= MP_MASK;
    }
-  }

-  /* clear digits above used (since we may not have grown result above) */
-  for (i = c->used; i < olduse; i++) {
-    c->dp[i] = 0;
+    /* now copy higher words if any, e.g. if A has more digits than B  */
+    for (; i < max; i++) {
+      /* T[i] = A[i] - U */
+      *tmpc = *tmpa++ - u;
+
+      /* U = carry bit of T[i] */
+      u = *tmpc >> (CHAR_BIT * sizeof (mp_digit) - 1);
+
+      /* Clear carry from T[i] */
+      *tmpc++ &= MP_MASK;
+    }
+
+    /* clear digits above used (since we may not have grown result above) */
+    for (i = c->used; i < olduse; i++) {
+      *tmpc++ = 0;
+    }
  }

  mp_clamp (c);
--- a/bncore.c
+++ b/bncore.c
@ -14,6 +14,6 @@
 */
 #include <tommath.h>

-int       KARATSUBA_MUL_CUTOFF = 80,	/* Min. number of digits before Karatsuba multiplication is used. */
-          KARATSUBA_SQR_CUTOFF = 80,	/* Min. number of digits before Karatsuba squaring is used. */
-          MONTGOMERY_EXPT_CUTOFF = 40;	/* max. number of digits that montgomery reductions will help for */
+int     KARATSUBA_MUL_CUTOFF = 80,	/* Min. number of digits before Karatsuba multiplication is used. */
+        KARATSUBA_SQR_CUTOFF = 80,	/* Min. number of digits before Karatsuba squaring is used. */
+        MONTGOMERY_EXPT_CUTOFF = 74;	/* max. number of digits that montgomery reductions will help for */
--- a/changes.txt
+++ b/changes.txt
@ -1,3 +1,9 @@
+Feb 13th, 2003
+v0.13  -- tons of minor speed-ups in low level add, sub, mul_2 and div_2 which propagate 
+          to other functions like mp_invmod, mp_div, etc...
+       -- Sped up mp_exptmod_fast by using new code to find R mod m [e.g. B^n mod m]
+       -- minor fixes
+
 Jan 17th, 2003
 v0.12  -- re-wrote the majority of the makefile so its more portable and will
          install via "make install" on most *nix platforms
--- a/demo/demo.c
+++ b/demo/demo.c
@ -76,7 +76,7 @@ int main(void)
 {
   mp_int a, b, c, d, e, f;
   unsigned long expt_n, add_n, sub_n, mul_n, div_n, sqr_n, mul2d_n, div2d_n, gcd_n, lcm_n, inv_n;
-   int rr;
+   unsigned rr;

 #ifdef TIMER
   int n;
@ -90,42 +90,20 @@ int main(void)
   mp_init(&e);
   mp_init(&f);

-#ifdef DEBUG
-   mp_read_radix(&a, "347743159439876626079252796797422223177535447388206607607181663903045907591201940478223621722118173270898487582987137708656414344685816179420855160986340457973820182883508387588163122354089264395604796675278966117567294812714812796820596564876450716066283126720010859041484786529056457896367683122960411136319", 10);
-   mp_read_radix(&b, "347743159439876626079252796797422223177535447388206607607181663903045907591201940478223621722118173270898487582987137708656414344685816179420855160986340457973820182883508387588163122354089264395604796675278966117567294812714812796820596564876450716066283126720010859041484786529056457896367683122960411136318", 10);
-   mp_set(&c, 1);
-   reset_timings();
-   mp_exptmod(&c, &b, &a, &d);
-   mp_exptmod(&c, &b, &a, &d);
-   mp_exptmod(&c, &b, &a, &d);
-   mp_exptmod(&c, &b, &a, &d);
-   mp_exptmod(&c, &b, &a, &d);
-   mp_exptmod(&c, &b, &a, &d);
-   mp_exptmod(&c, &b, &a, &d);
-   mp_exptmod(&c, &b, &a, &d);
-   mp_exptmod(&c, &b, &a, &d);
-   mp_exptmod(&c, &b, &a, &d);
-   mp_exptmod(&c, &b, &a, &d);
-   mp_exptmod(&c, &b, &a, &d);
-   mp_exptmod(&c, &b, &a, &d);
-   mp_exptmod(&c, &b, &a, &d);
-   mp_exptmod(&c, &b, &a, &d);
-   mp_exptmod(&c, &b, &a, &d);
-   dump_timings();
-   return 0;
-#endif

 #ifdef TIMER
+goto multtime;
+
      printf("CLOCKS_PER_SEC == %lu\n", CLOCKS_PER_SEC);
      mp_read_radix(&a, "340282366920938463463374607431768211455", 10);
      mp_read_radix(&b, "340282366920938463463574607431768211455", 10);
      while (a.used * DIGIT_BIT < 8192) {
         reset();
-         for (rr = 0; rr < 1000000; rr++) {
+         for (rr = 0; rr < 10000000; rr++) {
             mp_add(&a, &b, &c);
         }
         tt = rdtsc();
-         printf("Adding %d-bit took %f ticks\n", mp_count_bits(&a), (double)tt / ((double)1000000));
+         printf("Adding\t\t%4d-bit => %9llu/sec, %9llu ticks\n", mp_count_bits(&a), (((unsigned long long)rr)*CLOCKS_PER_SEC)/tt, tt);
         mp_sqr(&a, &a);
         mp_sqr(&b, &b);
      }
@ -134,37 +112,40 @@ int main(void)
      mp_read_radix(&b, "340282366920938463463574607431768211455", 10);
      while (a.used * DIGIT_BIT < 8192) {
         reset();
-         for (rr = 0; rr < 1000000; rr++) {
+         for (rr = 0; rr < 10000000; rr++) {
             mp_sub(&a, &b, &c);
         }
         tt = rdtsc();
-         printf("Subtracting %d-bit took %f ticks\n", mp_count_bits(&a), (double)tt / ((double)1000000));
+         printf("Subtracting\t%4d-bit => %9llu/sec, %9llu ticks\n", mp_count_bits(&a), (((unsigned long long)rr)*CLOCKS_PER_SEC)/tt, tt);
         mp_sqr(&a, &a);
         mp_sqr(&b, &b);
      }
+      
+multtime:      

   mp_read_radix(&a, "340282366920938463463374607431768211455", 10);
   while (a.used * DIGIT_BIT < 8192) {
      reset();
-      for (rr = 0; rr < 1000000; rr++) {
+      for (rr = 0; rr < 250000; rr++) {
          mp_sqr(&a, &b);
      }
      tt = rdtsc();
-      printf("Squaring %d-bit took %f ticks\n", mp_count_bits(&a), (double)tt / ((double)1000000));
+      printf("Squaring\t%4d-bit => %9llu/sec, %9llu ticks\n", mp_count_bits(&a), (((unsigned long long)rr)*CLOCKS_PER_SEC)/tt, tt);
      mp_copy(&b, &a);
   }
   
   mp_read_radix(&a, "340282366920938463463374607431768211455", 10);
   while (a.used * DIGIT_BIT < 8192) {
      reset();
-      for (rr = 0; rr < 1000000; rr++) {
+      for (rr = 0; rr < 250000; rr++) {
          mp_mul(&a, &a, &b);
      }
      tt = rdtsc();
-      printf("Multiplying %d-bit took %f ticks\n", mp_count_bits(&a), (double)tt / ((double)1000000));
+      printf("Multiplying\t%4d-bit => %9llu/sec, %9llu ticks\n", mp_count_bits(&a), (((unsigned long long)rr)*CLOCKS_PER_SEC)/tt, tt);
      mp_copy(&b, &a);
   }
-   
+
+expttime:  
   {
      char *primes[] = {
         "17933601194860113372237070562165128350027320072176844226673287945873370751245439587792371960615073855669274087805055507977323024886880985062002853331424203",
@ -180,7 +161,7 @@ int main(void)
      mp_read_radix(&a, primes[n], 10);
      mp_zero(&b);
      for (rr = 0; rr < mp_count_bits(&a); rr++) {
-         mp_mul_2d(&b, 1, &b);
+         mp_mul_2(&b, &b);
         b.dp[0] |= lbit();
         b.used  += 1;
      }
@ -188,7 +169,7 @@ int main(void)
      mp_mod(&b, &c, &b);
      mp_set(&c, 3);
      reset();
-      for (rr = 0; rr < 100; rr++) {
+      for (rr = 0; rr < 50; rr++) {
          mp_exptmod(&c, &b, &a, &d);
      }
      tt = rdtsc();
@ -201,16 +182,15 @@ int main(void)
         draw(&d);
         exit(0);
      }
-      printf("Exponentiating %d-bit took %f ticks\n", mp_count_bits(&a), (double)tt / ((double)100));
+      printf("Exponentiating\t%4d-bit => %9llu/sec, %9llu ticks\n", mp_count_bits(&a), (((unsigned long long)rr)*CLOCKS_PER_SEC)/tt, tt);
   }
   }   

-
   mp_read_radix(&a, "340282366920938463463374607431768211455", 10);
   mp_read_radix(&b, "234892374891378913789237289378973232333", 10);
   while (a.used * DIGIT_BIT < 8192) {
      reset();
-      for (rr = 0; rr < 100; rr++) {
+      for (rr = 0; rr < 10000; rr++) {
          mp_invmod(&b, &a, &c);
      }
      tt = rdtsc();
@ -219,7 +199,7 @@ int main(void)
         printf("Failed to invert\n");
         return 0;
      }
-      printf("Inverting mod %d-bit took %f ticks\n", mp_count_bits(&a), (double)tt / ((double)100));
+      printf("Inverting mod\t%4d-bit => %9llu/sec, %9llu ticks\n", mp_count_bits(&a), (((unsigned long long)rr)*CLOCKS_PER_SEC)/tt, tt);
      mp_sqr(&a, &a);
      mp_sqr(&b, &b);
   }
--- a/etc/makefile
+++ b/etc/makefile
@ -1,15 +1,20 @@
-CFLAGS += -Wall -W -Wshadow -O3 -fomit-frame-pointer -funroll-loops
+CFLAGS += -Wall -W -Wshadow -O3 -fomit-frame-pointer -funroll-loops -I../
+
+
+# default lib name (requires install with root)
+# LIBNAME=-ltommath
+
+# libname when you can't install the lib with install
+LIBNAME=../libtommath.a

 pprime: pprime.o
-	$(CC) pprime.o -ltommath -o pprime
+	$(CC) pprime.o $(LIBNAME) -o pprime

 tune: tune.o
-	$(CC) tune.o -ltommath -o tune
+	$(CC) tune.o $(LIBNAME) -o tune
        
 mersenne: mersenne.o
-	$(CC) mersenne.o -ltommath -o mersenne
+	$(CC) mersenne.o $(LIBNAME) -o mersenne
        
 clean:
-	rm -f *.o pprime tune mersenne
-
-
+	rm -f *.o *.exe pprime tune mersenne 
--- a/etc/pprime.c
+++ b/etc/pprime.c
@ -5,7 +5,7 @@
 * Tom St Denis, tomstdenis@iahu.ca, http://tom.iahu.ca
 */
 #include <time.h>
-#include "bn.h"
+#include "tommath.h"

 /* fast square root */
 static    mp_digit
--- a/etc/tune.c
+++ b/etc/tune.c
@ -18,9 +18,9 @@ time_mult (void)

  t1 = clock ();
  for (x = 8; x <= 128; x += 8) {
-    mp_rand (&a, x);
-    mp_rand (&b, x);
-    for (y = 0; y < 10000; y++) {
+    for (y = 0; y < 1000; y++) {
+      mp_rand (&a, x);
+      mp_rand (&b, x);
      mp_mul (&a, &b, &c);
    }
  }
@ -42,8 +42,8 @@ time_sqr (void)

  t1 = clock ();
  for (x = 8; x <= 128; x += 8) {
-    mp_rand (&a, x);
-    for (y = 0; y < 10000; y++) {
+    for (y = 0; y < 1000; y++) {
+      mp_rand (&a, x);
      mp_sqr (&a, &b);
    }
  }
--- a/gen.pl
+++ b/gen.pl
@ -0,0 +1,27 @@
+#!/usr/bin/perl
+#
+#Generates a "single file" you can use to quickly add the whole source 
+#without any makefile troubles
+#
+
+opendir(DIR,".");
+@files = readdir(DIR);
+closedir(DIR);
+
+open(OUT,">mpi.c");
+print OUT "/* File Generated Automatically by gen.pl */\n\n";
+for (@files) {
+   if ($_ =~ /\.c/ && !($_ =~ /mpi\.c/)) {
+      $fname = $_;
+      open(SRC,"<$fname");
+      print OUT "/* Start: $fname */\n";
+      while (<SRC>) {
+         print OUT $_;
+      }
+      close(SRC);
+      print OUT "\n/* End: $fname */\n\n";
+   }
+}
+print OUT "\n/* EOF */\n";
+close(OUT);
+   
--- a/ltmtest.exe
+++ b/ltmtest.exe
--- a/8
+++ b/8
@ -1,6 +1,6 @@
 CFLAGS  +=  -I./ -Wall -W -Wshadow -O3 -fomit-frame-pointer -funroll-loops

-VERSION=0.12
+VERSION=0.13

 default: libtommath.a

@ -30,7 +30,7 @@ bn_mp_reduce.o bn_mp_montgomery_setup.o bn_fast_mp_montgomery_reduce.o bn_mp_mon
 bn_mp_exptmod_fast.o bn_mp_exptmod.o bn_mp_2expt.o bn_mp_n_root.o bn_mp_jacobi.o bn_reverse.o \
 bn_mp_count_bits.o bn_mp_read_unsigned_bin.o bn_mp_read_signed_bin.o bn_mp_to_unsigned_bin.o \
 bn_mp_to_signed_bin.o bn_mp_unsigned_bin_size.o bn_mp_signed_bin_size.o bn_radix.o \
-bn_mp_xor.o bn_mp_and.o bn_mp_or.o bn_mp_rand.o
+bn_mp_xor.o bn_mp_and.o bn_mp_or.o bn_mp_rand.o bn_mp_montgomery_calc_normalization.o

 libtommath.a:  $(OBJECTS)
 	$(AR) $(ARFLAGS) libtommath.a $(OBJECTS)
@ -60,8 +60,8 @@ docs:	docdvi
 	rm -f bn.log bn.aux bn.dvi
 	
 clean:
-	rm -f *.pdf *.o *.a etclib/*.o demo/demo.o test ltmtest mpitest mtest/mtest \
-        bn.log bn.aux bn.dvi *.log *.s
+	rm -f *.pdf *.o *.a *.exe etclib/*.o demo/demo.o test ltmtest mpitest mtest/mtest mtest/mtest.exe \
+        bn.log bn.aux bn.dvi *.log *.s mpi.c 
 	cd etc ; make clean

 zipup: clean docs
--- a/mpitest.exe
+++ b/mpitest.exe
--- a/mtest/mtest.c
+++ b/mtest/mtest.c
@ -89,7 +89,7 @@ int main(void)
   }

   for (;;) {
-       n = fgetc(rng) % 11;
+       n = 4; // fgetc(rng) % 11;

   if (n == 0) {
       /* add tests */
--- a/timings.txt
+++ b/timings.txt
@ -1,39 +1,36 @@
-CLOCKS_PER_SEC == 1000000
-Adding 128-bit took 0.070000 ticks
-Adding 256-bit took 0.100000 ticks
-Adding 512-bit took 0.140000 ticks
-Adding 1024-bit took 0.210000 ticks
-Adding 2048-bit took 0.360000 ticks
-Adding 4096-bit took 0.670000 ticks
-Subtracting 128-bit took 0.090000 ticks
-Subtracting 256-bit took 0.120000 ticks
-Subtracting 512-bit took 0.140000 ticks
-Subtracting 1024-bit took 0.210000 ticks
-Subtracting 2048-bit took 0.330000 ticks
-Subtracting 4096-bit took 0.580000 ticks
-Squaring 128-bit took 0.320000 ticks
-Squaring 256-bit took 0.620000 ticks
-Squaring 512-bit took 1.410000 ticks
-Squaring 1024-bit took 3.730000 ticks
-Squaring 2048-bit took 11.580000 ticks
-Squaring 4096-bit took 44.540000 ticks
-Multiplying 128-bit took 0.270000 ticks
-Multiplying 256-bit took 0.650000 ticks
-Multiplying 512-bit took 1.630000 ticks
-Multiplying 1024-bit took 5.180000 ticks
-Multiplying 2048-bit took 19.210000 ticks
-Multiplying 4096-bit took 67.500000 ticks
-Exponentiating 513-bit took 2000.000000 ticks
-Exponentiating 769-bit took 5200.000000 ticks
-Exponentiating 1025-bit took 11400.000000 ticks
-Exponentiating 2049-bit took 75100.000000 ticks
-Exponentiating 2561-bit took 150000.000000 ticks
-Exponentiating 3073-bit took 237800.000000 ticks
-Exponentiating 4097-bit took 510600.000000 ticks
-Inverting mod 128-bit took 0.000000 ticks
-Inverting mod 256-bit took 200.000000 ticks
-Inverting mod 512-bit took 300.000000 ticks
-Inverting mod 1024-bit took 800.000000 ticks
-Inverting mod 2048-bit took 2500.000000 ticks
-Inverting mod 4096-bit took 8400.000000 ticks
-
+CLOCKS_PER_SEC == 1000
+Adding           128-bit =>  14534883/sec,       688 ticks
+Adding           256-bit =>  11037527/sec,       906 ticks
+Adding           512-bit =>   8650519/sec,      1156 ticks
+Adding          1024-bit =>   5871990/sec,      1703 ticks
+Adding          2048-bit =>   3575259/sec,      2797 ticks
+Adding          4096-bit =>   2018978/sec,      4953 ticks
+Subtracting      128-bit =>  11025358/sec,       907 ticks
+Subtracting      256-bit =>   9149130/sec,      1093 ticks
+Subtracting      512-bit =>   7440476/sec,      1344 ticks
+Subtracting     1024-bit =>   5078720/sec,      1969 ticks
+Subtracting     2048-bit =>   3168567/sec,      3156 ticks
+Subtracting     4096-bit =>   1833852/sec,      5453 ticks
+Squaring         128-bit =>   3205128/sec,        78 ticks
+Squaring         256-bit =>   1592356/sec,       157 ticks
+Squaring         512-bit =>    696378/sec,       359 ticks
+Squaring        1024-bit =>    266808/sec,       937 ticks
+Squaring        2048-bit =>     85999/sec,      2907 ticks
+Squaring        4096-bit =>     21949/sec,     11390 ticks
+Multiplying      128-bit =>   3205128/sec,        78 ticks
+Multiplying      256-bit =>   1592356/sec,       157 ticks
+Multiplying      512-bit =>    615763/sec,       406 ticks
+Multiplying     1024-bit =>    192752/sec,      1297 ticks
+Multiplying     2048-bit =>     53510/sec,      4672 ticks
+Multiplying     4096-bit =>     14801/sec,     16890 ticks
+Exponentiating   513-bit =>       531/sec,        47 ticks
+Exponentiating   769-bit =>       177/sec,       141 ticks
+Exponentiating  1025-bit =>        88/sec,       282 ticks
+Exponentiating  2049-bit =>        13/sec,      1890 ticks
+Exponentiating  2561-bit =>         6/sec,      3812 ticks
+Exponentiating  3073-bit =>         4/sec,      6031 ticks
+Exponentiating  4097-bit =>         1/sec,     12843 ticks
+Inverting mod    128-bit =>     19160/sec,      5219 ticks
+Inverting mod    256-bit =>      8290/sec,     12062 ticks
+Inverting mod    512-bit =>      3565/sec,     28047 ticks
+Inverting mod   1024-bit =>      1305/sec,     76594 ticks
--- a/timings2.txt
+++ b/timings2.txt
@ -0,0 +1,36 @@
+CLOCKS_PER_SEC == 1000
+Adding           128-bit =>  15600624/sec,       641 ticks
+Adding           256-bit =>  12804097/sec,       781 ticks
+Adding           512-bit =>  10000000/sec,      1000 ticks
+Adding          1024-bit =>   7032348/sec,      1422 ticks
+Adding          2048-bit =>   4076640/sec,      2453 ticks
+Adding          4096-bit =>   2424242/sec,      4125 ticks
+Subtracting      128-bit =>  10845986/sec,       922 ticks
+Subtracting      256-bit =>   9416195/sec,      1062 ticks
+Subtracting      512-bit =>   7710100/sec,      1297 ticks
+Subtracting     1024-bit =>   5159958/sec,      1938 ticks
+Subtracting     2048-bit =>   3299241/sec,      3031 ticks
+Subtracting     4096-bit =>   1987676/sec,      5031 ticks
+Squaring         128-bit =>   3205128/sec,        78 ticks
+Squaring         256-bit =>   1592356/sec,       157 ticks
+Squaring         512-bit =>    696378/sec,       359 ticks
+Squaring        1024-bit =>    266524/sec,       938 ticks
+Squaring        2048-bit =>     86505/sec,      2890 ticks
+Squaring        4096-bit =>     22471/sec,     11125 ticks
+Multiplying      128-bit =>   3205128/sec,        78 ticks
+Multiplying      256-bit =>   1592356/sec,       157 ticks
+Multiplying      512-bit =>    615763/sec,       406 ticks
+Multiplying     1024-bit =>    190548/sec,      1312 ticks
+Multiplying     2048-bit =>     54418/sec,      4594 ticks
+Multiplying     4096-bit =>     14897/sec,     16781 ticks
+Exponentiating   513-bit =>       531/sec,        47 ticks
+Exponentiating   769-bit =>       177/sec,       141 ticks
+Exponentiating  1025-bit =>        84/sec,       297 ticks
+Exponentiating  2049-bit =>        13/sec,      1875 ticks
+Exponentiating  2561-bit =>         6/sec,      3766 ticks
+Exponentiating  3073-bit =>         4/sec,      6000 ticks
+Exponentiating  4097-bit =>         1/sec,     12750 ticks
+Inverting mod    128-bit =>     17301/sec,       578 ticks
+Inverting mod    256-bit =>      8103/sec,      1234 ticks
+Inverting mod    512-bit =>      3422/sec,      2922 ticks
+Inverting mod   1024-bit =>      1330/sec,      7516 ticks
--- a/timings3.txt
+++ b/timings3.txt
@ -0,0 +1,5 @@
+Exponentiating   513-bit =>       531/sec,        94 ticks
+Exponentiating   769-bit =>       187/sec,       266 ticks
+Exponentiating  1025-bit =>        88/sec,       562 ticks
+Exponentiating  2049-bit =>        13/sec,      3719 ticks
+
--- a/tommath.h
+++ b/tommath.h
@ -289,6 +289,11 @@ int mp_reduce(mp_int *a, mp_int *b, mp_int *c);
 /* setups the montgomery reduction */
 int mp_montgomery_setup(mp_int *a, mp_digit *mp);

+/* computes a = B^n mod b without division or multiplication useful for 
+ * normalizing numbers in a Montgomery system.
+ */
+int mp_montgomery_calc_normalization(mp_int *a, mp_int *b);
+
 /* computes xR^-1 == x (mod N) via Montgomery Reduction */
 int mp_montgomery_reduce(mp_int *a, mp_int *m, mp_digit mp);

@ -343,6 +348,5 @@ void bn_reverse(unsigned char *s, int len);
   }
 #endif

-
 #endif