added libtommath-0.26

2003-08-29 14:06:56 +00:00 · 2003-08-29 14:06:56 +00:00 · 6e732340a9
commit 6e732340a9
parent c1da6aa2de
42 changed files with 725 additions and 224 deletions
--- a/bn.pdf
+++ b/bn.pdf
--- a/bn.tex
+++ b/bn.tex
@ -1,7 +1,7 @@
 \documentclass[]{article}
 \begin{document}

-\title{LibTomMath v0.25 \\ A Free Multiple Precision Integer Library \\ http://math.libtomcrypt.org }
+\title{LibTomMath v0.26 \\ A Free Multiple Precision Integer Library \\ http://math.libtomcrypt.org }
 \author{Tom St Denis \\ tomstdenis@iahu.ca}
 \maketitle
 \newpage
--- a/bn_fast_mp_montgomery_reduce.c
+++ b/bn_fast_mp_montgomery_reduce.c
@ -45,7 +45,10 @@ fast_mp_montgomery_reduce (mp_int * x, mp_int * n, mp_digit rho)
    register mp_word *_W;
    register mp_digit *tmpx;

-    _W = W;
+    /* alias for the W[] array */
+    _W   = W;
+
+    /* alias for the digits of  x*/
    tmpx = x->dp;

    /* copy the digits of a into W[0..a->used-1] */
--- a/bn_mp_add_d.c
+++ b/bn_mp_add_d.c
@ -42,7 +42,6 @@ mp_add_d (mp_int * a, mp_digit b, mp_int * c)
     return res;
  }

-
  /* old number of used digits in c */
  oldused = c->used;

@ -76,7 +75,6 @@ mp_add_d (mp_int * a, mp_digit b, mp_int * c)
     /* set final carry */
     ix++;
     *tmpc++  = mu;
-
  } else {
     /* a was negative and |a| < b */
     c->used  = 1;
--- a/bn_mp_cmp.c
+++ b/bn_mp_cmp.c
@ -19,12 +19,12 @@ int
 mp_cmp (mp_int * a, mp_int * b)
 {
  /* compare based on sign */
-  if (a->sign == MP_NEG && b->sign == MP_ZPOS) {
-    return MP_LT;
-  } 
-  
-  if (a->sign == MP_ZPOS && b->sign == MP_NEG) {
-    return MP_GT;
+  if (a->sign != b->sign) {
+     if (a->sign == MP_NEG) {
+        return MP_LT;
+     } else {
+        return MP_GT;
+     }
  }
  
  /* compare digits */
--- a/bn_mp_cmp_mag.c
+++ b/bn_mp_cmp_mag.c
@ -19,23 +19,30 @@ int
 mp_cmp_mag (mp_int * a, mp_int * b)
 {
  int     n;
+  mp_digit *tmpa, *tmpb;

  /* compare based on # of non-zero digits */
  if (a->used > b->used) {
    return MP_GT;
-  } 
+  }
  
  if (a->used < b->used) {
    return MP_LT;
  }

+  /* alias for a */
+  tmpa = a->dp + (a->used - 1);
+
+  /* alias for b */
+  tmpb = b->dp + (a->used - 1);
+
  /* compare based on digits  */
-  for (n = a->used - 1; n >= 0; n--) {
-    if (a->dp[n] > b->dp[n]) {
+  for (n = 0; n < a->used; ++n, --tmpa, --tmpb) {
+    if (*tmpa > *tmpb) {
      return MP_GT;
-    } 
-    
-    if (a->dp[n] < b->dp[n]) {
+    }
+
+    if (*tmpa < *tmpb) {
      return MP_LT;
    }
  }
--- a/bn_mp_div.c
+++ b/bn_mp_div.c
@ -111,8 +111,9 @@ mp_div (mp_int * a, mp_int * b, mp_int * c, mp_int * d)

  /* step 3. for i from n down to (t + 1) */
  for (i = n; i >= (t + 1); i--) {
-    if (i > x.used)
+    if (i > x.used) {
      continue;
+    }

    /* step 3.1 if xi == yt then set q{i-t-1} to b-1, 
     * otherwise set q{i-t-1} to (xi*b + x{i-1})/yt */
--- a/bn_mp_dr_reduce.c
+++ b/bn_mp_dr_reduce.c
@ -25,6 +25,8 @@
 * The modulus must be of a special format [see manual]
 *
 * Has been modified to use algorithm 7.10 from the LTM book instead
+ *
+ * Input x must be in the range 0 <= x <= (n-1)^2
 */
 int
 mp_dr_reduce (mp_int * x, mp_int * n, mp_digit k)
@ -63,10 +65,10 @@ top:
      *tmpx1++  = (mp_digit)(r & MP_MASK);
      mu        = (mp_digit)(r >> ((mp_word)DIGIT_BIT));
  }
-  
+
  /* set final carry */
  *tmpx1++ = mu;
-  
+
  /* zero words above m */
  for (i = m + 1; i < x->used; i++) {
      *tmpx1++ = 0;
--- a/bn_mp_init_multi.c
+++ b/bn_mp_init_multi.c
@ -49,4 +49,5 @@ int mp_init_multi(mp_int *mp, ...)
    }
    va_end(args);
    return res;                /* Assumed ok, if error flagged above. */
-}
+}
+
--- a/bn_mp_lcm.c
+++ b/bn_mp_lcm.c
@ -14,29 +14,42 @@
 */
 #include <tommath.h>

-/* computes least common multiple as a*b/(a, b) */
+/* computes least common multiple as |a*b|/(a, b) */
 int
 mp_lcm (mp_int * a, mp_int * b, mp_int * c)
 {
  int     res;
-  mp_int  t;
+  mp_int  t1, t2;


-  if ((res = mp_init (&t)) != MP_OKAY) {
+  if ((res = mp_init_multi (&t1, &t2, NULL)) != MP_OKAY) {
    return res;
  }

-  if ((res = mp_mul (a, b, &t)) != MP_OKAY) {
-    mp_clear (&t);
-    return res;
+  /* t1 = get the GCD of the two inputs */
+  if ((res = mp_gcd (a, b, &t1)) != MP_OKAY) {
+    goto __T;
  }

-  if ((res = mp_gcd (a, b, c)) != MP_OKAY) {
-    mp_clear (&t);
-    return res;
+  /* divide the smallest by the GCD */
+  if (mp_cmp_mag(a, b) == MP_LT) {
+     /* store quotient in t2 such that t2 * b is the LCM */
+     if ((res = mp_div(a, &t1, &t2, NULL)) != MP_OKAY) {
+        goto __T;
+     }
+     res = mp_mul(b, &t2, c);
+  } else {
+     /* store quotient in t2 such that t2 * a is the LCM */
+     if ((res = mp_div(b, &t1, &t2, NULL)) != MP_OKAY) {
+        goto __T;
+     }
+     res = mp_mul(a, &t2, c);
  }

-  res = mp_div (&t, c, c, NULL);
-  mp_clear (&t);
+  /* fix the sign to positive */
+  c->sign = MP_ZPOS;
+
+__T:
+  mp_clear_multi (&t1, &t2, NULL);
  return res;
 }
--- a/bn_mp_montgomery_reduce.c
+++ b/bn_mp_montgomery_reduce.c
@ -43,7 +43,14 @@ mp_montgomery_reduce (mp_int * x, mp_int * n, mp_digit rho)
  x->used = digs;

  for (ix = 0; ix < n->used; ix++) {
-    /* mu = ai * m' mod b */
+    /* mu = ai * rho mod b
+     *
+     * The value of rho must be precalculated via
+     * bn_mp_montgomery_setup() such that
+     * it equals -1/n0 mod b this allows the
+     * following inner loop to reduce the
+     * input one digit at a time
+     */
    mu = ((mp_word)x->dp[ix]) * ((mp_word)rho) & MP_MASK;

    /* a = a + mu * m * b**i */
@ -52,8 +59,10 @@ mp_montgomery_reduce (mp_int * x, mp_int * n, mp_digit rho)
      register mp_digit *tmpn, *tmpx, u;
      register mp_word r;

-      /* aliases */
+      /* alias for digits of the modulus */
      tmpn = n->dp;
+
+      /* alias for the digits of x [the input] */
      tmpx = x->dp + ix;

      /* set the carry to zero */
@ -61,12 +70,20 @@ mp_montgomery_reduce (mp_int * x, mp_int * n, mp_digit rho)
      
      /* Multiply and add in place */
      for (iy = 0; iy < n->used; iy++) {
+        /* compute product and sum */
        r       = ((mp_word)mu) * ((mp_word)*tmpn++) +
                  ((mp_word) u) + ((mp_word) * tmpx);
+
+        /* get carry */
        u       = (mp_digit)(r >> ((mp_word) DIGIT_BIT));
+
+        /* fix digit */
        *tmpx++ = (mp_digit)(r & ((mp_word) MP_MASK));
      }
-      /* propagate carries */
+      /* At this point the ix'th digit of x should be zero */
+
+
+      /* propagate carries upwards as required*/
      while (u) {
        *tmpx   += u;
        u        = *tmpx >> DIGIT_BIT;
@ -75,11 +92,18 @@ mp_montgomery_reduce (mp_int * x, mp_int * n, mp_digit rho)
    }
  }

+  /* at this point the n.used'th least
+   * significant digits of x are all zero
+   * which means we can shift x to the
+   * right by n.used digits and the
+   * residue is unchanged.
+   */
+
  /* x = x/b**n.used */
  mp_clamp(x);
  mp_rshd (x, n->used);

-  /* if A >= m then A = A - m */
+  /* if x >= n then x = x - n */
  if (mp_cmp_mag (x, n) != MP_LT) {
    return s_mp_sub (x, n, x);
  }
--- a/bn_mp_neg.c
+++ b/bn_mp_neg.c
@ -22,6 +22,8 @@ mp_neg (mp_int * a, mp_int * b)
  if ((res = mp_copy (a, b)) != MP_OKAY) {
    return res;
  }
-  b->sign = (a->sign == MP_ZPOS) ? MP_NEG : MP_ZPOS;
+  if (mp_iszero(b) != 1) {
+     b->sign = (a->sign == MP_ZPOS) ? MP_NEG : MP_ZPOS;
+  }
  return MP_OKAY;
 }
--- a/bn_mp_read_signed_bin.c
+++ b/bn_mp_read_signed_bin.c
@ -20,9 +20,17 @@ mp_read_signed_bin (mp_int * a, unsigned char *b, int c)
 {
  int     res;

+  /* read magnitude */
  if ((res = mp_read_unsigned_bin (a, b + 1, c - 1)) != MP_OKAY) {
    return res;
  }
-  a->sign = ((b[0] == (unsigned char) 0) ? MP_ZPOS : MP_NEG);
+
+  /* first byte is 0 for positive, non-zero for negative */
+  if (b[0] == 0) {
+     a->sign = MP_ZPOS;
+  } else {
+     a->sign = MP_NEG;
+  }
+
  return MP_OKAY;
 }
--- a/bn_mp_read_unsigned_bin.c
+++ b/bn_mp_read_unsigned_bin.c
@ -19,7 +19,18 @@ int
 mp_read_unsigned_bin (mp_int * a, unsigned char *b, int c)
 {
  int     res;
+
+  /* make sure there are at least two digits */
+  if (a->alloc < 2) {
+     if ((res = mp_grow(a, 2)) != MP_OKAY) {
+        return res;
+     }
+  }
+
+  /* zero the int */
  mp_zero (a);
+
+  /* read the bytes in */
  while (c-- > 0) {
    if ((res = mp_mul_2d (a, 8, a)) != MP_OKAY) {
      return res;
--- a/bn_mp_set_int.c
+++ b/bn_mp_set_int.c
@ -21,6 +21,7 @@ mp_set_int (mp_int * a, unsigned int b)
  int     x, res;

  mp_zero (a);
+  
  /* set four bits at a time */
  for (x = 0; x < 8; x++) {
    /* shift the number up four bits */
--- a/bn_s_mp_mul_digs.c
+++ b/bn_s_mp_mul_digs.c
@ -61,15 +61,15 @@ s_mp_mul_digs (mp_int * a, mp_int * b, mp_int * c, int digs)
    /* compute the columns of the output and propagate the carry */
    for (iy = 0; iy < pb; iy++) {
      /* compute the column as a mp_word */
-      r = ((mp_word) *tmpt) + 
-          ((mp_word)tmpx) * ((mp_word)*tmpy++) +
-          ((mp_word) u);
+      r       = ((mp_word)*tmpt) +
+                ((mp_word)tmpx) * ((mp_word)*tmpy++) +
+                ((mp_word) u);

      /* the new column is the lower part of the result */
      *tmpt++ = (mp_digit) (r & ((mp_word) MP_MASK));

      /* get the carry word from the result */
-      u = (mp_digit) (r >> ((mp_word) DIGIT_BIT));
+      u       = (mp_digit) (r >> ((mp_word) DIGIT_BIT));
    }
    /* set carry if it is placed below digs */
    if (ix + iy < digs) {
--- a/bn_s_mp_mul_high_digs.c
+++ b/bn_s_mp_mul_high_digs.c
@ -26,7 +26,6 @@ s_mp_mul_high_digs (mp_int * a, mp_int * b, mp_int * c, int digs)
  mp_word r;
  mp_digit tmpx, *tmpt, *tmpy;

-
  /* can we use the fast multiplier? */
  if (((a->used + b->used + 1) < MP_WARRAY)
      && MIN (a->used, b->used) < (1 << ((CHAR_BIT * sizeof (mp_word)) - (2 * DIGIT_BIT)))) {
@ -55,13 +54,15 @@ s_mp_mul_high_digs (mp_int * a, mp_int * b, mp_int * c, int digs)

    for (iy = digs - ix; iy < pb; iy++) {
      /* calculate the double precision result */
-      r = ((mp_word) * tmpt) + ((mp_word)tmpx) * ((mp_word)*tmpy++) + ((mp_word) u);
+      r       = ((mp_word)*tmpt) +
+                ((mp_word)tmpx) * ((mp_word)*tmpy++) +
+                ((mp_word) u);

      /* get the lower part */
      *tmpt++ = (mp_digit) (r & ((mp_word) MP_MASK));

      /* carry the carry */
-      u = (mp_digit) (r >> ((mp_word) DIGIT_BIT));
+      u       = (mp_digit) (r >> ((mp_word) DIGIT_BIT));
    }
    *tmpt = u;
  }
--- a/bn_s_mp_sqr.c
+++ b/bn_s_mp_sqr.c
@ -54,19 +54,19 @@ s_mp_sqr (mp_int * a, mp_int * b)
      /* now calculate the double precision result, note we use
       * addition instead of *2 since it's easier to optimize
       */
-      r = ((mp_word) *tmpt) + r + r + ((mp_word) u);
+      r       = ((mp_word) *tmpt) + r + r + ((mp_word) u);

      /* store lower part */
      *tmpt++ = (mp_digit) (r & ((mp_word) MP_MASK));

      /* get carry */
-      u = (mp_digit)(r >> ((mp_word) DIGIT_BIT));
+      u       = (mp_digit)(r >> ((mp_word) DIGIT_BIT));
    }
    /* propagate upwards */
    while (u != ((mp_digit) 0)) {
-      r = ((mp_word) * tmpt) + ((mp_word) u);
+      r       = ((mp_word) *tmpt) + ((mp_word) u);
      *tmpt++ = (mp_digit) (r & ((mp_word) MP_MASK));
-      u = (mp_digit)(r >> ((mp_word) DIGIT_BIT));
+      u       = (mp_digit)(r >> ((mp_word) DIGIT_BIT));
    }
  }

--- a/changes.txt
+++ b/changes.txt
@ -1,3 +1,13 @@
+Aug 29th, 2003
+v0.26  -- Fixed typo that caused warning with GCC 3.2
+       -- Martin Marcel noticed a bug in mp_neg() that allowed negative zeroes.  
+          Also, Martin is the fellow who noted the bugs in mp_gcd() of 0.24/0.25.
+       -- Martin Marcel noticed an optimization [and slight bug] in mp_lcm().
+       -- Added fix to mp_read_unsigned_bin to prevent a buffer overflow.
+       -- Beefed up the comments in the baseline multipliers [and montgomery]
+       -- Added "mont" demo to the makefile.msvc in etc/
+       -- Optimized sign compares in mp_cmp from 4 to 2 cases.
+
 Aug 4th, 2003
 v0.25  -- Fix to mp_gcd again... oops (0,-a) == (-a, 0) == a
       -- Fix to mp_clear which didn't reset the sign  [Greg Rose]
--- a/changes.txt~
+++ b/changes.txt~
@ -0,0 +1,244 @@
+Aug 11th, 2003
+v0.26  -- Fixed typo that caused warning with GCC 3.2
+       -- Martin Marcel noticed a bug in mp_neg() that allowed negative zeroes.  
+          Also, Martin is the fellow who noted the bugs in mp_gcd() of 0.24/0.25.
+       -- Martin Marcel noticed an optimization [and slight bug] in mp_lcm().
+       -- Added fix to mp_read_unsigned_bin to prevent a buffer overflow.
+       -- Beefed up the comments in the baseline multipliers [and montgomery]
+       -- Added "mont" demo to the makefile.msvc in etc/
+       -- Optimized sign compares in mp_cmp from 4 to 2 cases.
+
+Aug 4th, 2003
+v0.25  -- Fix to mp_gcd again... oops (0,-a) == (-a, 0) == a
+       -- Fix to mp_clear which didn't reset the sign  [Greg Rose]
+       -- Added mp_error_to_string() to convert return codes to strings.  [Greg Rose]
+       -- Optimized fast_mp_invmod() to do the test for invalid inputs [both even]
+          first so temps don't have to be initialized if it's going to fail.
+       -- Optimized mp_gcd() by removing mp_div_2d calls for when one of the inputs
+          is odd.
+       -- Tons of new comments, some indentation fixups, etc.
+       -- mp_jacobi() returns MP_VAL if the modulus is less than or equal to zero.
+       -- fixed two typos in the header of each file :-)
+       -- LibTomMath is officially Public Domain [see LICENSE]
+
+July 15th, 2003
+v0.24  -- Optimized mp_add_d and mp_sub_d to not allocate temporary variables
+       -- Fixed mp_gcd() so the gcd of 0,0 is 0.  Allows the gcd operation to be chained
+          e.g. (0,0,a) == a [instead of 1]
+       -- Should be one of the last release for a while.  Working on LibTomMath book now.
+       -- optimized the pprime demo [/etc/pprime.c] to first make a huge table of single
+          digit primes then it reads them randomly instead of randomly choosing/testing single
+          digit primes.
+
+July 12th, 2003
+v0.23  -- Optimized mp_prime_next_prime() to not use mp_mod [via is_divisible()] in each
+          iteration.  Instead now a smaller table is kept of the residues which can be updated
+          without division.
+       -- Fixed a bug in next_prime() where an input of zero would be treated as odd and
+          have two added to it [to move to the next odd].
+       -- fixed a bug in prime_fermat() and prime_miller_rabin() which allowed the base
+          to be negative, zero or one.  Normally the test is only valid if the base is
+          greater than one.
+       -- changed the next_prime() prototype to accept a new parameter "bbs_style" which
+          will find the next prime congruent to 3 mod 4.  The default [bbs_style==0] will
+          make primes which are either congruent to 1 or 3 mod 4.
+       -- fixed mp_read_unsigned_bin() so that it doesn't include both code for
+          the case DIGIT_BIT < 8 and >= 8
+       -- optimized div_d() to easy out on division by 1 [or if a == 0] and use
+          logical shifts if the divisor is a power of two.
+       -- the default DIGIT_BIT type was not int for non-default builds.  Fixed.
+
+July 2nd, 2003
+v0.22  -- Fixed up mp_invmod so the result is properly in range now [was always congruent to the inverse...]
+       -- Fixed up s_mp_exptmod and mp_exptmod_fast so the lower half of the pre-computed table isn't allocated
+          which makes the algorithm use half as much ram.
+       -- Fixed the install script not to make the book :-) [which isn't included anyways]
+       -- added mp_cnt_lsb() which counts how many of the lsbs are zero
+       -- optimized mp_gcd() to use the new mp_cnt_lsb() to replace multiple divisions by two by a single division.
+       -- applied similar optimization to mp_prime_miller_rabin().
+       -- Fixed a bug in both mp_invmod() and fast_mp_invmod() which tested for odd
+          via "mp_iseven() == 0" which is not valid [since zero is not even either].
+
+June 19th, 2003
+v0.21  -- Fixed bug in mp_mul_d which would not handle sign correctly [would not always forward it]
+       -- Removed the #line lines from gen.pl [was in violation of ISO C]       
+
+June 8th, 2003
+v0.20  -- Removed the book from the package.  Added the TDCAL license document.  
+       -- This release is officially pure-bred TDCAL again [last officially TDCAL based release was v0.16]
+
+June 6th, 2003
+v0.19  -- Fixed a bug in mp_montgomery_reduce() which was introduced when I tweaked mp_rshd() in the previous release.
+          Essentially the digits were not trimmed before the compare which cause a subtraction to occur all the time.
+       -- Fixed up etc/tune.c a bit to stop testing new cutoffs after 16 failures [to find more optimal points]. 
+          Brute force ho!
+          
+
+May 29th, 2003
+v0.18  -- Fixed a bug in s_mp_sqr which would handle carries properly just not very elegantly.
+          (e.g. correct result, just bad looking code)
+       -- Fixed bug in mp_sqr which still had a 512 constant instead of MP_WARRAY
+       -- Added Toom-Cook multipliers [needs tuning!]
+       -- Added efficient divide by 3 algorithm mp_div_3
+       -- Re-wrote mp_div_d to be faster than calling mp_div 
+       -- Added in a donated BCC makefile and a single page LTM poster (ahalhabsi@sbcglobal.net)
+       -- Added mp_reduce_2k which reduces an input modulo n = 2**p - k for any single digit k
+       -- Made the exptmod system be aware of the 2k reduction algorithms.
+       -- Rewrote mp_dr_reduce to be smaller, simpler and easier to understand.
+
+May 17th, 2003
+v0.17  -- Benjamin Goldberg submitted optimized mp_add and mp_sub routines.  A new gen.pl as well
+          as several smaller suggestions.  Thanks!
+       -- removed call to mp_cmp in inner loop of mp_div and put mp_cmp_mag in its place :-)
+       -- Fixed bug in mp_exptmod that would cause it to fail for odd moduli when DIGIT_BIT != 28
+       -- mp_exptmod now also returns errors if the modulus is negative and will handle negative exponents
+       -- mp_prime_is_prime will now return true if the input is one of the primes in the prime table
+       -- Damian M Gryski (dgryski@uwaterloo.ca) found a index out of bounds error in the 
+          mp_fast_s_mp_mul_high_digs function which didn't come up before.  (fixed) 
+       -- Refactored the DR reduction code so there is only one function per file.
+       -- Fixed bug in the mp_mul() which would erroneously avoid the faster multiplier [comba] when it was
+          allowed.  The bug would not cause the incorrect value to be produced just less efficient (fixed)
+       -- Fixed similar bug in the Montgomery reduction code.
+       -- Added tons of (mp_digit) casts so the 7/15/28/31 bit digit code will work flawlessly out of the box. 
+          Also added limited support for 64-bit machines with a 60-bit digit.  Both thanks to Tom Wu (tom@arcot.com)
+       -- Added new comments here and there, cleaned up some code [style stuff]
+       -- Fixed a lingering typo in mp_exptmod* that would set bitcnt to zero then one.  Very silly stuff :-)
+       -- Fixed up mp_exptmod_fast so it would set "redux" to the comba Montgomery reduction if allowed.  This
+          saves quite a few calls and if statements.
+       -- Added etc/mont.c a test of the Montgomery reduction [assuming all else works :-| ]
+       -- Fixed up etc/tune.c to use a wider test range [more appropriate] also added a x86 based addition which
+          uses RDTSC for high precision timing.  
+       -- Updated demo/demo.c to remove MPI stuff [won't work anyways], made the tests run for 2 seconds each so its 
+          not so insanely slow.  Also made the output space delimited [and fixed up various errors]
+       -- Added logs directory, logs/graph.dem which will use gnuplot to make a series of PNG files 
+          that go with the pre-made index.html.  You have to build [via make timing] and run ltmtest first in the 
+          root of the package.
+       -- Fixed a bug in mp_sub and mp_add where "-a - -a" or "-a + a" would produce -0 as the result [obviously invalid].  
+       -- Fixed a bug in mp_rshd.  If the count == a.used it should zero/return [instead of shifting]
+       -- Fixed a "off-by-one" bug in mp_mul2d.  The initial size check on alloc would be off by one if the residue
+          shifting caused a carry.  
+       -- Fixed a bug where s_mp_mul_digs() would not call the Comba based routine if allowed.  This made Barrett reduction
+          slower than it had to be.
+          
+Mar 29th, 2003
+v0.16  -- Sped up mp_div by making normalization one shift call
+       -- Sped up mp_mul_2d/mp_div_2d by aliasing pointers :-)
+       -- Cleaned up mp_gcd to use the macros for odd/even detection
+       -- Added comments here and there, mostly there but occasionally here too.
+
+Mar 22nd, 2003
+v0.15  -- Added series of prime testing routines to lib
+       -- Fixed up etc/tune.c
+       -- Added DR reduction algorithm
+       -- Beefed up the manual more.
+       -- Fixed up demo/demo.c so it doesn't have so many warnings and it does the full series of
+          tests
+       -- Added "pre-gen" directory which will hold a "gen.pl"'ed copy of the entire lib [done at
+          zipup time so its always the latest]
+       -- Added conditional casts for C++ users [boo!]
+
+Mar 15th, 2003
+v0.14  -- Tons of manual updates
+       -- cleaned up the directory
+       -- added MSVC makefiles
+       -- source changes [that I don't recall]
+       -- Fixed up the lshd/rshd code to use pointer aliasing
+       -- Fixed up the mul_2d and div_2d to not call rshd/lshd unless needed
+       -- Fixed up etc/tune.c a tad
+       -- fixed up demo/demo.c to output comma-delimited results of timing
+          also fixed up timing demo to use a finer granularity for various functions
+       -- fixed up demo/demo.c testing to pause during testing so my Duron won't catch on fire
+          [stays around 31-35C during testing :-)]
+       
+Feb 13th, 2003
+v0.13  -- tons of minor speed-ups in low level add, sub, mul_2 and div_2 which propagate 
+          to other functions like mp_invmod, mp_div, etc...
+       -- Sped up mp_exptmod_fast by using new code to find R mod m [e.g. B^n mod m]
+       -- minor fixes
+
+Jan 17th, 2003
+v0.12  -- re-wrote the majority of the makefile so its more portable and will
+          install via "make install" on most *nix platforms
+       -- Re-packaged all the source as seperate files.  Means the library a single
+          file packagage any more.  Instead of just adding "bn.c" you have to add
+          libtommath.a
+       -- Renamed "bn.h" to "tommath.h"
+       -- Changes to the manual to reflect all of this
+       -- Used GNU Indent to clean up the source
+
+Jan 15th, 2003
+v0.11  -- More subtle fixes
+       -- Moved to gentoo linux [hurrah!] so made *nix specific fixes to the make process
+       -- Sped up the montgomery reduction code quite a bit
+       -- fixed up demo so when building timing for the x86 it assumes ELF format now
+       
+Jan 9th, 2003
+v0.10  -- Pekka Riikonen suggested fixes to the radix conversion code.  
+       -- Added baseline montgomery and comba montgomery reductions, sped up exptmods
+          [to a point, see bn.h for MONTGOMERY_EXPT_CUTOFF]
+       
+Jan 6th, 2003
+v0.09  -- Updated the manual to reflect recent changes.  :-)
+       -- Added Jacobi function (mp_jacobi) to supplement the number theory side of the lib
+       -- Added a Mersenne prime finder demo in ./etc/mersenne.c
+
+Jan 2nd, 2003
+v0.08  -- Sped up the multipliers by moving the inner loop variables into a smaller scope
+       -- Corrected a bunch of small "warnings"
+       -- Added more comments
+       -- Made "mtest" be able to use /dev/random, /dev/urandom or stdin for RNG data
+       -- Corrected some bugs where error messages were potentially ignored
+       -- add etc/pprime.c program which makes numbers which are provably prime.
+       
+Jan 1st, 2003
+v0.07  -- Removed alot of heap operations from core functions to speed them up
+       -- Added a root finding function [and mp_sqrt macro like from MPI]
+       -- Added more to manual 
+
+Dec 31st, 2002
+v0.06  -- Sped up the s_mp_add, s_mp_sub which inturn sped up mp_invmod, mp_exptmod, etc...
+       -- Cleaned up the header a bit more
+       
+Dec 30th, 2002
+v0.05  -- Builds with MSVC out of the box
+       -- Fixed a bug in mp_invmod w.r.t. even moduli
+       -- Made mp_toradix and mp_read_radix use char instead of unsigned char arrays
+       -- Fixed up exptmod to use fewer multiplications
+       -- Fixed up mp_init_size to use only one heap operation
+          -- Note there is a slight "off-by-one" bug in the library somewhere
+             without the padding (see the source for comment) the library 
+             crashes in libtomcrypt.  Anyways a reasonable workaround is to pad the
+             numbers which will always correct it since as the numbers grow the padding
+             will still be beyond the end of the number
+       -- Added more to the manual
+       
+Dec 29th, 2002
+v0.04  -- Fixed a memory leak in mp_to_unsigned_bin
+       -- optimized invmod code
+       -- Fixed bug in mp_div
+       -- use exchange instead of copy for results
+       -- added a bit more to the manual
+       
+Dec 27th, 2002
+v0.03  -- Sped up s_mp_mul_high_digs by not computing the carries of the lower digits
+       -- Fixed a bug where mp_set_int wouldn't zero the value first and set the used member.
+       -- fixed a bug in s_mp_mul_high_digs where the limit placed on the result digits was not calculated properly
+       -- fixed bugs in add/sub/mul/sqr_mod functions where if the modulus and dest were the same it wouldn't work
+       -- fixed a bug in mp_mod and mp_mod_d concerning negative inputs
+       -- mp_mul_d didn't preserve sign
+       -- Many many many many fixes
+       -- Works in LibTomCrypt now :-)
+       -- Added iterations to the timing demos... more accurate.
+       -- Tom needs a job.       
+
+Dec 26th, 2002
+v0.02  -- Fixed a few "slips" in the manual.  This is "LibTomMath" afterall :-)
+       -- Added mp_cmp_mag, mp_neg, mp_abs and mp_radix_size that were missing.
+       -- Sped up the fast [comba] multipliers more [yahoo!]
+
+Dec 25th,2002
+v0.01  -- Initial release.  Gimme a break.
+       -- Todo list, 
+           add details to manual [e.g. algorithms]
+           more comments in code
+           example programs
--- a/demo/demo.c
+++ b/demo/demo.c
@ -1,5 +1,12 @@
 #include <time.h>

+#ifdef IOWNANATHLON
+#include <unistd.h>
+#define SLEEP sleep(4)
+#else
+#define SLEEP
+#endif
+
 #include "tommath.h"

 #ifdef TIMER
@ -185,6 +192,7 @@ int main(void)

      log = fopen("logs/add.log", "w");
      for (cnt = 8; cnt <= 128; cnt += 8) {
+         SLEEP;
         mp_rand(&a, cnt);
         mp_rand(&b, cnt);
         reset();
@ -201,11 +209,12 @@ int main(void)

      log = fopen("logs/sub.log", "w");
      for (cnt = 8; cnt <= 128; cnt += 8) {
+         SLEEP;
         mp_rand(&a, cnt);
         mp_rand(&b, cnt);
         reset();
         rr = 0;
-         do { 
+         do {
            DO(mp_sub(&a,&b,&c));
            rr += 16;
         } while (rdtsc() < (CLOCKS_PER_SEC * 2));
@ -226,6 +235,7 @@ int main(void)

      log = fopen((ix==0)?"logs/mult.log":"logs/mult_kara.log", "w");
      for (cnt = 32; cnt <= 288; cnt += 16) {
+         SLEEP;
         mp_rand(&a, cnt);
         mp_rand(&b, cnt);
         reset();
@ -242,6 +252,7 @@ int main(void)

      log = fopen((ix==0)?"logs/sqr.log":"logs/sqr_kara.log", "w");
      for (cnt = 32; cnt <= 288; cnt += 16) {
+         SLEEP;
         mp_rand(&a, cnt);
         reset();
         rr = 0;
@ -265,7 +276,7 @@ int main(void)
         "1475979915214180235084898622737381736312066145333169775147771216478570297878078949377407337049389289382748507531496480477281264838760259191814463365330269540496961201113430156902396093989090226259326935025281409614983499388222831448598601834318536230923772641390209490231836446899608210795482963763094236630945410832793769905399982457186322944729636418890623372171723742105636440368218459649632948538696905872650486914434637457507280441823676813517852099348660847172579408422316678097670224011990280170474894487426924742108823536808485072502240519452587542875349976558572670229633962575212637477897785501552646522609988869914013540483809865681250419497686697771007",
         "259117086013202627776246767922441530941818887553125427303974923161874019266586362086201209516800483406550695241733194177441689509238807017410377709597512042313066624082916353517952311186154862265604547691127595848775610568757931191017711408826252153849035830401185072116424747461823031471398340229288074545677907941037288235820705892351068433882986888616658650280927692080339605869308790500409503709875902119018371991620994002568935113136548829739112656797303241986517250116412703509705427773477972349821676443446668383119322540099648994051790241624056519054483690809616061625743042361721863339415852426431208737266591962061753535748892894599629195183082621860853400937932839420261866586142503251450773096274235376822938649407127700846077124211823080804139298087057504713825264571448379371125032081826126566649084251699453951887789613650248405739378594599444335231188280123660406262468609212150349937584782292237144339628858485938215738821232393687046160677362909315071",
         "190797007524439073807468042969529173669356994749940177394741882673528979787005053706368049835514900244303495954950709725762186311224148828811920216904542206960744666169364221195289538436845390250168663932838805192055137154390912666527533007309292687539092257043362517857366624699975402375462954490293259233303137330643531556539739921926201438606439020075174723029056838272505051571967594608350063404495977660656269020823960825567012344189908927956646011998057988548630107637380993519826582389781888135705408653045219655801758081251164080554609057468028203308718724654081055323215860189611391296030471108443146745671967766308925858547271507311563765171008318248647110097614890313562856541784154881743146033909602737947385055355960331855614540900081456378659068370317267696980001187750995491090350108417050917991562167972281070161305972518044872048331306383715094854938415738549894606070722584737978176686422134354526989443028353644037187375385397838259511833166416134323695660367676897722287918773420968982326089026150031515424165462111337527431154890666327374921446276833564519776797633875503548665093914556482031482248883127023777039667707976559857333357013727342079099064400455741830654320379350833236245819348824064783585692924881021978332974949906122664421376034687815350484991",
-         
+
         /* DR moduli */
         "14059105607947488696282932836518693308967803494693489478439861164411992439598399594747002144074658928593502845729752797260025831423419686528151609940203368612079",
         "101745825697019260773923519755878567461315282017759829107608914364075275235254395622580447400994175578963163918967182013639660669771108475957692810857098847138903161308502419410142185759152435680068435915159402496058513611411688900243039",
@ -289,6 +300,7 @@ int main(void)
   logb = fopen("logs/expt_dr.log", "w");
   logc = fopen("logs/expt_2k.log", "w");
   for (n = 0; primes[n]; n++) {
+      SLEEP;
      mp_read_radix(&a, primes[n], 10);
      mp_zero(&b);
      for (rr = 0; rr < mp_count_bits(&a); rr++) {
@ -325,6 +337,7 @@ int main(void)

   log = fopen("logs/invmod.log", "w");
   for (cnt = 4; cnt <= 128; cnt += 4) {
+      SLEEP;
      mp_rand(&a, cnt);
      mp_rand(&b, cnt);

--- a/etc/makefile.msvc
+++ b/etc/makefile.msvc
@ -12,6 +12,9 @@ mersenne: mersenne.obj
 	
 tune: tune.obj
 	cl tune.obj ../tommath.lib
+
+mont: mont.obj
+	cl mont.obj ../tommath.lib
 	
 drprime: drprime.obj
 	cl drprime.obj ../tommath.lib
--- a/logs/add.log
+++ b/logs/add.log
@ -1,16 +1,16 @@
-224  12444616
-448  10412040
-672   8825112
-896   7519080
-1120   6428432
-1344   5794784
-1568   5242952
-1792   4737008
-2016   4434104
-2240   4132912
-2464   3827752
-2688   3589672
-2912   3350176
-3136   3180208
-3360   3014160
-3584   2847672
+224  12849536
+448   9956080
+672   8372000
+896   7065464
+1120   5824864
+1344   5141728
+1568   4511808
+1792   4170480
+2016   3802536
+2240   3500936
+2464   3193352
+2688   2991976
+2912   2818672
+3136   2661448
+3360   2506560
+3584   2343304
--- a/logs/addsub.png
+++ b/logs/addsub.png
--- a/logs/expt.log
+++ b/logs/expt.log
@ -1,7 +1,7 @@
-513       680
-769       257
-1025       117
-2049        17
-2561         9
-3073         5
+513       600
+769       221
+1025       103
+2049        15
+2561         8
+3073         4
 4097         2
--- a/logs/expt.png
+++ b/logs/expt.png
--- a/logs/expt_2k.log
+++ b/logs/expt_2k.log
@ -1,6 +1,6 @@
-521       736
-607       552
-1279       112
-2203        33
-3217        13
-4253         6
+521       728
+607       549
+1279       100
+2203        29
+3217        11
+4253         5
--- a/logs/expt_dr.log
+++ b/logs/expt_dr.log
@ -1,7 +1,7 @@
-532      1064
-784       460
-1036       240
-1540        91
-2072        43
-3080        15
-4116         6
+532      1032
+784       424
+1036       214
+1540        81
+2072        38
+3080        13
+4116         5
--- a/logs/invmod.log
+++ b/logs/invmod.log
@ -1,32 +1,32 @@
-112     16248
-224      8192
-336      5320
-448      3560
-560      2728
-672      2064
-784      1704
-896      2176
-1008      1184
-1120       976
-1232      1280
-1344      1176
-1456       624
-1568       912
-1680       504
-1792       452
-1904       658
-2016       608
-2128       336
-2240       312
-2352       288
-2464       264
-2576       408
-2688       376
-2800       354
-2912       198
-3024       307
-3136       173
-3248       162
-3360       256
-3472       145
-3584       226
+112     14936
+224      7208
+336      6864
+448      5000
+560      3648
+672      1832
+784      1480
+896      1232
+1008      1010
+1120      1360
+1232       728
+1344       632
+1456       544
+1568       800
+1680       704
+1792       396
+1904       584
+2016       528
+2128       483
+2240       448
+2352       250
+2464       378
+2576       350
+2688       198
+2800       300
+2912       170
+3024       265
+3136       150
+3248       142
+3360       134
+3472       126
+3584       118
--- a/logs/invmod.png
+++ b/logs/invmod.png
--- a/logs/mult.log
+++ b/logs/mult.log
@ -0,0 +1,17 @@
+896    301008
+1344    141872
+1792     84424
+2240     55864
+2688     39784
+3136     29624
+3584     22952
+4032     18304
+4480     14944
+4928     12432
+5376     10496
+5824      8976
+6272      7776
+6720      6792
+7168      1656
+7616      1472
+8064      1312
--- a/logs/mult.png
+++ b/logs/mult.png
--- a/logs/mult_kara.log
+++ b/logs/mult_kara.log
@ -1,17 +1,17 @@
-896    322872
-1344    151688
-1792     90480
-2240     59984
-2688     42656
-3136     32144
-3584     25840
-4032     21328
-4480     17856
-4928     14928
-5376     12856
-5824     11256
-6272      9880
-6720      8984
-7168      7928
-7616      7200
-8064      6576
+896    301784
+1344    141568
+1792     84592
+2240     55864
+2688     39576
+3136     30088
+3584     24032
+4032     19760
+4480     16536
+4928     13376
+5376     11880
+5824     10256
+6272      9160
+6720      8208
+7168      7384
+7616      6664
+8064      6112
--- a/logs/sqr.log
+++ b/logs/sqr.log
@ -0,0 +1,17 @@
+896    371856
+1344    196352
+1792    122312
+2240     83144
+2688     60304
+3136     45832
+3584     12760
+4032     10160
+4480      8352
+4928      6944
+5376      5824
+5824      5008
+6272      4336
+6720      3768
+7168      3280
+7616      2952
+8064      2640
--- a/logs/sqr_kara.log
+++ b/logs/sqr_kara.log
@ -1,17 +1,17 @@
-896    420464
-1344    224800
-1792    142808
-2240     97704
-2688     71416
-3136     54504
-3584     38320
-4032     32360
-4480     27576
-4928     23840
-5376     20688
-5824     18264
-6272     16176
-6720     14440
-7168     11688
-7616     10752
-8064      9936
+896    372256
+1344    196368
+1792    122272
+2240     82976
+2688     60480
+3136     45808
+3584     33296
+4032     27888
+4480     23608
+4928     20296
+5376     17576
+5824     15416
+6272     13600
+6720     12104
+7168     10080
+7616      9232
+8064      8008
--- a/logs/sub.log
+++ b/logs/sub.log
@ -1,16 +1,16 @@
-224  10876088
-448   9103552
-672   7823536
-896   6724960
-1120   5993496
-1344   5278984
-1568   4947736
-1792   4478384
-2016   4108840
-2240   3838696
-2464   3604128
-2688   3402192
-2912   3166568
-3136   3090672
-3360   2946720
-3584   2781288
+224   9325944
+448   8075808
+672   7054912
+896   5757992
+1120   5081768
+1344   4669384
+1568   4422384
+1792   3900416
+2016   3548872
+2240   3428912
+2464   3216968
+2688   2905280
+2912   2782664
+3136   2591440
+3360   2475728
+3584   2282216
--- a/8
+++ b/8
@ -6,7 +6,7 @@ CFLAGS  +=  -I./ -Wall -W -Wshadow -O3 -funroll-loops
 #x86 optimizations [should be valid for any GCC install though]
 CFLAGS  += -fomit-frame-pointer 

-VERSION=0.25
+VERSION=0.26

 default: libtommath.a

@ -81,12 +81,6 @@ poster: poster.tex
 docs:	
 	cd pics ; make pdfes
 	echo "hello" > tommath.ind
-	perl booker.pl
-	latex tommath > /dev/null
-	makeindex tommath
-	latex tommath > /dev/null
-	dvips -tB5 -D600 tommath
-	echo "hello" > tommath.ind
 	perl booker.pl PDF
 	latex tommath > /dev/null
 	makeindex tommath
--- a/mtest/test.c~
+++ b/mtest/test.c~
@ -0,0 +1,23 @@
+#include <stdio.h>
+#include "mpi.c"
+
+int main(void)
+{
+   mp_int a, b;
+   int ix;
+   char buf[1024];
+
+   mp_init(&a);
+   mp_init(&b);
+
+   mp_set(&a, 0x1B);
+   mp_neg(&a, &a);
+   ix = 0;
+   mp_add_d(&a, ix, &b);
+
+   mp_toradix(&b, buf, 64);
+   printf("b == %s\n", buf);
+   return 0;
+}
+
+   
--- a/pics/makefile
+++ b/pics/makefile
@ -3,16 +3,16 @@
 default:  pses

 design_process.ps: design_process.tif
-	tiff2ps -c -e design_process.tif > design_process.ps
+	tiff2ps -s -e design_process.tif > design_process.ps

 sliding_window.ps: sliding_window.tif
-	tiff2ps -c -e sliding_window.tif > sliding_window.ps
+	tiff2ps -s -e sliding_window.tif > sliding_window.ps
 	
 expt_state.ps: expt_state.tif
-	tiff2ps -c -e expt_state.tif > expt_state.ps
+	tiff2ps -s -e expt_state.tif > expt_state.ps

 primality.ps: primality.tif
-	tiff2ps -c -e primality.tif > primality.ps
+	tiff2ps -s -e primality.tif > primality.ps

 design_process.pdf: design_process.ps
 	epstopdf design_process.ps
--- a/pics/makefile~
+++ b/pics/makefile~
@ -0,0 +1,35 @@
+# makes the images... yeah
+
+default:  pses
+
+design_process.ps: design_process.tif
+	tiff2ps -s -e design_process.tif > design_process.ps
+
+sliding_window.ps: sliding_window.tif
+	tiff2ps -e sliding_window.tif > sliding_window.ps
+	
+expt_state.ps: expt_state.tif
+	tiff2ps -e expt_state.tif > expt_state.ps
+
+primality.ps: primality.tif
+	tiff2ps -e primality.tif > primality.ps
+
+design_process.pdf: design_process.ps
+	epstopdf design_process.ps
+
+sliding_window.pdf: sliding_window.ps
+	epstopdf sliding_window.ps
+	
+expt_state.pdf: expt_state.ps
+	epstopdf expt_state.ps
+
+primality.pdf: primality.ps
+	epstopdf primality.ps
+
+
+pses: sliding_window.ps expt_state.ps primality.ps design_process.ps
+pdfes: sliding_window.pdf expt_state.pdf primality.pdf design_process.pdf
+
+clean:
+	rm -rf *.ps *.pdf .xvpics
+   
--- a/poster.pdf
+++ b/poster.pdf
--- a/pre_gen/mpi.c
+++ b/pre_gen/mpi.c
@ -241,7 +241,10 @@ fast_mp_montgomery_reduce (mp_int * x, mp_int * n, mp_digit rho)
    register mp_word *_W;
    register mp_digit *tmpx;

-    _W = W;
+    /* alias for the W[] array */
+    _W   = W;
+
+    /* alias for the digits of  x*/
    tmpx = x->dp;

    /* copy the digits of a into W[0..a->used-1] */
@ -925,7 +928,6 @@ mp_add_d (mp_int * a, mp_digit b, mp_int * c)
     return res;
  }

-
  /* old number of used digits in c */
  oldused = c->used;

@ -959,7 +961,6 @@ mp_add_d (mp_int * a, mp_digit b, mp_int * c)
     /* set final carry */
     ix++;
     *tmpc++  = mu;
-
  } else {
     /* a was negative and |a| < b */
     c->used  = 1;
@ -1217,12 +1218,12 @@ int
 mp_cmp (mp_int * a, mp_int * b)
 {
  /* compare based on sign */
-  if (a->sign == MP_NEG && b->sign == MP_ZPOS) {
-    return MP_LT;
-  } 
-  
-  if (a->sign == MP_ZPOS && b->sign == MP_NEG) {
-    return MP_GT;
+  if (a->sign != b->sign) {
+     if (a->sign == MP_NEG) {
+        return MP_LT;
+     } else {
+        return MP_GT;
+     }
  }
  
  /* compare digits */
@ -1301,23 +1302,30 @@ int
 mp_cmp_mag (mp_int * a, mp_int * b)
 {
  int     n;
+  mp_digit *tmpa, *tmpb;

  /* compare based on # of non-zero digits */
  if (a->used > b->used) {
    return MP_GT;
-  } 
+  }
  
  if (a->used < b->used) {
    return MP_LT;
  }

+  /* alias for a */
+  tmpa = a->dp + (a->used - 1);
+
+  /* alias for b */
+  tmpb = b->dp + (a->used - 1);
+
  /* compare based on digits  */
-  for (n = a->used - 1; n >= 0; n--) {
-    if (a->dp[n] > b->dp[n]) {
+  for (n = 0; n < a->used; ++n, --tmpa, --tmpb) {
+    if (*tmpa > *tmpb) {
      return MP_GT;
-    } 
-    
-    if (a->dp[n] < b->dp[n]) {
+    }
+
+    if (*tmpa < *tmpb) {
      return MP_LT;
    }
  }
@ -1594,8 +1602,9 @@ mp_div (mp_int * a, mp_int * b, mp_int * c, mp_int * d)

  /* step 3. for i from n down to (t + 1) */
  for (i = n; i >= (t + 1); i--) {
-    if (i > x.used)
+    if (i > x.used) {
      continue;
+    }

    /* step 3.1 if xi == yt then set q{i-t-1} to b-1, 
     * otherwise set q{i-t-1} to (xi*b + x{i-1})/yt */
@ -2111,6 +2120,8 @@ int mp_dr_is_modulus(mp_int *a)
 * The modulus must be of a special format [see manual]
 *
 * Has been modified to use algorithm 7.10 from the LTM book instead
+ *
+ * Input x must be in the range 0 <= x <= (n-1)^2
 */
 int
 mp_dr_reduce (mp_int * x, mp_int * n, mp_digit k)
@ -2149,10 +2160,10 @@ top:
      *tmpx1++  = (mp_digit)(r & MP_MASK);
      mu        = (mp_digit)(r >> ((mp_word)DIGIT_BIT));
  }
-  
+
  /* set final carry */
  *tmpx1++ = mu;
-  
+
  /* zero words above m */
  for (i = m + 1; i < x->used; i++) {
      *tmpx1++ = 0;
@ -3060,6 +3071,8 @@ int mp_init_multi(mp_int *mp, ...)
    va_end(args);
    return res;                /* Assumed ok, if error flagged above. */
 }
+
+
 /* End: bn_mp_init_multi.c */

 /* Start: bn_mp_init_size.c */
@ -3689,30 +3702,43 @@ ERR:
 */
 #include <tommath.h>

-/* computes least common multiple as a*b/(a, b) */
+/* computes least common multiple as |a*b|/(a, b) */
 int
 mp_lcm (mp_int * a, mp_int * b, mp_int * c)
 {
  int     res;
-  mp_int  t;
+  mp_int  t1, t2;


-  if ((res = mp_init (&t)) != MP_OKAY) {
+  if ((res = mp_init_multi (&t1, &t2, NULL)) != MP_OKAY) {
    return res;
  }

-  if ((res = mp_mul (a, b, &t)) != MP_OKAY) {
-    mp_clear (&t);
-    return res;
+  /* t1 = get the GCD of the two inputs */
+  if ((res = mp_gcd (a, b, &t1)) != MP_OKAY) {
+    goto __T;
  }

-  if ((res = mp_gcd (a, b, c)) != MP_OKAY) {
-    mp_clear (&t);
-    return res;
+  /* divide the smallest by the GCD */
+  if (mp_cmp_mag(a, b) == MP_LT) {
+     /* store quotient in t2 such that t2 * b is the LCM */
+     if ((res = mp_div(a, &t1, &t2, NULL)) != MP_OKAY) {
+        goto __T;
+     }
+     res = mp_mul(b, &t2, c);
+  } else {
+     /* store quotient in t2 such that t2 * a is the LCM */
+     if ((res = mp_div(b, &t1, &t2, NULL)) != MP_OKAY) {
+        goto __T;
+     }
+     res = mp_mul(a, &t2, c);
  }

-  res = mp_div (&t, c, c, NULL);
-  mp_clear (&t);
+  /* fix the sign to positive */
+  c->sign = MP_ZPOS;
+
+__T:
+  mp_clear_multi (&t1, &t2, NULL);
  return res;
 }

@ -4012,7 +4038,14 @@ mp_montgomery_reduce (mp_int * x, mp_int * n, mp_digit rho)
  x->used = digs;

  for (ix = 0; ix < n->used; ix++) {
-    /* mu = ai * m' mod b */
+    /* mu = ai * rho mod b
+     *
+     * The value of rho must be precalculated via
+     * bn_mp_montgomery_setup() such that
+     * it equals -1/n0 mod b this allows the
+     * following inner loop to reduce the
+     * input one digit at a time
+     */
    mu = ((mp_word)x->dp[ix]) * ((mp_word)rho) & MP_MASK;

    /* a = a + mu * m * b**i */
@ -4021,8 +4054,10 @@ mp_montgomery_reduce (mp_int * x, mp_int * n, mp_digit rho)
      register mp_digit *tmpn, *tmpx, u;
      register mp_word r;

-      /* aliases */
+      /* alias for digits of the modulus */
      tmpn = n->dp;
+
+      /* alias for the digits of x [the input] */
      tmpx = x->dp + ix;

      /* set the carry to zero */
@ -4030,12 +4065,20 @@ mp_montgomery_reduce (mp_int * x, mp_int * n, mp_digit rho)
      
      /* Multiply and add in place */
      for (iy = 0; iy < n->used; iy++) {
+        /* compute product and sum */
        r       = ((mp_word)mu) * ((mp_word)*tmpn++) +
                  ((mp_word) u) + ((mp_word) * tmpx);
+
+        /* get carry */
        u       = (mp_digit)(r >> ((mp_word) DIGIT_BIT));
+
+        /* fix digit */
        *tmpx++ = (mp_digit)(r & ((mp_word) MP_MASK));
      }
-      /* propagate carries */
+      /* At this point the ix'th digit of x should be zero */
+
+
+      /* propagate carries upwards as required*/
      while (u) {
        *tmpx   += u;
        u        = *tmpx >> DIGIT_BIT;
@ -4044,11 +4087,18 @@ mp_montgomery_reduce (mp_int * x, mp_int * n, mp_digit rho)
    }
  }

+  /* at this point the n.used'th least
+   * significant digits of x are all zero
+   * which means we can shift x to the
+   * right by n.used digits and the
+   * residue is unchanged.
+   */
+
  /* x = x/b**n.used */
  mp_clamp(x);
  mp_rshd (x, n->used);

-  /* if A >= m then A = A - m */
+  /* if x >= n then x = x - n */
  if (mp_cmp_mag (x, n) != MP_LT) {
    return s_mp_sub (x, n, x);
  }
@ -4606,7 +4656,9 @@ mp_neg (mp_int * a, mp_int * b)
  if ((res = mp_copy (a, b)) != MP_OKAY) {
    return res;
  }
-  b->sign = (a->sign == MP_ZPOS) ? MP_NEG : MP_ZPOS;
+  if (mp_iszero(b) != 1) {
+     b->sign = (a->sign == MP_ZPOS) ? MP_NEG : MP_ZPOS;
+  }
  return MP_OKAY;
 }

@ -5360,10 +5412,18 @@ mp_read_signed_bin (mp_int * a, unsigned char *b, int c)
 {
  int     res;

+  /* read magnitude */
  if ((res = mp_read_unsigned_bin (a, b + 1, c - 1)) != MP_OKAY) {
    return res;
  }
-  a->sign = ((b[0] == (unsigned char) 0) ? MP_ZPOS : MP_NEG);
+
+  /* first byte is 0 for positive, non-zero for negative */
+  if (b[0] == 0) {
+     a->sign = MP_ZPOS;
+  } else {
+     a->sign = MP_NEG;
+  }
+
  return MP_OKAY;
 }

@ -5391,7 +5451,18 @@ int
 mp_read_unsigned_bin (mp_int * a, unsigned char *b, int c)
 {
  int     res;
+
+  /* make sure there are at least two digits */
+  if (a->alloc < 2) {
+     if ((res = mp_grow(a, 2)) != MP_OKAY) {
+        return res;
+     }
+  }
+
+  /* zero the int */
  mp_zero (a);
+
+  /* read the bytes in */
  while (c-- > 0) {
    if ((res = mp_mul_2d (a, 8, a)) != MP_OKAY) {
      return res;
@ -5804,6 +5875,7 @@ mp_set_int (mp_int * a, unsigned int b)
  int     x, res;

  mp_zero (a);
+  
  /* set four bits at a time */
  for (x = 0; x < 8; x++) {
    /* shift the number up four bits */
@ -7414,15 +7486,15 @@ s_mp_mul_digs (mp_int * a, mp_int * b, mp_int * c, int digs)
    /* compute the columns of the output and propagate the carry */
    for (iy = 0; iy < pb; iy++) {
      /* compute the column as a mp_word */
-      r = ((mp_word) *tmpt) + 
-          ((mp_word)tmpx) * ((mp_word)*tmpy++) +
-          ((mp_word) u);
+      r       = ((mp_word)*tmpt) +
+                ((mp_word)tmpx) * ((mp_word)*tmpy++) +
+                ((mp_word) u);

      /* the new column is the lower part of the result */
      *tmpt++ = (mp_digit) (r & ((mp_word) MP_MASK));

      /* get the carry word from the result */
-      u = (mp_digit) (r >> ((mp_word) DIGIT_BIT));
+      u       = (mp_digit) (r >> ((mp_word) DIGIT_BIT));
    }
    /* set carry if it is placed below digs */
    if (ix + iy < digs) {
@ -7468,7 +7540,6 @@ s_mp_mul_high_digs (mp_int * a, mp_int * b, mp_int * c, int digs)
  mp_word r;
  mp_digit tmpx, *tmpt, *tmpy;

-
  /* can we use the fast multiplier? */
  if (((a->used + b->used + 1) < MP_WARRAY)
      && MIN (a->used, b->used) < (1 << ((CHAR_BIT * sizeof (mp_word)) - (2 * DIGIT_BIT)))) {
@ -7497,13 +7568,15 @@ s_mp_mul_high_digs (mp_int * a, mp_int * b, mp_int * c, int digs)

    for (iy = digs - ix; iy < pb; iy++) {
      /* calculate the double precision result */
-      r = ((mp_word) * tmpt) + ((mp_word)tmpx) * ((mp_word)*tmpy++) + ((mp_word) u);
+      r       = ((mp_word)*tmpt) +
+                ((mp_word)tmpx) * ((mp_word)*tmpy++) +
+                ((mp_word) u);

      /* get the lower part */
      *tmpt++ = (mp_digit) (r & ((mp_word) MP_MASK));

      /* carry the carry */
-      u = (mp_digit) (r >> ((mp_word) DIGIT_BIT));
+      u       = (mp_digit) (r >> ((mp_word) DIGIT_BIT));
    }
    *tmpt = u;
  }
@ -7572,19 +7645,19 @@ s_mp_sqr (mp_int * a, mp_int * b)
      /* now calculate the double precision result, note we use
       * addition instead of *2 since it's easier to optimize
       */
-      r = ((mp_word) *tmpt) + r + r + ((mp_word) u);
+      r       = ((mp_word) *tmpt) + r + r + ((mp_word) u);

      /* store lower part */
      *tmpt++ = (mp_digit) (r & ((mp_word) MP_MASK));

      /* get carry */
-      u = (mp_digit)(r >> ((mp_word) DIGIT_BIT));
+      u       = (mp_digit)(r >> ((mp_word) DIGIT_BIT));
    }
    /* propagate upwards */
    while (u != ((mp_digit) 0)) {
-      r = ((mp_word) * tmpt) + ((mp_word) u);
+      r       = ((mp_word) *tmpt) + ((mp_word) u);
      *tmpt++ = (mp_digit) (r & ((mp_word) MP_MASK));
-      u = (mp_digit)(r >> ((mp_word) DIGIT_BIT));
+      u       = (mp_digit)(r >> ((mp_word) DIGIT_BIT));
    }
  }