added libtommath-0.08

2003-02-28 16:06:22 +00:00 · 2003-02-28 16:06:22 +00:00 · 2cfbb89142
commit 2cfbb89142
parent 3cd7000342
11 changed files with 656 additions and 116 deletions
--- a/bn.c
+++ b/bn.c
@ -25,18 +25,117 @@ static const char *s_rmap =
 #ifdef DEBUG
-static char *_funcs[1000];
+/* timing data */
-int _ifuncs;
+#ifdef TIMER_X86
 extern ulong64 gettsc(void);
 #else
 ulong64 gettsc(void) { return clock(); }
 #endif
-#define REGFUNC(name) { if (_ifuncs == 999) { printf("TROUBLE\n"); exit(0); } _funcs[_ifuncs++] = name; }
+/* structure to hold timing data */
-#define DECFUNC()     --_ifuncs;
+struct {
   char *func;
   ulong64 start, end, tot;
 } timings[1000000];
 /* structure to hold consolidated timing data */
 struct _functime { 
   char *func;
   ulong64 tot;
 } functime[1000];
 static char *_funcs[1000];
 int _ifuncs, _itims;
 #define REGFUNC(name) int __IX = _itims++; _funcs[_ifuncs++] = name; timings[__IX].func = name; timings[__IX].start = gettsc();
 #define DECFUNC()     timings[__IX].end = gettsc(); --_ifuncs;
 #define VERIFY(val)   _verify(val, #val, __LINE__);
 /* sort the consolidated timings */
 int qsort_helper(const void *A, const void *B)
 {
   struct _functime *a, *b;
   a = (struct _functime *)A;
   b = (struct _functime *)B;
   if (a->tot > b->tot) return -1;
   if (a->tot < b->tot) return 1;
   return 0;
 }
 /* reset debugging information */
 void reset_timings(void)
 {
   _ifuncs = _itims = 0;
 }   
 /* dump the timing data */
 void dump_timings(void)
 {
   int x, y;
   ulong64 total;
   /* first for every find the total time */
   printf("Phase I  ... Finding totals (%d samples)...\n", _itims);
   for (x = 0; x < _itims; x++) {
       timings[x].tot = timings[x].end - timings[x].start;
   }
   /* now subtract the time for each function where nested functions occured */
   printf("Phase II ... Finding dependencies...\n");
   for (x = 0; x < _itims-1; x++) {
       for (y = x+1; y < _itims && timings[y].start <= timings[x].end; y++) {
           timings[x].tot -= timings[y].tot;
           if (timings[x].tot > ((ulong64)1 << (ulong64)40)) {
              timings[x].tot = 0;
           }
       }
   }
   /* now consolidate all the entries */
   printf("Phase III... Consolidation...\n");
   memset(&functime, 0, sizeof(functime));
   total = 0;
   for (x = 0; x < _itims; x++) {
       total += timings[x].tot;
       /* try to find this entry */
       for (y = 0; functime[y].func != NULL; y++) {
           if (strcmp(timings[x].func, functime[y].func) == 0) {
              break;
           }
       }
       if (functime[y].func == NULL) {
          /* new entry */
          functime[y].func = timings[x].func;
          functime[y].tot  = timings[x].tot;
       } else { 
          functime[y].tot  += timings[x].tot;
       }
   }
   for (x = 0; functime[x].func != NULL; x++);
   /* sort and dump */
   qsort(&functime, x, sizeof(functime[0]), &qsort_helper);
   for (x = 0; functime[x].func != NULL; x++) {
      if (functime[x].tot > 0 && strcmp(functime[x].func, "_verify") != 0) {
         printf("%30s: %20llu (%3llu.%03llu %%)\n", functime[x].func, functime[x].tot, (functime[x].tot * (ulong64)100) / total, ((functime[x].tot * (ulong64)100000) / total) % (ulong64)1000);
      }
   }
 }   
 static void _verify(mp_int *a, char *name, int line)
 {
  int n, y;
  static const char *err[] = { "Null DP", "alloc < used", "digits above used" };
  REGFUNC("_verify");
  /* dp null ? */
  y = 0;
  if (a->dp == NULL) goto error;
@ -52,6 +151,7 @@ static void _verify(mp_int *a, char *name, int line)
  }
  /* ok */
  DECFUNC();
  return;
 error:
  printf("Error (%s) with variable {%s} on line %d\n", err[y], name, line);
@ -64,7 +164,7 @@ error:
  exit(0);
 }
-#else
+#else /* don't use DEBUG stuff so these macros are blank */
 #define REGFUNC(name)
 #define DECFUNC()
@ -76,13 +176,18 @@ error:
 int mp_init(mp_int *a)
 {
    REGFUNC("mp_init");
-    a->dp = calloc(sizeof(mp_digit), 16);
+    
    /* allocate ram required and clear it */
    a->dp = calloc(sizeof(mp_digit), MP_PREC);
    if (a->dp == NULL) {
       DECFUNC();
       return MP_MEM;
    }
    /* set the used to zero, allocated digit to the default precision 
     * and sign to positive */
    a->used  = 0;
-    a->alloc = 16;
+    a->alloc = MP_PREC;
    a->sign  = MP_ZPOS;
    VERIFY(a);
@ -96,8 +201,14 @@ void mp_clear(mp_int *a)
   REGFUNC("mp_clear");
   if (a->dp != NULL) {
      VERIFY(a);
-      memset(a->dp, 0, sizeof(mp_digit) * a->alloc);
+      
      /* first zero the digits */
      memset(a->dp, 0, sizeof(mp_digit) * a->used);
      /* free ram */
      free(a->dp);
      /* reset members to make debugging easier */
      a->dp = NULL;
      a->alloc = a->used = 0;
   }
@ -118,27 +229,26 @@ void mp_exch(mp_int *a, mp_int *b)
 /* grow as required */
 static int mp_grow(mp_int *a, int size)
 {
-   int i;
+   int i, n;
   mp_digit *tmp;
   REGFUNC("mp_grow");
   VERIFY(a);
   /* if the alloc size is smaller alloc more ram */
   if (a->alloc < size) {
-      size += 32 - (size & 15);           /* ensure there are always at least 16 digits extra on top */
+      size += (MP_PREC*2) - (size & (MP_PREC-1));           /* ensure there are always at least 16 digits extra on top */
-      tmp = calloc(sizeof(mp_digit), size);
+      a->dp = realloc(a->dp, sizeof(mp_digit)*size);
-      if (tmp == NULL) {
+      if (a->dp == NULL) {
         DECFUNC();
         return MP_MEM;
      }
-      for (i = 0; i < a->used; i++) {
+
-          tmp[i] = a->dp[i];
+      n = a->alloc;
      }
      free(a->dp);
      a->dp = tmp;
      a->alloc = size;
      for (i = n; i < a->alloc; i++) {
          a->dp[i] = 0;
      }
   }
   DECFUNC();
   return MP_OKAY;
@ -233,8 +343,7 @@ int mp_init_size(mp_int *a, int size)
   REGFUNC("mp_init_size");
   /* pad up so there are at least 16 zero digits */
-   size += 32 - (size & 15);
+   size += (MP_PREC*2) - (size & (MP_PREC-1));           /* ensure there are always at least 16 digits extra on top */
   a->dp = calloc(sizeof(mp_digit), size);
   if (a->dp == NULL) {
      DECFUNC();
@ -270,7 +379,6 @@ int mp_copy(mp_int *a, mp_int *b)
   }
   /* zero b and copy the parameters over */
   mp_zero(b);
   b->used = a->used;
   b->sign = a->sign;
@ -278,6 +386,11 @@ int mp_copy(mp_int *a, mp_int *b)
   for (n = 0; n < a->used; n++) {
       b->dp[n] = a->dp[n];
   }
   /* clear high digits */
   for (n = b->used; n < b->alloc; n++) {
       b->dp[n] = 0;
   }
   DECFUNC();
   return MP_OKAY;
 }
@ -513,7 +626,7 @@ int mp_mod_2d(mp_int *a, int b, mp_int *c)
       c->dp[x] = 0;
   }
   /* clear the digit that is not completely outside/inside the modulus */
-   c->dp[b/DIGIT_BIT] &= (mp_digit)((((mp_digit)1)<<(b % DIGIT_BIT)) - ((mp_digit)1));
+   c->dp[b/DIGIT_BIT] &= (mp_digit)((((mp_digit)1)<<(((mp_digit)b) % DIGIT_BIT)) - ((mp_digit)1));
   mp_clamp(c);
   DECFUNC();
   return MP_OKAY;
@ -697,7 +810,7 @@ int mp_mul_2(mp_int *a, mp_int *b)
   return MP_OKAY;
 }
-/* low level addition */
+/* low level addition, based on HAC pp.594, Algorithm 14.7 */
 static int s_mp_add(mp_int *a, mp_int *b, mp_int *c)
 {
   mp_int *x;
@ -709,7 +822,9 @@ static int s_mp_add(mp_int *a, mp_int *b, mp_int *c)
   VERIFY(b);
   VERIFY(c);
-   /* find sizes */
+   /* find sizes, we let |a| <= |b| which means we have to sort
    * them.  "x" will point to the input with the most digits
    */
   if (a->used > b->used) {
      min = b->used;
      max = a->used;
@ -735,9 +850,11 @@ static int s_mp_add(mp_int *a, mp_int *b, mp_int *c)
   c->used = max + 1;
   /* add digits from lower part */
   /* set the carry to zero */
   u = 0;
   for (i = 0; i < min; i++) {
-       /* T[i] = A[i] + B[i] + U */   
+       /* Compute the sum at one digit, T[i] = A[i] + B[i] + U */   
       c->dp[i] = a->dp[i] + b->dp[i] + u;
       /* U = carry bit of T[i] */       
@ -774,7 +891,7 @@ static int s_mp_add(mp_int *a, mp_int *b, mp_int *c)
   return MP_OKAY;       
 }
-/* low level subtraction (assumes a > b) */
+/* low level subtraction (assumes a > b), HAC pp.595 Algorithm 14.9 */
 static int s_mp_sub(mp_int *a, mp_int *b, mp_int *c)
 {
   int olduse, res, min, max, i;
@ -800,6 +917,8 @@ static int s_mp_sub(mp_int *a, mp_int *b, mp_int *c)
   c->used = max;
   /* sub digits from lower part */
   /* set carry to zero */
   u = 0;
   for (i = 0; i < min; i++) {
       /* T[i] = A[i] - B[i] - U */
@ -849,9 +968,8 @@ static int s_mp_sub(mp_int *a, mp_int *b, mp_int *c)
 */
 static int fast_s_mp_mul_digs(mp_int *a, mp_int *b, mp_int *c, int digs)
 {
-   int olduse, res, pa, pb, ix, iy;
+   int olduse, res, pa, ix;
-   mp_word W[512], *_W;
+   mp_word W[512];
   mp_digit tmpx, *tmpy;
   REGFUNC("fast_s_mp_mul_digs");
   VERIFY(a);
@ -866,7 +984,7 @@ static int fast_s_mp_mul_digs(mp_int *a, mp_int *b, mp_int *c, int digs)
   }
   /* clear temp buf (the columns) */
-   memset(W, 0, digs*sizeof(mp_word));
+   memset(W, 0, sizeof(mp_word) * digs);
   /* calculate the columns */
   pa = a->used;
@ -876,23 +994,43 @@ static int fast_s_mp_mul_digs(mp_int *a, mp_int *b, mp_int *c, int digs)
        * of output are produced.  So at most we want to make upto "digs" digits
        * of output
        */
       pb = MIN(b->used, digs - ix);
       /* setup some pointer aliases to simplify the inner loop */
       tmpx = a->dp[ix];
       tmpy = b->dp;
       _W   = &(W[ix]);
       /* this adds products to distinct columns (at ix+iy) of W
        * note that each step through the loop is not dependent on
        * the previous which means the compiler can easily unroll
        * the loop without scheduling problems
        */
       {
          register mp_digit tmpx, *tmpy;
          register mp_word  *_W;
          register int      iy, pb;
          /* alias for the the word on the left e.g. A[ix] * A[iy] */
          tmpx = a->dp[ix];
          /* alias for the right side */
          tmpy = b->dp;
          /* alias for the columns, each step through the loop adds a new
             term to each column 
           */
          _W   = W + ix;
          /* the number of digits is limited by their placement.  E.g. 
             we avoid multiplying digits that will end up above the # of
             digits of precision requested
           */
          pb = MIN(b->used, digs - ix);
          for (iy = 0; iy < pb; iy++) {
              *_W++ += ((mp_word)tmpx) * ((mp_word)*tmpy++); 
          }
       }
   }
   /* setup dest */
   olduse  = c->used;
   c->used = digs;
@ -908,11 +1046,12 @@ static int fast_s_mp_mul_digs(mp_int *a, mp_int *b, mp_int *c, int digs)
    * N^2 + N*c where c is the cost of the shifting.  On very small numbers
    * this is slower but on most cryptographic size numbers it is faster.
    */
   for (ix = 1; ix < digs; ix++) {
-       W[ix]       = W[ix] + (W[ix-1] >> ((mp_word)DIGIT_BIT));
+       W[ix]       += (W[ix-1] >> ((mp_word)DIGIT_BIT));
-       c->dp[ix-1] = W[ix-1] & ((mp_word)MP_MASK);
+       c->dp[ix-1] = (mp_digit)(W[ix-1] & ((mp_word)MP_MASK));
   }
-   c->dp[digs-1]   = W[digs-1] & ((mp_word)MP_MASK);
+   c->dp[digs-1]   = (mp_digit)(W[digs-1] & ((mp_word)MP_MASK));
   /* clear unused */
   for (ix = c->used; ix < olduse; ix++) {
@ -924,7 +1063,10 @@ static int fast_s_mp_mul_digs(mp_int *a, mp_int *b, mp_int *c, int digs)
   return MP_OKAY;
 }
-/* multiplies |a| * |b| and only computes upto digs digits of result */
+/* multiplies |a| * |b| and only computes upto digs digits of result 
 * HAC pp. 595, Algorithm 14.12  Modified so you can control how many digits of 
 * output are created.  
 */
 static int s_mp_mul_digs(mp_int *a, mp_int *b, mp_int *c, int digs)
 {
   mp_int t;
@ -1001,9 +1143,8 @@ static int s_mp_mul_digs(mp_int *a, mp_int *b, mp_int *c, int digs)
 */
 static int fast_s_mp_mul_high_digs(mp_int *a, mp_int *b, mp_int *c, int digs)
 {
-   int oldused, newused, res, pa, pb, ix, iy;
+   int oldused, newused, res, pa, pb, ix;
-   mp_word W[512], *_W;
+   mp_word W[512];
   mp_digit tmpx, *tmpy;
   REGFUNC("fast_s_mp_mul_high_digs");
   VERIFY(a);
@ -1023,16 +1164,31 @@ static int fast_s_mp_mul_high_digs(mp_int *a, mp_int *b, mp_int *c, int digs)
   pb = b->used;
   memset(&W[digs], 0, (pa + pb + 1 - digs) * sizeof(mp_word));
   for (ix = 0; ix < pa; ix++) {
-       /* pointer aliases */
+       {
 	      register mp_digit tmpx, *tmpy;
 	      register int      iy;
 	      register mp_word  *_W;
          /* work todo, that is we only calculate digits that are at "digs" or above  */
          iy   = digs - ix;
          /* copy of word on the left of A[ix] * B[iy] */
          tmpx = a->dp[ix];
-       tmpy = b->dp + (digs - ix);
+          
          /* alias for right side */
          tmpy = b->dp + iy;
          /* alias for the columns of output.  Offset to be equal to or above the 
           * smallest digit place requested 
           */
          _W   = &(W[digs]);
          /* compute column products for digits above the minimum */
-       for (iy = digs - ix; iy < pb; iy++) {
+          for (; iy < pb; iy++) {
              *_W++ += ((mp_word)tmpx) * ((mp_word)*tmpy++);
          }
       }
   }
   /* setup dest */
   oldused = c->used;
@ -1040,10 +1196,10 @@ static int fast_s_mp_mul_high_digs(mp_int *a, mp_int *b, mp_int *c, int digs)
   /* now convert the array W downto what we need */
   for (ix = digs+1; ix < (pa+pb+1); ix++) {
-       W[ix]       = W[ix] + (W[ix-1] >> ((mp_word)DIGIT_BIT));
+       W[ix]       += (W[ix-1] >> ((mp_word)DIGIT_BIT));
-       c->dp[ix-1] = W[ix-1] & ((mp_word)MP_MASK);
+       c->dp[ix-1] = (mp_digit)(W[ix-1] & ((mp_word)MP_MASK));
   }
-   c->dp[(pa+pb+1)-1] = W[(pa+pb+1)-1] & ((mp_word)MP_MASK);
+   c->dp[(pa+pb+1)-1] = (mp_digit)(W[(pa+pb+1)-1] & ((mp_word)MP_MASK));
   for (ix = c->used; ix < oldused; ix++) {
      c->dp[ix] = 0;
@ -1085,13 +1241,26 @@ static int s_mp_mul_high_digs(mp_int *a, mp_int *b, mp_int *c, int digs)
   pa = a->used;
   pb = b->used;
   for (ix = 0; ix < pa; ix++) {
       /* clear the carry */
       u = 0;
       /* left hand side of A[ix] * B[iy] */
       tmpx = a->dp[ix];
       /* alias to the address of where the digits will be stored */       
       tmpt = &(t.dp[digs]);
       /* alias for where to read the right hand side from */       
       tmpy = b->dp + (digs - ix);
       for (iy = digs - ix; iy < pb; iy++) {
           /* calculate the double precision result */
           r       = ((mp_word)*tmpt) + ((mp_word)tmpx) * ((mp_word)*tmpy++) + ((mp_word)u);
           /* get the lower part */
           *tmpt++ = (mp_digit)(r & ((mp_word)MP_MASK));
           /* carry the carry */
           u       = (mp_digit)(r >> ((mp_word)DIGIT_BIT));
       }
       *tmpt = u;
@ -1119,9 +1288,8 @@ static int s_mp_mul_high_digs(mp_int *a, mp_int *b, mp_int *c, int digs)
 */
 static int fast_s_mp_sqr(mp_int *a, mp_int *b)
 {
-   int olduse, newused, res, ix, iy, pa;
+   int olduse, newused, res, ix, pa;
-   mp_word  W2[512], W[512], *_W;
+   mp_word  W2[512], W[512];
   mp_digit tmpx, *tmpy;
   REGFUNC("fast_s_mp_sqr");
   VERIFY(a);
@ -1144,9 +1312,17 @@ static int fast_s_mp_sqr(mp_int *a, mp_int *b)
       /* compute the outer product */
       W2[ix+ix]   += ((mp_word)a->dp[ix]) * ((mp_word)a->dp[ix]);
-       /* pointer aliasing! */
+       {
          register mp_digit tmpx, *tmpy;
          register mp_word  *_W;
          register int      iy;
          /* copy of left side */
          tmpx = a->dp[ix];
-	   tmpy = &(a->dp[ix+1]);
+          
          /* alias for right side */
          tmpy = a->dp + (ix + 1);
 	      _W   = &(W[ix+ix+1]);
 	      /* inner products */
@ -1154,6 +1330,7 @@ static int fast_s_mp_sqr(mp_int *a, mp_int *b)
 	          *_W++ += ((mp_word)tmpx) * ((mp_word)*tmpy++);
          }
       }
   }
   /* double first value, since the inner products are half of what they should be */
   W[0] += W[0] + W2[0];
@ -1168,9 +1345,9 @@ static int fast_s_mp_sqr(mp_int *a, mp_int *b)
       W[ix]       += W[ix] + W2[ix];
       W[ix]       = W[ix] + (W[ix-1] >> ((mp_word)DIGIT_BIT));
-       b->dp[ix-1] = W[ix-1] & ((mp_word)MP_MASK);
+       b->dp[ix-1] = (mp_digit)(W[ix-1] & ((mp_word)MP_MASK));
   }
-   b->dp[(pa+pa+1)-1]   = W[(pa+pa+1)-1] & ((mp_word)MP_MASK);
+   b->dp[(pa+pa+1)-1] = (mp_digit)(W[(pa+pa+1)-1] & ((mp_word)MP_MASK));
   /* clear high */
   for (ix = b->used; ix < olduse; ix++) {
@ -1185,7 +1362,7 @@ static int fast_s_mp_sqr(mp_int *a, mp_int *b)
   return MP_OKAY;
 }
-/* low level squaring, b = a*a */
+/* low level squaring, b = a*a, HAC pp.596-597, Algorithm 14.16 */
 static int s_mp_sqr(mp_int *a, mp_int *b)
 {
   mp_int t;
@ -1212,15 +1389,34 @@ static int s_mp_sqr(mp_int *a, mp_int *b)
   t.used = pa + pa + 1;
   for (ix = 0; ix < pa; ix++) {
       /* first calculate the digit at 2*ix */
       /* calculate double precision result */   
       r           = ((mp_word)t.dp[ix+ix]) + ((mp_word)a->dp[ix]) * ((mp_word)a->dp[ix]);
       /* store lower part in result */       
       t.dp[ix+ix] = (mp_digit)(r & ((mp_word)MP_MASK));
       /* get the carry */
 	   u           = (r >> ((mp_word)DIGIT_BIT));
 	   /* left hand side of A[ix] * A[iy] */
 	   tmpx = a->dp[ix];
 	   /* alias for where to store the results */
 	   tmpt = &(t.dp[ix+ix+1]);
 	   for (iy = ix + 1; iy < pa; iy++) {
 	       /* first calculate the product */
 	       r           = ((mp_word)tmpx) * ((mp_word)a->dp[iy]);
 	       /* now calculate the double precision result, note we use
 	        * addition instead of *2 since its easier to optimize
 	        */
 	       r           = ((mp_word)*tmpt) + r + r + ((mp_word)u);
 	       /* store lower part */
           *tmpt++     = (mp_digit)(r & ((mp_word)MP_MASK));
           /* get carry */
    	   u           = (r >> ((mp_word)DIGIT_BIT));
       }
       r           = ((mp_word)*tmpt) + u;
@ -1334,11 +1530,11 @@ int mp_sub(mp_int *a, mp_int *b, mp_int *c)
   return res;
 }
-/* c = |a| * |b| using Karatsuba */
+/* c = |a| * |b| using Karatsuba Multiplication */
 static int mp_karatsuba_mul(mp_int *a, mp_int *b, mp_int *c)
 {
   mp_int x0, x1, y0, y1, t1, t2, x0y0, x1y1;
-   int B, err, neg, x;
+   int B, err, x;
   REGFUNC("mp_karatsuba_mul");
   VERIFY(a);
@ -1396,9 +1592,7 @@ static int mp_karatsuba_mul(mp_int *a, mp_int *b, mp_int *c)
   /* now calc x1-x0 and y1-y0 */
   if (mp_sub(&x1, &x0, &t1) != MP_OKAY) goto X1Y1;               /* t1 = x1 - x0 */
   if (mp_sub(&y1, &y0, &t2) != MP_OKAY) goto X1Y1;               /* t2 = y1 - y0 */
   neg = (t1.sign == t2.sign) ? MP_ZPOS : MP_NEG;
   if (mp_mul(&t1, &t2, &t1) != MP_OKAY) goto X1Y1;               /* t1 = (x1 - x0) * (y1 - y0) */
   t1.sign = neg;
   /* add x0y0 */
   if (mp_add(&x0y0, &x1y1, &t2) != MP_OKAY) goto X1Y1;           /* t2 = x0y0 + x1y1 */
@ -1538,7 +1732,15 @@ int mp_sqr(mp_int *a, mp_int *b)
 }
-/* integer signed division. c*b + d == a [e.g. a/b, c=quotient, d=remainder] */
+/* integer signed division. c*b + d == a [e.g. a/b, c=quotient, d=remainder] 
 * HAC pp.598 Algorithm 14.20
 *
 * Note that the description in HAC is horribly incomplete.  For example, 
 * it doesn't consider the case where digits are removed from 'x' in the inner
 * loop.  It also doesn't consider the case that y has fewer than three digits, etc..
 *
 * The overall algorithm is as described as 14.20 from HAC but fixed to treat these cases.
 */
 int mp_div(mp_int *a, mp_int *b, mp_int *c, mp_int *d)
 {
   mp_int q, x, y, t1, t2;
@ -1596,7 +1798,7 @@ int mp_div(mp_int *a, mp_int *b, mp_int *c, mp_int *d)
   neg = (a->sign == b->sign) ? MP_ZPOS : MP_NEG;
   x.sign = y.sign = MP_ZPOS;
-   /* normalize */
+   /* normalize both x and y, ensure that y >= b/2, [b == 2^DIGIT_BIT] */
   norm = 0;
   while ((y.dp[y.used-1] & (((mp_digit)1)<<(DIGIT_BIT-1))) == ((mp_digit)0)) {
      ++norm;
@ -1927,7 +2129,7 @@ int mp_expt_d(mp_int *a, mp_digit b, mp_int *c)
          return res;
       }
-       if (b & (mp_digit)(1<<(DIGIT_BIT-1))) {
+       if ((b & (mp_digit)(1<<(DIGIT_BIT-1))) != 0) {
          if ((res = mp_mul(c, &g, c)) != MP_OKAY) {
             mp_clear(&g);
             DECFUNC();
@ -2106,8 +2308,12 @@ int mp_gcd(mp_int *a, mp_int *b, mp_int *c)
   k = 0;
   while ((u.dp[0] & 1) == 0 && (v.dp[0] & 1) == 0) {
       ++k;
-       mp_div_2d(&u, 1, &u, NULL);
+       if ((res = mp_div_2d(&u, 1, &u, NULL)) != MP_OKAY) {
-       mp_div_2d(&v, 1, &v, NULL);
+          goto __T;
       }
       if ((res = mp_div_2d(&v, 1, &v, NULL)) != MP_OKAY) {
          goto __T;
       }
   }
   /* B2.  Initialize */
@ -2125,7 +2331,9 @@ int mp_gcd(mp_int *a, mp_int *b, mp_int *c)
   do {
      /* B3 (and B4).  Halve t, if even */
      while (t.used != 0 && (t.dp[0] & 1) == 0) {
-          mp_div_2d(&t, 1, &t, NULL);
+          if ((res = mp_div_2d(&t, 1, &t, NULL)) != MP_OKAY) {
             goto __T;
          }
      }
      /* B5.  if t>0 then u=t otherwise v=-t */
@ -2563,7 +2771,9 @@ int mp_reduce_setup(mp_int *a, mp_int *b)
   return res;   
 }
-/* reduces x mod m, assumes 0 < x < m^2, mu is precomputed via mp_reduce_setup */
+/* reduces x mod m, assumes 0 < x < m^2, mu is precomputed via mp_reduce_setup 
 * From HAC pp.604 Algorithm 14.42 
 */
 int mp_reduce(mp_int *x, mp_int *m, mp_int *mu)
 {
  mp_int   q;
@ -2595,10 +2805,14 @@ int mp_reduce(mp_int *x, mp_int *m, mp_int *mu)
  mp_rshd(&q, um + 1);       /* q3 = q2 / b^(k+1) */
  /* x = x mod b^(k+1), quick (no division) */
-  mp_mod_2d(x, DIGIT_BIT * (um + 1), x);
+  if ((res = mp_mod_2d(x, DIGIT_BIT * (um + 1), x)) != MP_OKAY) {
     goto CLEANUP;
  }
  /* q = q * m mod b^(k+1), quick (no division) */
-  s_mp_mul_digs(&q, m, &q, um + 1);
+  if ((res = s_mp_mul_digs(&q, m, &q, um + 1)) != MP_OKAY) {
     goto CLEANUP;
  }
  /* x = x - q */
  if((res = mp_sub(x, &q, x)) != MP_OKAY)
@ -2626,6 +2840,11 @@ int mp_reduce(mp_int *x, mp_int *m, mp_int *mu)
  return res;
 }
 /* computes Y == G^X mod P, HAC pp.616, Algorithm 14.85 
 *
 * Uses a left-to-right k-ary sliding window to compute the modular exponentiation.
 * The value of k changes based on the size of the exponent.
 */
 int mp_exptmod(mp_int *G, mp_int *X, mp_int *P, mp_int *Y)
 {
   mp_int M[64], res, mu;
@ -2648,7 +2867,7 @@ int mp_exptmod(mp_int *G, mp_int *X, mp_int *P, mp_int *Y)
   /* init G array */
   for (x = 0; x < (1<<winsize); x++) {
-      if ((err = mp_init(&M[x])) != MP_OKAY) {
+      if ((err = mp_init_size(&M[x], 1)) != MP_OKAY) {
         for (y = 0; y < x; y++) {
            mp_clear(&M[y]);
         }
@ -2726,7 +2945,7 @@ int mp_exptmod(mp_int *G, mp_int *X, mp_int *P, mp_int *Y)
            break;
         }
         buf = X->dp[digidx--];
-         bitcnt = DIGIT_BIT;
+         bitcnt = (int)DIGIT_BIT;
      }
      /* grab the next msb from the exponent */
@ -2793,7 +3012,7 @@ int mp_exptmod(mp_int *G, mp_int *X, mp_int *P, mp_int *Y)
         }
         bitbuf <<= 1;
-         if (bitbuf & (1<<winsize)) {
+         if ((bitbuf & (1<<winsize)) != 0) {
            /* then multiply */
            if ((err = mp_mul(&res, &M[1], &res)) != MP_OKAY) {
               goto __MU;
@ -3044,7 +3263,7 @@ int mp_read_radix(mp_int *a, char *str, int radix)
      return MP_VAL;
   }
-   if (*str == (unsigned char)'-') {
+   if (*str == '-') {
      ++str;
      neg = MP_NEG;
   } else {
@ -3053,7 +3272,7 @@ int mp_read_radix(mp_int *a, char *str, int radix)
   mp_zero(a);
   while (*str) {
-      ch = (radix < 36) ? toupper(*str) : *str;
+      ch = (char)((radix < 36) ? toupper(*str) : *str);
      for (y = 0; y < 64; y++) {
          if (ch == s_rmap[y]) {
             break;
--- a/bn.h
+++ b/bn.h
@ -50,7 +50,7 @@
   typedef unsigned long      mp_digit;
   typedef ulong64            mp_word;
-   #define DIGIT_BIT          28U
+   #define DIGIT_BIT          28
 #endif  
 #ifndef DIGIT_BIT
@ -77,13 +77,10 @@
 typedef int           mp_err;
 /* you'll have to tune these... */
 #ifdef FAST_FPU
   #define KARATSUBA_MUL_CUTOFF    100     /* Min. number of digits before Karatsuba multiplication is used. */
   #define KARATSUBA_SQR_CUTOFF    100     /* Min. number of digits before Karatsuba squaring is used. */
 #else
 #define KARATSUBA_MUL_CUTOFF    80      /* Min. number of digits before Karatsuba multiplication is used. */
 #define KARATSUBA_SQR_CUTOFF    80      /* Min. number of digits before Karatsuba squaring is used. */
-#endif
+
 #define MP_PREC                 64      /* default digits of precision */
 typedef struct  {
    int used, alloc, sign;
--- a/bn.pdf
+++ b/bn.pdf
--- a/bn.tex
+++ b/bn.tex
@ -1,7 +1,7 @@
 \documentclass{article}
 \begin{document}
-\title{LibTomMath v0.07 \\ A Free Multiple Precision Integer Library}
+\title{LibTomMath v0.08 \\ A Free Multiple Precision Integer Library}
 \author{Tom St Denis \\ tomstdenis@iahu.ca}
 \maketitle
 \newpage
@ -484,23 +484,23 @@ Multiply & 128 & 1,426   & 451     \\
 Multiply & 256 & 2,551   & 958     \\
 Multiply & 512 & 7,913   & 2,476     \\
 Multiply & 1024 & 28,496   & 7,927   \\
-Multiply & 2048 & 109,897   & 282,24     \\
+Multiply & 2048 & 109,897   & 28,224     \\
-Multiply & 4096 & 469,970   & 104,681     \\
+Multiply & 4096 & 469,970   & 101,171     \\
 \hline 
 Square & 128 & 1,319   & 511     \\
 Square & 256 & 1,776   & 947     \\
 Square & 512 & 5,399  & 2,153    \\
 Square & 1024 & 18,991  & 5,733     \\
 Square & 2048 & 72,126  & 17,621    \\
-Square & 4096 & 306,269  & 70,168  \\
+Square & 4096 & 306,269  & 67,576   \\
 \hline 
-Exptmod & 512 & 32,021,586  & 4,472,406   \\
+Exptmod & 512 & 32,021,586  & 4,138,354 \\
-Exptmod & 768 & 97,595,492  & 10,427,845    \\
+Exptmod & 768 & 97,595,492  & 9,840,233 \\
-Exptmod & 1024 & 223,302,532  & 20,561,722    \\
+Exptmod & 1024 & 223,302,532  & 20,624,553     \\
-Exptmod & 2048 & 1,682,223,369   & 113,978,803     \\
+Exptmod & 2048 & 1,682,223,369   & 114,936,361      \\
-Exptmod & 2560 & 3,268,615,571   & 236,650,133      \\
+Exptmod & 2560 & 3,268,615,571   & 229,402,426       \\
-Exptmod & 3072 & 5,597,240,141   & 373,449,291     \\
+Exptmod & 3072 & 5,597,240,141   & 367,403,840      \\
-Exptmod & 4096 & 13,347,270,891   & 787,568,457     
+Exptmod & 4096 & 13,347,270,891   & 779,058,433      
 \end{tabular}
 \end{center}
--- a/changes.txt
+++ b/changes.txt
@ -1,3 +1,11 @@
 Jan 2nd, 2003
 v0.08  -- Sped up the multipliers by moving the inner loop variables into a smaller scope
       -- Corrected a bunch of small "warnings"
       -- Added more comments
       -- Made "mtest" be able to use /dev/random, /dev/urandom or stdin for RNG data
       -- Corrected some bugs where error messages were potentially ignored
       -- add etc/pprime.c program which makes numbers which are provably prime.
 Jan 1st, 2003
 v0.07  -- Removed alot of heap operations from core functions to speed them up
       -- Added a root finding function [and mp_sqrt macro like from MPI]
--- a/demo.c
+++ b/demo.c
@ -23,7 +23,6 @@ extern ulong64 rdtsc(void);
 extern void reset(void);
 #else 
 ulong64 _tt;
 void reset(void) { _tt = clock(); }
 ulong64 rdtsc(void) { return clock() - _tt; }
@ -33,6 +32,8 @@ ulong64 rdtsc(void) { return clock() - _tt; }
 int _ifuncs;
 #else
 extern int _ifuncs;
 extern void dump_timings(void);
 extern void reset_timings(void);
 #endif
 void ndraw(mp_int *a, char *name)
@ -103,10 +104,24 @@ int main(void)
   mp_read_radix(&b, "4982748972349724892742", 10);
   mp_sub_d(&a, 1, &c);
 #ifdef DEBUG
   mp_sqr(&a, &a);mp_sqr(&c, &c);
   mp_sqr(&a, &a);mp_sqr(&c, &c);
   mp_sqr(&a, &a);mp_sqr(&c, &c);
   reset_timings();
 #endif   
   mp_exptmod(&b, &c, &a, &d);
 #ifdef DEBUG   
   dump_timings();
   return 0;
 #endif   
   mp_toradix(&d, buf, 10);
   printf("b^p-1 == %s\n", buf);
 #ifdef TIMER      
      mp_read_radix(&a, "340282366920938463463374607431768211455", 10);
      mp_read_radix(&b, "340282366920938463463574607431768211455", 10);
@ -196,6 +211,7 @@ int main(void)
   }
   }   
   mp_read_radix(&a, "340282366920938463463374607431768211455", 10);
   mp_read_radix(&b, "234892374891378913789237289378973232333", 10);
   while (a.used * DIGIT_BIT < 8192) {
--- a/etc/makefile
+++ b/etc/makefile
@ -0,0 +1 @@
 CFLAGS += -I../ -Wall -W -O3 -fomit-frame-pointer -funroll-loops ../bn.c 
--- a/etc/pprime.c
+++ b/etc/pprime.c
@ -0,0 +1,281 @@
 /* Generates provable primes
 *
 * See http://iahu.ca:8080/papers/pp.pdf for more info.
 *
 * Tom St Denis, tomstdenis@iahu.ca, http://tom.iahu.ca
 */
 #include <time.h>
 #include "bn.h"
 /* fast square root */
 static mp_digit i_sqrt(mp_word x)
 {
   mp_word x1, x2;
   x2 = x;
   do {
      x1 = x2;
      x2 = x1 - ((x1 * x1) - x)/(2*x1);
   } while (x1 != x2);
   if (x1*x1 > x) {
      --x1;
   }
   return x1;
 }   
 /* generates a prime digit */
 static mp_digit prime_digit()
 {
   mp_digit r, x, y, next;
   /* make a DIGIT_BIT-bit random number */
   for (r = x = 0; x < DIGIT_BIT; x++) {
       r = (r << 1) | (rand() & 1);
   }
   /* now force it odd */
   r |= 1;
   /* force it to be >30 */
   if (r < 30) {
      r += 30;
   }
   /* get square root, since if 'r' is composite its factors must be < than this */
   y = i_sqrt(r);
   next = (y+1)*(y+1);
   do {
      r += 2;   /* next candidate */
      /* update sqrt ? */
      if (next <= r) {
         ++y;
         next = (y+1)*(y+1);
      }
      /* loop if divisible by 3,5,7,11,13,17,19,23,29  */
      if ((r % 3) == 0) { x = 0; continue; }
      if ((r % 5) == 0) { x = 0; continue; }
      if ((r % 7) == 0) { x = 0; continue; }
      if ((r % 11) == 0) { x = 0; continue; }
      if ((r % 13) == 0) { x = 0; continue; }
      if ((r % 17) == 0) { x = 0; continue; }
      if ((r % 19) == 0) { x = 0; continue; }
      if ((r % 23) == 0) { x = 0; continue; }
      if ((r % 29) == 0) { x = 0; continue; }
      /* now check if r is divisible by x + k={1,7,11,13,17,19,23,29} */
      for (x = 30; x <= y; x += 30) {
          if ((r % (x+1)) == 0) { x = 0; break; }
          if ((r % (x+7)) == 0) { x = 0; break; }
          if ((r % (x+11)) == 0) { x = 0; break; }
          if ((r % (x+13)) == 0) { x = 0; break; }
          if ((r % (x+17)) == 0) { x = 0; break; }
          if ((r % (x+19)) == 0) { x = 0; break; }
          if ((r % (x+23)) == 0) { x = 0; break; }
          if ((r % (x+29)) == 0) { x = 0; break; }
      }
   } while (x == 0);
   return r;
 }
 /* makes a prime of at least k bits */
 int pprime(int k, mp_int *p, mp_int *q)
 {
   mp_int a, b, c, n, x, y, z, v;
   int res;
   /* single digit ? */
   if (k <= (int)DIGIT_BIT) {
      mp_set(p, prime_digit());
      return MP_OKAY;
   }
   if ((res = mp_init(&c)) != MP_OKAY) {
      return res;
   }
   if ((res = mp_init(&v)) != MP_OKAY) {
      goto __C;
   }
   /* product of first 50 primes */
   if ((res = mp_read_radix(&v, "19078266889580195013601891820992757757219839668357012055907516904309700014933909014729740190", 10)) != MP_OKAY) {
      goto __V;
   }   
   if ((res = mp_init(&a)) != MP_OKAY) {
      goto __V;
   }
   /* set the prime */
   mp_set(&a, prime_digit());
   if ((res = mp_init(&b)) != MP_OKAY) {
      goto __A;
   }
   if ((res = mp_init(&n)) != MP_OKAY) {
      goto __B;
   }
   if ((res = mp_init(&x)) != MP_OKAY) {
      goto __N;
   }
   if ((res = mp_init(&y)) != MP_OKAY) {
      goto __X;
   }
   if ((res = mp_init(&z)) != MP_OKAY) {
      goto __Y;
   }
   /* now loop making the single digit */
   while (mp_count_bits(&a) < k) {
      printf("prime is %4d bits left\r", k - mp_count_bits(&a)); fflush(stdout);
   top: 
      mp_set(&b, prime_digit());
      /* now compute z = a * b * 2 */
      if ((res = mp_mul(&a, &b, &z)) != MP_OKAY) {                     /* z = a * b */
         goto __Z;
      }
      if ((res = mp_copy(&z, &c)) != MP_OKAY) {                        /* c = a * b */
         goto __Z;
      }
      if ((res = mp_mul_2(&z, &z)) != MP_OKAY) {                       /* z = 2 * a * b */
         goto __Z;
      }
      /* n = z + 1 */
      if ((res = mp_add_d(&z, 1, &n)) != MP_OKAY) {                    /* n = z + 1 */
         goto __Z;
      }
      /* check (n, v) == 1 */
      if ((res = mp_gcd(&n, &v, &y)) != MP_OKAY) {                     /* y = (n, v) */
         goto __Z;
      }
      if (mp_cmp_d(&y, 1) != MP_EQ) goto top;
      /* now try base x=2  */
      mp_set(&x, 2);
      /* compute x^a mod n */
      if ((res = mp_exptmod(&x, &a, &n, &y)) != MP_OKAY) {             /* y = x^a mod n */
         goto __Z;
      }
      /* if y == 1 loop */
      if (mp_cmp_d(&y, 1) == MP_EQ) goto top;
      /* now x^2a mod n */
      if ((res = mp_sqrmod(&y, &n, &y)) != MP_OKAY) {                  /* y = x^2a mod n */
         goto __Z;
      }
      if (mp_cmp_d(&y, 1) == MP_EQ) goto top;
      /* compute x^b mod n */
      if ((res = mp_exptmod(&x, &b, &n, &y)) != MP_OKAY) {             /* y = x^b mod n */
          goto __Z;
      }
      /* if y == 1 loop */
      if (mp_cmp_d(&y, 1) == MP_EQ) goto top;
      /* now x^2b mod n */
      if ((res = mp_sqrmod(&y, &n, &y)) != MP_OKAY) {                  /* y = x^2b mod n */
         goto __Z;
      }
      if (mp_cmp_d(&y, 1) == MP_EQ) goto top;
      /* compute x^c mod n == x^ab mod n */
      if ((res = mp_exptmod(&x, &c, &n, &y)) != MP_OKAY) {             /* y = x^ab mod n */
          goto __Z;
      }
      /* if y == 1 loop */
      if (mp_cmp_d(&y, 1) == MP_EQ) goto top;
      /* now compute (x^c mod n)^2 */
      if ((res = mp_sqrmod(&y, &n, &y)) != MP_OKAY) {                  /* y = x^2ab mod n */
         goto __Z;
      }
      /* y should be 1 */
      if (mp_cmp_d(&y, 1) != MP_EQ) goto top;
 /*
 {
   char buf[4096];
   mp_toradix(&n, buf, 10);
   printf("Certificate of primality for:\n%s\n\n", buf);
   mp_toradix(&a, buf, 10);
   printf("A == \n%s\n\n", buf);
   mp_toradix(&b, buf, 10);
   printf("B == \n%s\n", buf);
   printf("----------------------------------------------------------------\n");
 }   
 */
     /* a = n */
     mp_copy(&n, &a);
  }     
  mp_exch(&n, p);
  mp_exch(&b, q);
   res = MP_OKAY;
 __Z: mp_clear(&z); 
 __Y: mp_clear(&y); 
 __X: mp_clear(&x); 
 __N: mp_clear(&n); 
 __B: mp_clear(&b); 
 __A: mp_clear(&a); 
 __V: mp_clear(&v);
 __C: mp_clear(&c);
   return res;
 }   
 int main(void)
 {
   mp_int p, q;
   char buf[4096];
   int k;
   clock_t t1;
   srand(time(NULL));
   printf("Enter # of bits: \n");
   scanf("%d", &k);
   mp_init(&p);
   mp_init(&q);
   t1 = clock();   
   pprime(k, &p, &q);
   t1 = clock() - t1;
   printf("\n\nTook %ld ticks, %d bits\n", t1, mp_count_bits(&p));
   mp_toradix(&p, buf, 10);
   printf("P == %s\n", buf);
   mp_toradix(&q, buf, 10);
   printf("Q == %s\n", buf);
   return 0;
 }
--- a/11
+++ b/11
@ -1,7 +1,7 @@
 CC = gcc
-CFLAGS  += -DDEBUG -Wall -W -O3 -fomit-frame-pointer -funroll-loops 
+CFLAGS  += -Wall -W -O3 -fomit-frame-pointer -funroll-loops 
-VERSION=0.07
+VERSION=0.08
 default: test
@ -9,6 +9,11 @@ test: bn.o demo.o
 	$(CC) bn.o demo.o -o demo
 	cd mtest ; gcc -O3 -fomit-frame-pointer -funroll-loops mtest.c -o mtest.exe -s
 # builds the x86 demo
 test86:
 	nasm -f coff timer.asm
 	$(CC) -DDEBUG -DTIMER_X86 $(CFLAGS) bn.c demo.c timer.o -o demo -s
 docdvi: bn.tex
 	latex bn
@ -17,7 +22,7 @@ docs:	docdvi
 	rm -f bn.log bn.aux bn.dvi
 clean:
-	rm -f *.o *.exe mtest/*.exe bn.log bn.aux bn.dvi *.s 
+	rm -f *.pdf *.o *.exe mtest/*.exe etc/*.exe bn.log bn.aux bn.dvi *.s 
 zipup: clean docs
 	chdir .. ; rm -rf ltm* libtommath-$(VERSION) ; mkdir libtommath-$(VERSION) ; \
--- a/mtest/mtest.c
+++ b/mtest/mtest.c
@ -41,7 +41,7 @@ void rand_num(mp_int *a)
   unsigned char buf[512];
 top:
-   size = 1 + ((fgetc(rng)*fgetc(rng)) % 512);
+   size = 1 + ((fgetc(rng)*fgetc(rng)) % 32);
   buf[0] = (fgetc(rng)&1)?1:0;
   fread(buf+1, 1, size, rng);
   for (n = 0; n < size; n++) {
@ -57,7 +57,7 @@ void rand_num2(mp_int *a)
   unsigned char buf[512];
 top:
-   size = 1 + ((fgetc(rng)*fgetc(rng)) % 512);
+   size = 1 + ((fgetc(rng)*fgetc(rng)) % 32);
   buf[0] = (fgetc(rng)&1)?1:0;
   fread(buf+1, 1, size, rng);
   for (n = 0; n < size; n++) {
@ -80,6 +80,13 @@ int main(void)
   mp_init(&e);
   rng = fopen("/dev/urandom", "rb");
   if (rng == NULL) {
      rng = fopen("/dev/random", "rb");
      if (rng == NULL) {
         fprintf(stderr, "\nWarning:  stdin used as random source\n\n");
         rng = stdin;
      }
   }
   for (;;) {
       n = fgetc(rng) % 11;
--- a/timer.asm
+++ b/timer.asm
@ -9,6 +9,12 @@
 [section .data]
 timer dd 0, 0
 [section .text]
 [global _gettsc]
 _gettsc:
   rdtsc
   ret
 [global _rdtsc]
 _rdtsc:
   rdtsc
		`@ -0,0 +1 @@`
							`CFLAGS += -I../ -Wall -W -O3 -fomit-frame-pointer -funroll-loops ../bn.c`