added libtommath-0.10

2003-02-28 16:07:32 +00:00 · 2003-02-28 16:07:32 +00:00 · fb93a30a25
commit fb93a30a25
parent 40c00add00
10 changed files with 954 additions and 98 deletions
--- a/bn.c
+++ b/bn.c
@ -796,7 +796,7 @@ int mp_mul_2(mp_int *a, mp_int *b)
      DECFUNC();
      return res;
   }
-   b->used = b->alloc;
+   ++b->used;
   /* shift any bit count < DIGIT_BIT */
   r = 0;
@ -1017,7 +1017,6 @@ static int fast_s_mp_mul_digs(mp_int *a, mp_int *b, mp_int *c, int digs)
           */
          _W   = W + ix;
          /* the number of digits is limited by their placement.  E.g. 
             we avoid multiplying digits that will end up above the # of
             digits of precision requested
@ -2840,11 +2839,404 @@ int mp_reduce(mp_int *x, mp_int *m, mp_int *mu)
  return res;
 }
 /* setups the montgomery reduction stuff */
 int mp_montgomery_setup(mp_int *a, mp_digit *mp)
 {
   mp_int t, tt;
   int res;
   if ((res = mp_init(&t)) != MP_OKAY) {
      return res;
   }
   if ((res = mp_init(&tt)) != MP_OKAY) {
      goto __T;
   }
   /* tt = b */
   tt.dp[0] = 0;
   tt.dp[1] = 1;
   tt.used  = 2;
   /* t = m mod b */
   t.dp[0] = a->dp[0];
   t.used  = 1;
   /* t = 1/m mod b */
   if ((res = mp_invmod(&t, &tt, &t)) != MP_OKAY) {
      goto __TT;
   }
   /* t = -1/m mod b */
   *mp = ((mp_digit)1 << ((mp_digit)DIGIT_BIT)) - t.dp[0];
   res = MP_OKAY;
 __TT: mp_clear(&tt);   
 __T:  mp_clear(&t);
   return res;
 }   
 /* computes xR^-1 == x (mod N) via Montgomery Reduction (comba) */
 static int fast_mp_montgomery_reduce(mp_int *a, mp_int *m, mp_digit mp)
 {
   int ix, res, olduse;
   mp_digit ui;
   mp_word  W[512];
   /* get old used count */
   olduse = a->used;
   /* grow a as required */
   if (a->alloc < m->used*2+1) {
      if ((res = mp_grow(a, m->used*2+1)) != MP_OKAY) {
         return res;
      }
   }
   /* copy and clear */
   for (ix = 0; ix < a->used; ix++) {
       W[ix] = a->dp[ix];
   }
   for (; ix < m->used * 2 + 1; ix++) {
       W[ix] = 0;
   }
   for (ix = 0; ix < m->used; ix++) {
       /* ui = ai * m' mod b */
       ui = (((mp_digit)(W[ix] & MP_MASK)) * mp) & MP_MASK;
       /* a = a + ui * m * b^i */
       { 
          register int      iy;
          register mp_digit *tmpx;
          register mp_word  *_W;
          /* aliases */
          tmpx = m->dp;
          _W   = W + ix;
          for (iy = 0; iy < m->used; iy++) {
              *_W++        += ((mp_word)ui) * ((mp_word)*tmpx++);
          }
          /* now fix carry for W[ix+1] */
          W[ix+1] += W[ix] >> ((mp_word)DIGIT_BIT);
          W[ix]   &= ((mp_word)MP_MASK);
       }
   }
   /* nox fix rest of carries */
   for (; ix <= m->used * 2 + 1; ix++) {
       W[ix]   += (W[ix-1] >> ((mp_word)DIGIT_BIT));
       W[ix-1] &= ((mp_word)MP_MASK);
   }
   /* copy out */
   /* A = A/b^n */
   for (ix = 0; ix < m->used + 1; ix++) { 
       a->dp[ix] = W[ix+m->used];
   }
   a->used = m->used + 1;
   /* zero oldused digits */  
   for (; ix < olduse; ix++) {
       a->dp[ix] = 0;
   }
   mp_clamp(a);
   /* if A >= m then A = A - m */
   if (mp_cmp_mag(a, m) != MP_LT) {
      if ((res = s_mp_sub(a, m, a)) != MP_OKAY) {
         return res;
      }
   }
   return MP_OKAY;
 }
 /* computes xR^-1 == x (mod N) via Montgomery Reduction */
 int mp_montgomery_reduce(mp_int *a, mp_int *m, mp_digit mp)
 {
   int ix, res, digs;
   mp_digit ui;
   REGFUNC("mp_montgomery_reduce");
   VERIFY(a);
   VERIFY(m);
   digs = m->used * 2 + 1;
   if ((digs < 512) && digs < (1<<( (CHAR_BIT*sizeof(mp_word)) - (2*DIGIT_BIT)))) {
      res = fast_mp_montgomery_reduce(a, m, mp);
      DECFUNC();
      return res;
   }  
   if (a->alloc < m->used*2+1) {
      if ((res = mp_grow(a, m->used*2+1)) != MP_OKAY) {
         DECFUNC();
         return res;
      }
   }
   a->used = m->used * 2 + 1;
   for (ix = 0; ix < m->used; ix++) {
       /* ui = ai * m' mod b */
       ui = (a->dp[ix] * mp) & MP_MASK;
       /* a = a + ui * m * b^i */
       { 
          register int      iy;
          register mp_digit *tmpx, *tmpy, mu;
          register mp_word  r;
          /* aliases */
          tmpx = m->dp;
          tmpy = a->dp + ix;
          mu  = 0;
          for (iy = 0; iy < m->used; iy++) {
              r             = ((mp_word)ui) * ((mp_word)*tmpx++) + ((mp_word)mu) + ((mp_word)*tmpy);
              mu            = (r >> ((mp_word)DIGIT_BIT));
              *tmpy++       = (r & ((mp_word)MP_MASK));
          }
          /* propagate carries */
          while (mu) {
             *tmpy            += mu;
             mu                = (*tmpy>>DIGIT_BIT)&1;
             *tmpy++          &= MP_MASK;
          }
       }
   }
   /* A = A/b^n */
   mp_rshd(a, m->used);
   /* if A >= m then A = A - m */
   if (mp_cmp_mag(a, m) != MP_LT) {
      if ((res = s_mp_sub(a, m, a)) != MP_OKAY) {
         DECFUNC();
         return res;
      }
   }
   DECFUNC();
   return MP_OKAY;
 }
 /* computes Y == G^X mod P, HAC pp.616, Algorithm 14.85 
 *
 * Uses a left-to-right k-ary sliding window to compute the modular exponentiation.
 * The value of k changes based on the size of the exponent.
 *
 * Uses Montgomery reduction 
 */
 static int mp_exptmod_fast(mp_int *G, mp_int *X, mp_int *P, mp_int *Y)
 {
   mp_int M[64], res;
   mp_digit buf, mp;
   int err, bitbuf, bitcpy, bitcnt, mode, digidx, x, y, winsize;
   REGFUNC("mp_exptmod_fast");
   VERIFY(G);
   VERIFY(X);
   VERIFY(P);
   VERIFY(Y);
   /* find window size */
   x = mp_count_bits(X);
        if (x <= 18)    { winsize = 2; }
   else if (x <= 84)    { winsize = 3; }
   else if (x <= 300)   { winsize = 4; }
   else if (x <= 930)   { winsize = 5; }
   else                 { winsize = 6; }
   /* init G array */
   for (x = 0; x < (1<<winsize); x++) {
      if ((err = mp_init_size(&M[x], 1)) != MP_OKAY) {
         for (y = 0; y < x; y++) {
            mp_clear(&M[y]);
         }
         DECFUNC();
         return err;
      }
   }
   /* now setup montgomery  */
   if ((err = mp_montgomery_setup(P, &mp)) != MP_OKAY) {
      goto __M;
   }
   /* setup result */
   if ((err = mp_init(&res)) != MP_OKAY) {
      goto __M;
   }
   /* now we need R mod m */
   mp_set(&res, 1);           
   if ((err = mp_lshd(&res, P->used)) != MP_OKAY) {
      goto __RES;
   }
   /* res = R mod m */
   if ((err = mp_mod(&res, P, &res)) != MP_OKAY) {
      goto __RES;
   }   
   /* create M table 
    *
    * The M table contains powers of the input base, e.g. M[x] = G^x mod P
    *
    * The first half of the table is not computed though accept for M[0] and M[1]
    */
   mp_set(&M[0], 1);
   if ((err = mp_mod(G, P, &M[1])) != MP_OKAY) {
      goto __RES;
   }
   /* now set M[1] to G * R mod m */
   if ((err = mp_mulmod(&M[1], &res, P, &M[1])) != MP_OKAY) {
      goto __RES;
   }
   /* compute the value at M[1<<(winsize-1)] by squaring M[1] (winsize-1) times */
   if ((err = mp_copy(&M[1], &M[1<<(winsize-1)])) != MP_OKAY) {
      goto __RES;
   }
   for (x = 0; x < (winsize-1); x++) {
       if ((err = mp_sqr(&M[1<<(winsize-1)], &M[1<<(winsize-1)])) != MP_OKAY) {
          goto __RES;
       }
       if ((err = mp_montgomery_reduce(&M[1<<(winsize-1)], P, mp)) != MP_OKAY) {
          goto __RES;
       }
   }  
   /* create upper table */
   for (x = (1<<(winsize-1))+1; x < (1 << winsize); x++) {
       if ((err = mp_mul(&M[x-1], &M[1], &M[x])) != MP_OKAY) {
          goto __RES;
       }
       if ((err = mp_montgomery_reduce(&M[x], P, mp)) != MP_OKAY) {
          goto __RES;
       }
   }
   /* set initial mode and bit cnt */
   mode   = 0;
   bitcnt = 0;
   buf    = 0;
   digidx = X->used - 1;
   bitcpy = bitbuf = 0;
   bitcnt = 1;
   for (;;) {
      /* grab next digit as required */
      if (--bitcnt == 0) {
         if (digidx == -1) {
            break;
         }
         buf = X->dp[digidx--];
         bitcnt = (int)DIGIT_BIT;
      }
      /* grab the next msb from the exponent */
      y = (buf >> (DIGIT_BIT - 1)) & 1;
      buf <<= 1;
      /* if the bit is zero and mode == 0 then we ignore it 
       * These represent the leading zero bits before the first 1 bit
       * in the exponent.  Technically this opt is not required but it 
       * does lower the # of trivial squaring/reductions used
       */
      if (mode == 0 && y == 0) continue;
      /* if the bit is zero and mode == 1 then we square */
      if (y == 0 && mode == 1) {
         if ((err = mp_sqr(&res, &res)) != MP_OKAY) {
            goto __RES;
         }
         if ((err = mp_montgomery_reduce(&res, P, mp)) != MP_OKAY) {
            goto __RES;
         }
         continue;
      }
      /* else we add it to the window */
      bitbuf  |= (y<<(winsize-++bitcpy));
      mode     = 2;
      if (bitcpy == winsize) {
         /* ok window is filled so square as required and multiply multiply */
         /* square first */
         for (x = 0; x < winsize; x++) {
            if ((err = mp_sqr(&res, &res)) != MP_OKAY) {
               goto __RES;
            }
            if ((err = mp_montgomery_reduce(&res, P, mp)) != MP_OKAY) {
               goto __RES;
            }
         }
         /* then multiply */
         if ((err = mp_mul(&res, &M[bitbuf], &res)) != MP_OKAY) {
            goto __RES;
         }
         if ((err = mp_montgomery_reduce(&res, P, mp)) != MP_OKAY) {
            goto __RES;
         }
         /* empty window and reset */
         bitcpy = bitbuf = 0;
         mode   = 1;
      }
   }
   /* if bits remain then square/multiply */
   if (mode == 2 && bitcpy > 0) {
      /* square then multiply if the bit is set */
      for (x = 0; x < bitcpy; x++) {
         if ((err = mp_sqr(&res, &res)) != MP_OKAY) {
            goto __RES;
         }
         if ((err = mp_montgomery_reduce(&res, P, mp)) != MP_OKAY) {
            goto __RES;
         }
         bitbuf <<= 1;
         if ((bitbuf & (1<<winsize)) != 0) {
            /* then multiply */
            if ((err = mp_mul(&res, &M[1], &res)) != MP_OKAY) {
               goto __RES;
            }
            if ((err = mp_montgomery_reduce(&res, P, mp)) != MP_OKAY) {
               goto __RES;
            }
         }
      }
   }
   /* fixup result */
   if ((err = mp_montgomery_reduce(&res, P, mp)) != MP_OKAY) {
      goto __RES;
   } 
   mp_exch(&res, Y);
   err = MP_OKAY;
 __RES: mp_clear(&res);
 __M  :
   for (x = 0; x < (1<<winsize); x++) {
      mp_clear(&M[x]);
   }
   DECFUNC();
   return err;
 }
 int mp_exptmod(mp_int *G, mp_int *X, mp_int *P, mp_int *Y)
 {
   mp_int M[64], res, mu;
@ -2857,6 +3249,13 @@ int mp_exptmod(mp_int *G, mp_int *X, mp_int *P, mp_int *Y)
   VERIFY(P);
   VERIFY(Y);
   /* if the modulus is odd use the fast method */
   if (mp_isodd(P) == 1 && P->used > 4 && P->used < MONTGOMERY_EXPT_CUTOFF) {
      err = mp_exptmod_fast(G, X, P, Y);
      DECFUNC();
      return err;
   }   
   /* find window size */
   x = mp_count_bits(X);
        if (x <= 18)    { winsize = 2; }
@ -2918,7 +3317,8 @@ int mp_exptmod(mp_int *G, mp_int *X, mp_int *P, mp_int *Y)
          goto __MU;
       }
   }
-   /* init result */
+   
   /* setup result */
   if ((err = mp_init(&res)) != MP_OKAY) {
      goto __MU;
   }
@ -3009,10 +3409,10 @@ int mp_exptmod(mp_int *G, mp_int *X, mp_int *P, mp_int *Y)
         if ((bitbuf & (1<<winsize)) != 0) {
            /* then multiply */
            if ((err = mp_mul(&res, &M[1], &res)) != MP_OKAY) {
-               goto __MU;
+               goto __RES;
            }
            if ((err = mp_reduce(&res, P, &mu)) != MP_OKAY) {
-               goto __MU;
+               goto __RES;
            }
         }
      }
@ -3060,10 +3460,8 @@ int mp_n_root(mp_int *a, mp_digit b, mp_int *c)
   neg     = a->sign;
   a->sign = MP_ZPOS;
-   /* t2 = a */
+   /* t2 = 2 */
-   if ((res = mp_copy(a, &t2)) != MP_OKAY) {
+   mp_set(&t2, 2);
      goto __T3;
   }
   do {
      /* t1 = t2 */
@ -3098,15 +3496,19 @@ int mp_n_root(mp_int *a, mp_digit b, mp_int *c)
      }
   } while (mp_cmp(&t1, &t2) != MP_EQ);
-   /* result can be at most off by one so check */
+   /* result can be off by a few so check */
-   if ((res = mp_expt_d(&t1, b, &t2)) != MP_OKAY) {
+   for (;;) {
-      goto __T3;
+      if ((res = mp_expt_d(&t1, b, &t2)) != MP_OKAY) {
   }
   if (mp_cmp(&t2, a) == MP_GT) {
      if ((res = mp_sub_d(&t1, 1, &t1)) != MP_OKAY) {
         goto __T3;
      }
      if (mp_cmp(&t2, a) == MP_GT) {
         if ((res = mp_sub_d(&t1, 1, &t1)) != MP_OKAY) {
            goto __T3;
         }
      } else {
         break;
      }
   }      
   /* reset the sign of a first */
@ -3336,13 +3738,14 @@ int mp_to_signed_bin(mp_int *a, unsigned char *b)
 /* get the size for an unsigned equivalent */
 int mp_unsigned_bin_size(mp_int *a)
 {
-   return (mp_count_bits(a)/8 + ((mp_count_bits(a)&7) != 0 ? 1 : 0));
+   int size = mp_count_bits(a);
   return (size/8 + ((size&7) != 0 ? 1 : 0));
 }
 /* get the size for an signed equivalent */
 int mp_signed_bin_size(mp_int *a)
 {
-   return 1 + (mp_count_bits(a)/8 + ((mp_count_bits(a)&7) != 0 ? 1 : 0));
+   return 1 + mp_unsigned_bin_size(a);
 }
 /* read a string [ASCII] in a given radix */
@ -3431,7 +3834,10 @@ int mp_radix_size(mp_int *a, int radix)
   mp_int t;
   mp_digit d;
-   digs = 0;
+   /* special case for binary */
   if (radix == 2) {
      return mp_count_bits(a) + (a->sign == MP_NEG ? 1 : 0) + 1;
   }
   if (radix < 2 || radix > 64) {
      return 0;
@ -3441,6 +3847,7 @@ int mp_radix_size(mp_int *a, int radix)
      return 0;
   }
   digs = 0;
   if (t.sign == MP_NEG) { 
      ++digs;
      t.sign = MP_ZPOS;
--- a/bn.h
+++ b/bn.h
@ -84,6 +84,7 @@ typedef int           mp_err;
 /* you'll have to tune these... */
 #define KARATSUBA_MUL_CUTOFF    80      /* Min. number of digits before Karatsuba multiplication is used. */
 #define KARATSUBA_SQR_CUTOFF    80      /* Min. number of digits before Karatsuba squaring is used. */
 #define MONTGOMERY_EXPT_CUTOFF  40      /* max. number of digits that montgomery reductions will help for */
 #define MP_PREC                 64      /* default digits of precision */
@ -114,7 +115,7 @@ int mp_shrink(mp_int *a);
 #define mp_iszero(a) (((a)->used == 0) ? 1 : 0)
 #define mp_iseven(a) (((a)->used == 0 || (((a)->dp[0] & 1) == 0)) ? 1 : 0)
-#define mp_isodd(a)  (((a)->used > 0 || (((a)->dp[0] & 1) == 1)) ? 1 : 0)
+#define mp_isodd(a)  (((a)->used > 0 && (((a)->dp[0] & 1) == 1)) ? 1 : 0)
 /* set to zero */
 void mp_zero(mp_int *a);
@ -257,6 +258,12 @@ int mp_reduce_setup(mp_int *a, mp_int *b);
 */
 int mp_reduce(mp_int *a, mp_int *b, mp_int *c);
 /* setups the montgomery reduction */
 int mp_montgomery_setup(mp_int *a, mp_digit *mp);
 /* computes xR^-1 == x (mod N) via Montgomery Reduction */
 int mp_montgomery_reduce(mp_int *a, mp_int *m, mp_digit mp);
 /* d = a^b (mod c) */
 int mp_exptmod(mp_int *a, mp_int *b, mp_int *c, mp_int *d);
--- a/bn.pdf
+++ b/bn.pdf
--- a/bn.tex
+++ b/bn.tex
@ -1,7 +1,7 @@
 \documentclass{article}
 \begin{document}
-\title{LibTomMath v0.09 \\ A Free Multiple Precision Integer Library}
+\title{LibTomMath v0.10 \\ A Free Multiple Precision Integer Library}
 \author{Tom St Denis \\ tomstdenis@iahu.ca}
 \maketitle
 \newpage
@ -279,6 +279,22 @@ int mp_n_root(mp_int *a, mp_digit b, mp_int *c);
 /* computes the jacobi c = (a | n) (or Legendre if b is prime)  */
 int mp_jacobi(mp_int *a, mp_int *n, int *c);
 /* used to setup the Barrett reduction for a given modulus b */
 int mp_reduce_setup(mp_int *a, mp_int *b);
 /* Barrett Reduction, computes a (mod b) with a precomputed value c  
 *
 * Assumes that 0 < a <= b^2, note if 0 > a > -(b^2) then you can merely
 * compute the reduction as -1 * mp_reduce(mp_abs(a)) [pseudo code].
 */
 int mp_reduce(mp_int *a, mp_int *b, mp_int *c);
 /* setups the montgomery reduction */
 int mp_montgomery_setup(mp_int *a, mp_digit *mp);
 /* computes xR^-1 == x (mod N) via Montgomery Reduction */
 int mp_montgomery_reduce(mp_int *a, mp_int *m, mp_digit mp);
 /* d = a^b (mod c) */
 int mp_exptmod(mp_int *a, mp_int *b, mp_int *c, mp_int *d);
 \end{verbatim}
@ -456,16 +472,33 @@ it is not.
 \subsubsection{mp\_exptmod(mp\_int *a, mp\_int *b, mp\_int *c, mp\_int *d)}
 Computes $d = a^b \mbox{ (mod }c\mbox{)}$ using a sliding window $k$-ary exponentiation algorithm.  For an $\alpha$-bit
 exponent it performs $\alpha$ squarings and at most $\lfloor \alpha/k \rfloor + k$ multiplications.  The value of $k$ is
-chosen to minimize the number of multiplications required for a given value of $\alpha$.  Barrett reductions are used
+chosen to minimize the number of multiplications required for a given value of $\alpha$.  Barrett or Montgomery 
-to reduce the squared or multiplied temporary results modulo $c$.  A Barrett reduction requires one division that is
+reductions are used to reduce the squared or multiplied temporary results modulo $c$.
 performed only and two half multipliers of $N$ digit numbers resulting in approximation $O((N^2)/2)$ work.  
-Let $\gamma = \lfloor \alpha/k \rfloor + k$ represent the total multiplications.  The total work of a exponentiation is
+\subsection{Fast Modular Reductions}
-therefore, $O(3 \cdot N^2 + (\alpha + \gamma) \cdot ((N^2)/2) + \alpha \cdot ((N^2 + N)/2) + \gamma \cdot N^2)$ which 
+
-simplies to $O(3 \cdot N^2 + \gamma N^2 + \alpha (N^2 + (N/2)))$.  The bulk of the time is spent in the Barrett 
+\subsubsection{mp\_reduce(mp\_int *a, mp\_int *b, mp\_int *c)}
-reductions and the squaring algorithms.  Since $\gamma < \alpha$ it makes sense to optimize first the Barrett and
+Computes a Barrett reduction in-place of $a$ modulo $b$ with respect to $c$.  In essence it computes 
-squaring routines first.  Significant improvements in the future will most likely stem from optimizing these instead
+$a \equiv a \mbox{ (mod }b\mbox{)}$ provided $0 \le a \le b^2$.  The value of $c$ is precomputed with the 
-of optimizing the multipliers.
+function mp\_reduce\_setup().
 The Barrett reduction function has been optimized to use partial multipliers which means compared to MPI it performs
 have the number of single precision multipliers (\textit{provided they have the same size digits}).  The partial
 multipliers (\textit{one of which is shared with mp\_mul}) both have baseline and comba variants.  Barrett reduction 
 can reduce a number modulo a $n-$digit modulus with approximately $2n^2$ single precision multiplications.  
 \subsubsection{mp\_montgomery\_reduce(mp\_int *a, mp\_int *m, mp\_digit mp)}
 Computes a Montgomery reduction in-place of $a$ modulo $b$ with respect to $mp$.  If $b$ is some $n-$digit modulus then
 $R = \beta^{n+1}$.  The result of this function is $aR^{-1} \mbox{ (mod }b\mbox{)}$ provided that $0 \le a \le b^2$.
 The value of $mp$ is precomputed with the function mp\_montgomery\_setup().
 The Montgomery reduction comes in two variants.  A standard baseline and a fast comba method.  The baseline routine
 is in fact slower than the Barrett reductions, however, the comba routine is much faster.  Montomgery reduction can 
 reduce a number modulo a $n-$digit modulus with approximately $n^2 + n$ single precision multiplications.  
 Note that the final result of a Montgomery reduction is not just the value reduced modulo $b$.  You have to multiply
 by $R$ modulo $b$ to get the real result.  At first that may not seem like such a worthwhile routine, however, the
 exptmod function can be made to take advantage of this such that only one normalization at the end is required.  
 \section{Timing Analysis}
 \subsection{Observed Timings}
@ -503,9 +536,9 @@ Square & 1024 & 18,991  & 5,733     \\
 Square & 2048 & 72,126  & 17,621    \\
 Square & 4096 & 306,269  & 67,576   \\
 \hline 
-Exptmod & 512 & 32,021,586  & 4,138,354 \\
+Exptmod & 512 & 32,021,586  & 3,118,435 \\
-Exptmod & 768 & 97,595,492  & 9,840,233 \\
+Exptmod & 768 & 97,595,492  & 8,493,633 \\
-Exptmod & 1024 & 223,302,532  & 20,624,553     \\
+Exptmod & 1024 & 223,302,532  & 17,715,899     \\
 Exptmod & 2048 & 1,682,223,369   & 114,936,361      \\
 Exptmod & 2560 & 3,268,615,571   & 229,402,426       \\
 Exptmod & 3072 & 5,597,240,141   & 367,403,840      \\
--- a/changes.txt
+++ b/changes.txt
@ -1,3 +1,8 @@
 Jan 9th, 2003
 v0.10  -- Pekka Riikonen suggested fixes to the radix conversion code.  
       -- Added baseline montgomery and comba montgomery reductions, sped up exptmods
          [to a point, see bn.h for MONTGOMERY_EXPT_CUTOFF]
 Jan 6th, 2003
 v0.09  -- Updated the manual to reflect recent changes.  :-)
       -- Added Jacobi function (mp_jacobi) to supplement the number theory side of the lib
--- a/demo.c
+++ b/demo.c
@ -21,12 +21,15 @@
 #define TIMER
 extern ulong64 rdtsc(void);
 extern void reset(void);
-#else 
+#endif
 #ifdef TIMER
 #ifndef TIMER_X86
 ulong64 _tt;
 void reset(void) { _tt = clock(); }
 ulong64 rdtsc(void) { return clock() - _tt; }
 #endif
 #endif
 #ifndef DEBUG
 int _ifuncs;
@ -82,6 +85,7 @@ int main(void)
   mp_int a, b, c, d, e, f;
   unsigned long expt_n, add_n, sub_n, mul_n, div_n, sqr_n, mul2d_n, div2d_n, gcd_n, lcm_n, inv_n;
   int rr;
   mp_digit tom;
 #ifdef TIMER
   int n;
@ -95,6 +99,28 @@ int main(void)
   mp_init(&e);
   mp_init(&f);
   mp_read_radix(&a, "59994534535345535344389423", 10);
   mp_read_radix(&b, "49993453555234234565675534", 10);
   mp_read_radix(&c, "62398923474472948723847281", 10);
   mp_mulmod(&a, &b, &c, &f);
   /* setup mont */
   mp_montgomery_setup(&c, &tom);
   mp_mul(&a, &b, &a);
   mp_montgomery_reduce(&a, &c, tom);
   mp_montgomery_reduce(&a, &c, tom);
   mp_lshd(&a, c.used*2);
   mp_mod(&a, &c, &a);
   mp_toradix(&a, cmd, 10);
   printf("%s\n\n", cmd);
   mp_toradix(&f, cmd, 10);
   printf("%s\n", cmd);
 /*   return 0; */
   mp_read_radix(&a, "V//////////////////////////////////////////////////////////////////////////////////////", 64);
   mp_reduce_setup(&b, &a);
   printf("\n\n----\n\n");
@ -103,20 +129,7 @@ int main(void)
   mp_read_radix(&b, "4982748972349724892742", 10);
   mp_sub_d(&a, 1, &c);
 #ifdef DEBUG
   mp_sqr(&a, &a);mp_sqr(&c, &c);
   mp_sqr(&a, &a);mp_sqr(&c, &c);
   mp_sqr(&a, &a);mp_sqr(&c, &c);
   reset_timings();
 #endif   
   mp_exptmod(&b, &c, &a, &d);
 #ifdef DEBUG   
   dump_timings();
   return 0;
 #endif   
   mp_toradix(&d, buf, 10);
   printf("b^p-1 == %s\n", buf);
--- a/etc/pprime.c
+++ b/etc/pprime.c
@ -85,10 +85,11 @@ static mp_digit prime_digit()
 }
 /* makes a prime of at least k bits */
-int pprime(int k, mp_int *p, mp_int *q)
+int pprime(int k, int li, mp_int *p, mp_int *q)
 {
   mp_int a, b, c, n, x, y, z, v;
-   int res;
+   int res, ii;
   static const mp_digit bases[] = { 2, 3, 5, 7, 11, 13, 17, 19 };
   /* single digit ? */
   if (k <= (int)DIGIT_BIT) {
@ -167,55 +168,60 @@ int pprime(int k, mp_int *p, mp_int *q)
      if (mp_cmp_d(&y, 1) != MP_EQ) goto top;
-      /* now try base x=2  */
+      /* now try base x=bases[ii]  */
-      mp_set(&x, 2);
+      for (ii = 0; ii < li; ii++) {
         mp_set(&x, bases[ii]);
-      /* compute x^a mod n */
+         /* compute x^a mod n */
-      if ((res = mp_exptmod(&x, &a, &n, &y)) != MP_OKAY) {             /* y = x^a mod n */
+         if ((res = mp_exptmod(&x, &a, &n, &y)) != MP_OKAY) {             /* y = x^a mod n */
-         goto __Z;
+            goto __Z;
         }
         /* if y == 1 loop */
         if (mp_cmp_d(&y, 1) == MP_EQ) continue;
         /* now x^2a mod n */
         if ((res = mp_sqrmod(&y, &n, &y)) != MP_OKAY) {                  /* y = x^2a mod n */
           goto __Z;
         }
         if (mp_cmp_d(&y, 1) == MP_EQ) continue;
         /* compute x^b mod n */
         if ((res = mp_exptmod(&x, &b, &n, &y)) != MP_OKAY) {             /* y = x^b mod n */
            goto __Z;
         }
         /* if y == 1 loop */
         if (mp_cmp_d(&y, 1) == MP_EQ) continue;
         /* now x^2b mod n */
         if ((res = mp_sqrmod(&y, &n, &y)) != MP_OKAY) {                  /* y = x^2b mod n */
            goto __Z;
         }
         if (mp_cmp_d(&y, 1) == MP_EQ) continue;
         /* compute x^c mod n == x^ab mod n */
         if ((res = mp_exptmod(&x, &c, &n, &y)) != MP_OKAY) {             /* y = x^ab mod n */
             goto __Z;
         }
         /* if y == 1 loop */
         if (mp_cmp_d(&y, 1) == MP_EQ) continue;
         /* now compute (x^c mod n)^2 */
         if ((res = mp_sqrmod(&y, &n, &y)) != MP_OKAY) {                  /* y = x^2ab mod n */
            goto __Z;
         }
         /* y should be 1 */
         if (mp_cmp_d(&y, 1) != MP_EQ) continue;
         break;
      }
-      /* if y == 1 loop */
+      /* no bases worked? */
-      if (mp_cmp_d(&y, 1) == MP_EQ) goto top;
+      if (ii == li) goto top;
      /* now x^2a mod n */
      if ((res = mp_sqrmod(&y, &n, &y)) != MP_OKAY) {                  /* y = x^2a mod n */
         goto __Z;
      }
      if (mp_cmp_d(&y, 1) == MP_EQ) goto top;
      /* compute x^b mod n */
      if ((res = mp_exptmod(&x, &b, &n, &y)) != MP_OKAY) {             /* y = x^b mod n */
          goto __Z;
      }
      /* if y == 1 loop */
      if (mp_cmp_d(&y, 1) == MP_EQ) goto top;
      /* now x^2b mod n */
      if ((res = mp_sqrmod(&y, &n, &y)) != MP_OKAY) {                  /* y = x^2b mod n */
         goto __Z;
      }
      if (mp_cmp_d(&y, 1) == MP_EQ) goto top;
      /* compute x^c mod n == x^ab mod n */
      if ((res = mp_exptmod(&x, &c, &n, &y)) != MP_OKAY) {             /* y = x^ab mod n */
          goto __Z;
      }
      /* if y == 1 loop */
      if (mp_cmp_d(&y, 1) == MP_EQ) goto top;
      /* now compute (x^c mod n)^2 */
      if ((res = mp_sqrmod(&y, &n, &y)) != MP_OKAY) {                  /* y = x^2ab mod n */
         goto __Z;
      }
      /* y should be 1 */
      if (mp_cmp_d(&y, 1) != MP_EQ) goto top;
 /*
 {
@ -234,8 +240,12 @@ int pprime(int k, mp_int *p, mp_int *q)
     mp_copy(&n, &a);
  }
  /* get q to be the order of the large prime subgroup */
  mp_sub_d(&n, 1, q);
  mp_div_2(q, q);
  mp_div(q, &b, q, NULL);
  mp_exch(&n, p);
  mp_exch(&b, q);
   res = MP_OKAY;
 __Z: mp_clear(&z); 
@ -254,19 +264,25 @@ int main(void)
 {
   mp_int p, q;
   char buf[4096];
-   int k;
+   int k, li;
   clock_t t1;
   srand(time(NULL));
   printf("Enter # of bits: \n");
-   scanf("%d", &k);
+   fgets(buf, sizeof(buf), stdin);
   sscanf(buf, "%d", &k);
   printf("Enter number of bases to try (1 to 8):\n");
   fgets(buf, sizeof(buf), stdin);
   sscanf(buf, "%d", &li);
   mp_init(&p);
   mp_init(&q);
   t1 = clock();   
-   pprime(k, &p, &q);
+   pprime(k, li, &p, &q);
   t1 = clock() - t1;
   printf("\n\nTook %ld ticks, %d bits\n", t1, mp_count_bits(&p));
--- a/2
+++ b/2
@ -1,7 +1,7 @@
 CC = gcc
 CFLAGS  +=  -Wall -W -Wshadow -ansi -O3 -fomit-frame-pointer -funroll-loops 
-VERSION=0.09
+VERSION=0.10
 default: test
--- a/poly.c
+++ b/poly.c
@ -0,0 +1,302 @@
 /* LibTomMath, multiple-precision integer library -- Tom St Denis
 *
 * LibTomMath is library that provides for multiple-precision 
 * integer arithmetic as well as number theoretic functionality.
 *
 * This file "poly.c" provides GF(p^k) functionality on top of the 
 * libtommath library.
 * 
 * The library is designed directly after the MPI library by
 * Michael Fromberger but has been written from scratch with 
 * additional optimizations in place.  
 *
 * The library is free for all purposes without any express
 * guarantee it works.
 *
 * Tom St Denis, tomstdenis@iahu.ca, http://libtommath.iahu.ca
 */
 #include "poly.h"
 #undef MIN
 #define MIN(x,y) ((x)<(y)?(x):(y))
 #undef MAX
 #define MAX(x,y) ((x)>(y)?(x):(y))
 static void s_free(mp_poly *a)
 {
   int k;
   for (k = 0; k < a->alloc; k++) {
       mp_clear(&(a->co[k]));
   }
 }
 static int s_setup(mp_poly *a, int low, int high)
 {
   int res, k, j;
   for (k = low; k < high; k++) {
       if ((res = mp_init(&(a->co[k]))) != MP_OKAY) {
          for (j = low; j < k; j++) {
             mp_clear(&(a->co[j]));
          }
          return MP_MEM;
       }
   }
   return MP_OKAY;
 }   
 int mp_poly_init(mp_poly *a, mp_int *cha)
 {
   return mp_poly_init_size(a, cha, MP_POLY_PREC);
 }
 /* init a poly of a given (size) degree */
 int mp_poly_init_size(mp_poly *a, mp_int *cha, int size)
 {
   int res;
   /* allocate array of mp_ints for coefficients */
   a->co = malloc(size * sizeof(mp_int));
   if (a->co == NULL) {
      return MP_MEM;
   }
   a->used  = 0;
   a->alloc = size;
   /* now init the range */
   if ((res = s_setup(a, 0, size)) != MP_OKAY) {
      free(a->co);
      return res;
   }
   /* copy characteristic */
   if ((res = mp_init_copy(&(a->cha), cha)) != MP_OKAY) {
      s_free(a);
      free(a->co);
      return res;
   }
   /* return ok at this point */
   return MP_OKAY;
 }
 /* grow the size of a poly */
 static int mp_poly_grow(mp_poly *a, int size)
 {
  int res;
  if (size > a->alloc) {
     /* resize the array of coefficients */
     a->co = realloc(a->co, sizeof(mp_int) * size);
     if (a->co == NULL) {
        return MP_MEM;
     }
     /* now setup the coefficients */
     if ((res = s_setup(a, a->alloc, a->alloc + size)) != MP_OKAY) {
        return res;
     }
     a->alloc += size;
  }
  return MP_OKAY;
 }
 /* copy, b = a */
 int mp_poly_copy(mp_poly *a, mp_poly *b)
 {
   int res, k;
   /* resize b */
   if ((res = mp_poly_grow(b, a->used)) != MP_OKAY) {
      return res;
   }
   /* now copy the used part */
   b->used = a->used;
   /* now the cha */
   if ((res = mp_copy(&(a->cha), &(b->cha))) != MP_OKAY) {
      return res;
   }
   /* now all the coefficients */
   for (k = 0; k < b->used; k++) {
       if ((res = mp_copy(&(a->co[k]), &(b->co[k]))) != MP_OKAY) {
          return res;
       }
   }
   /* now zero the top */
   for (k = b->used; k < b->alloc; k++) {
       mp_zero(&(b->co[k]));
   }
   return MP_OKAY;
 }
 /* init from a copy, a = b */
 int mp_poly_init_copy(mp_poly *a, mp_poly *b)
 {
   int res;
   if ((res = mp_poly_init(a, &(b->cha))) != MP_OKAY) {
      return res;
   }
   return mp_poly_copy(b, a);
 }
 /* free a poly from ram */
 void mp_poly_clear(mp_poly *a)
 {
   s_free(a);
   mp_clear(&(a->cha));
   free(a->co);
   a->co = NULL;
   a->used = a->alloc = 0;
 }
 /* exchange two polys */
 void mp_poly_exch(mp_poly *a, mp_poly *b)
 {
   mp_poly t;
   t = *a; *a = *b; *b = t;
 }
 /* clamp the # of used digits */
 static void mp_poly_clamp(mp_poly *a)
 {
   while (a->used > 0 && mp_cmp_d(&(a->co[a->used]), 0) == MP_EQ) --(a->used);
 }  
 /* add two polynomials, c(x) = a(x) + b(x) */
 int mp_poly_add(mp_poly *a, mp_poly *b, mp_poly *c)
 {
   mp_poly t, *x, *y;
   int res, k;
   /* ensure char's are the same */
   if (mp_cmp(&(a->cha), &(b->cha)) != MP_EQ) {
      return MP_VAL;
   }
   /* now figure out the sizes such that x is the 
      largest degree poly and y is less or equal in degree 
    */
   if (a->used > b->used) {
      x = a;
      y = b;
   } else {
      x = b;
      y = a;
   }
   /* now init the result to be a copy of the largest */
   if ((res = mp_poly_init_copy(&t, x)) != MP_OKAY) {
      return res;
   }
   /* now add the coeffcients of the smaller one */
   for (k = 0; k < y->used; k++) {
       if ((res = mp_addmod(&(a->co[k]), &(b->co[k]), &(a->cha), &(t.co[k]))) != MP_OKAY) {
          goto __T;
       }
   }
   mp_poly_clamp(&t);
   mp_poly_exch(&t, c);
   res = MP_OKAY;
 __T:  mp_poly_clear(&t);
   return res;
 }
 /* subtracts two polynomials, c(x) = a(x) - b(x) */
 int mp_poly_sub(mp_poly *a, mp_poly *b, mp_poly *c)
 {
   mp_poly t, *x, *y;
   int res, k;
   /* ensure char's are the same */
   if (mp_cmp(&(a->cha), &(b->cha)) != MP_EQ) {
      return MP_VAL;
   }
   /* now figure out the sizes such that x is the 
      largest degree poly and y is less or equal in degree 
    */
   if (a->used > b->used) {
      x = a;
      y = b;
   } else {
      x = b;
      y = a;
   }
   /* now init the result to be a copy of the largest */
   if ((res = mp_poly_init_copy(&t, x)) != MP_OKAY) {
      return res;
   }
   /* now add the coeffcients of the smaller one */
   for (k = 0; k < y->used; k++) {
       if ((res = mp_submod(&(a->co[k]), &(b->co[k]), &(a->cha), &(t.co[k]))) != MP_OKAY) {
          goto __T;
       }
   }
   mp_poly_clamp(&t);
   mp_poly_exch(&t, c);
   res = MP_OKAY;
 __T:  mp_poly_clear(&t);
   return res;
 }
 /* multiplies two polynomials, c(x) = a(x) * b(x) */
 int mp_poly_mul(mp_poly *a, mp_poly *b, mp_poly *c)
 {
   mp_poly t;
   mp_int  tt;
   int res, pa, pb, ix, iy;
   /* ensure char's are the same */
   if (mp_cmp(&(a->cha), &(b->cha)) != MP_EQ) {
      return MP_VAL;
   }
   /* degrees of a and b */
   pa = a->used;
   pb = b->used;
   /* now init the temp polynomial to be of degree pa+pb */
   if ((res = mp_poly_init_size(&t, &(a->cha), pa+pb)) != MP_OKAY) {
      return res;
   }
   /* now init our temp int */
   if ((res = mp_init(&tt)) != MP_OKAY) {
      goto __T;
   }
   /* now loop through all the digits */
   for (ix = 0; ix < pa; ix++) {
       for (iy = 0; iy < pb; iy++) {
          if ((res = mp_mul(&(a->co[ix]), &(b->co[iy]), &tt)) != MP_OKAY) {
             goto __TT;
          }
          if ((res = mp_addmod(&tt, &(t.co[ix+iy]), &(a->cha), &(t.co[ix+iy]))) != MP_OKAY) {
             goto __TT;
          }
       }
   }
   mp_poly_clamp(&t);
   mp_poly_exch(&t, c);
   res = MP_OKAY;
 __TT: mp_clear(&tt);
 __T:  mp_poly_clear(&t);
   return res;
 }
--- a/poly.h
+++ b/poly.h
@ -0,0 +1,73 @@
 /* LibTomMath, multiple-precision integer library -- Tom St Denis
 *
 * LibTomMath is library that provides for multiple-precision 
 * integer arithmetic as well as number theoretic functionality.
 *
 * This file "poly.h" provides GF(p^k) functionality on top of the 
 * libtommath library.
 * 
 * The library is designed directly after the MPI library by
 * Michael Fromberger but has been written from scratch with 
 * additional optimizations in place.  
 *
 * The library is free for all purposes without any express
 * guarantee it works.
 *
 * Tom St Denis, tomstdenis@iahu.ca, http://libtommath.iahu.ca
 */
 #ifndef POLY_H_
 #define POLY_H_
 #include "bn.h"
 /* a mp_poly is basically a derived "class" of a mp_int
 * it uses the same technique of growing arrays via 
 * used/alloc parameters except the base unit or "digit"
 * is in fact a mp_int.  These hold the coefficients
 * of the polynomial 
 */
 typedef struct {
    int    used,    /* coefficients used */
           alloc;   /* coefficients allocated (and initialized) */
    mp_int *co,     /* coefficients */
           cha;     /* characteristic */
 } mp_poly;
 #define MP_POLY_PREC     16             /* default coefficients allocated */
 /* init a poly */
 int mp_poly_init(mp_poly *a, mp_int *cha);
 /* init a poly of a given (size) degree  */
 int mp_poly_init_size(mp_poly *a, mp_int *cha, int size);
 /* copy, b = a */
 int mp_poly_copy(mp_poly *a, mp_poly *b);
 /* init from a copy, a = b */
 int mp_poly_init_copy(mp_poly *a, mp_poly *b);
 /* free a poly from ram */
 void mp_poly_clear(mp_poly *a);
 /* exchange two polys */
 void mp_poly_exch(mp_poly *a, mp_poly *b);
 /* ---> Basic Arithmetic <--- */
 /* add two polynomials, c(x) = a(x) + b(x) */
 int mp_poly_add(mp_poly *a, mp_poly *b, mp_poly *c);
 /* subtracts two polynomials, c(x) = a(x) - b(x) */
 int mp_poly_sub(mp_poly *a, mp_poly *b, mp_poly *c);
 /* multiplies two polynomials, c(x) = a(x) * b(x) */
 int mp_poly_mul(mp_poly *a, mp_poly *b, mp_poly *c);
 #endif