added libtommath-0.11

2003-02-28 16:07:58 +00:00 · 2003-02-28 16:07:58 +00:00 · 33c5019985
commit 33c5019985
parent fb93a30a25
9 changed files with 885 additions and 828 deletions
--- a/b.bat
+++ b/b.bat
@ -1,3 +1,2 @@
-nasm -f coff timer.asm
+nasm -f elf timer.asm
 gcc -Wall -W -O3 -fomit-frame-pointer -funroll-loops -DTIMER_X86 demo.c bn.c timer.o -o ltmdemo
-rem gcc -I./mtest/ -DU_MPI -Wall -W -O3 -fomit-frame-pointer -funroll-loops -DTIMER_X86 demo.c mtest/mpi.c timer.o -o mpidemo
--- a/bn.c
+++ b/bn.c
@ -99,6 +99,7 @@ void dump_timings(void)
   memset(&functime, 0, sizeof(functime));
   total = 0;
   for (x = 0; x < _itims; x++) {
+       if (strcmp(timings[x].func, "_verify")) 
          total += timings[x].tot;
       
       /* try to find this entry */
@ -1053,7 +1054,7 @@ static int fast_s_mp_mul_digs(mp_int *a, mp_int *b, mp_int *c, int digs)
   c->dp[digs-1]   = (mp_digit)(W[digs-1] & ((mp_word)MP_MASK));
   
   /* clear unused */
-   for (ix = c->used; ix < olduse; ix++) {
+   for (; ix < olduse; ix++) {
      c->dp[ix] = 0;
   }
  
@ -1194,13 +1195,13 @@ static int fast_s_mp_mul_high_digs(mp_int *a, mp_int *b, mp_int *c, int digs)
   c->used = newused;
   
   /* now convert the array W downto what we need */
-   for (ix = digs+1; ix < (pa+pb+1); ix++) {
+   for (ix = digs+1; ix < newused; ix++) {
       W[ix]       += (W[ix-1] >> ((mp_word)DIGIT_BIT));
       c->dp[ix-1] = (mp_digit)(W[ix-1] & ((mp_word)MP_MASK));
   }
   c->dp[(pa+pb+1)-1] = (mp_digit)(W[(pa+pb+1)-1] & ((mp_word)MP_MASK));
   
-   for (ix = c->used; ix < oldused; ix++) {
+   for (; ix < oldused; ix++) {
      c->dp[ix] = 0;
   }
   mp_clamp(c);
@ -1339,17 +1340,17 @@ static int fast_s_mp_sqr(mp_int *a, mp_int *b)
   b->used = newused;
   
   /* now compute digits */
-   for (ix = 1; ix < (pa+pa+1); ix++) {
+   for (ix = 1; ix < newused; ix++) {
       /* double/add next digit */
       W[ix]       += W[ix] + W2[ix];

       W[ix]       = W[ix] + (W[ix-1] >> ((mp_word)DIGIT_BIT));
       b->dp[ix-1] = (mp_digit)(W[ix-1] & ((mp_word)MP_MASK));
   }
-   b->dp[(pa+pa+1)-1] = (mp_digit)(W[(pa+pa+1)-1] & ((mp_word)MP_MASK));
+   b->dp[(newused)-1] = (mp_digit)(W[(newused)-1] & ((mp_word)MP_MASK));
   
   /* clear high */
-   for (ix = b->used; ix < olduse; ix++) {
+   for (; ix < olduse; ix++) {
       b->dp[ix] = 0;
   }
   
@ -1580,9 +1581,7 @@ static int mp_karatsuba_mul(mp_int *a, mp_int *b, mp_int *c)
   }
   
   mp_clamp(&x0);
-   mp_clamp(&x1);
   mp_clamp(&y0);
-   mp_clamp(&y1);
   
   /* now calc the products x0y0 and x1y1 */
   if (mp_mul(&x0, &y0, &x0y0) != MP_OKAY) goto X1Y1;             /* x0y0 = x0*y0 */
@ -1679,7 +1678,6 @@ static int mp_karatsuba_sqr(mp_int *a, mp_int *b)
   x1.used = a->used - B;
   
   mp_clamp(&x0);
-   mp_clamp(&x1);
   
   /* now calc the products x0*x0 and x1*x1 */
   if (mp_sqr(&x0, &x0x0) != MP_OKAY) goto X1X1;                  /* x0x0 = x0*x0 */
@ -2760,8 +2758,7 @@ int mp_reduce_setup(mp_int *a, mp_int *b)
   VERIFY(a);
   VERIFY(b);
   
-   mp_set(a, 1);
-   if ((res = mp_lshd(a, b->used * 2)) != MP_OKAY) {
+   if ((res = mp_2expt(a, b->used * 2 * DIGIT_BIT)) != MP_OKAY) {
      DECFUNC();
      return res;
   }
@ -2876,7 +2873,6 @@ __T:  mp_clear(&t);
   return res;
 }   

-
 /* computes xR^-1 == x (mod N) via Montgomery Reduction (comba) */
 static int fast_mp_montgomery_reduce(mp_int *a, mp_int *m, mp_digit mp)
 {
@ -2884,29 +2880,53 @@ static int fast_mp_montgomery_reduce(mp_int *a, mp_int *m, mp_digit mp)
   mp_digit ui;
   mp_word  W[512];
   
+   REGFUNC("fast_mp_montgomery_reduce");
+   VERIFY(a);
+   VERIFY(m);
+   
   /* get old used count */
   olduse = a->used;
   
   /* grow a as required */
-   if (a->alloc < m->used*2+1) {
-      if ((res = mp_grow(a, m->used*2+1)) != MP_OKAY) {
+   if (a->alloc < m->used+1) {
+      if ((res = mp_grow(a, m->used+1)) != MP_OKAY) {
+         DECFUNC();
         return res;
      }
   }
   
-   /* copy and clear */
+   /* copy the digits of a */
   for (ix = 0; ix < a->used; ix++) {
       W[ix] = a->dp[ix];
   }
+   
+   /* zero the high words */
   for (; ix < m->used * 2 + 1; ix++) {
       W[ix] = 0;
   }
     
   for (ix = 0; ix < m->used; ix++) {
-       /* ui = ai * m' mod b */
+       /* ui = ai * m' mod b 
+        *
+        * We avoid a double precision multiplication (which isn't required)
+        * by casting the value down to a mp_digit.  Note this requires that W[ix-1] have
+        * the carry cleared (see after the inner loop)
+        */
       ui = (((mp_digit)(W[ix] & MP_MASK)) * mp) & MP_MASK;
       
-       /* a = a + ui * m * b^i */
+       /* a = a + ui * m * b^i 
+        *
+        * This is computed in place and on the fly.  The multiplication 
+        * by b^i is handled by offseting which columns the results 
+        * are added to.
+        *
+        * Note the comba method normally doesn't handle carries in the inner loop
+        * In this case we fix the carry from the previous column since the Montgomery
+        * reduction requires digits of the result (so far) [see above] to work.  This is 
+        * handled by fixing up one carry after the inner loop.  The carry fixups are done
+        * in order so after these loops the first m->used words of W[] have the carries
+        * fixed
+        */       
       { 
          register int      iy;
          register mp_digit *tmpx;
@ -2916,32 +2936,36 @@ static int fast_mp_montgomery_reduce(mp_int *a, mp_int *m, mp_digit mp)
          tmpx = m->dp;
          _W   = W + ix;
          
+          /* inner loop */
          for (iy = 0; iy < m->used; iy++) {
              *_W++        += ((mp_word)ui) * ((mp_word)*tmpx++);
          }
-          
-          /* now fix carry for W[ix+1] */
-          W[ix+1] += W[ix] >> ((mp_word)DIGIT_BIT);
-          W[ix]   &= ((mp_word)MP_MASK);
       }
+
+       /* now fix carry for next digit, W[ix+1] */
+       W[ix+1] += W[ix] >> ((mp_word)DIGIT_BIT);
   }
   
   /* nox fix rest of carries */
-   for (; ix <= m->used * 2 + 1; ix++) {
+   for (++ix; ix <= m->used * 2 + 1; ix++) {
       W[ix]   += (W[ix-1] >> ((mp_word)DIGIT_BIT));
-       W[ix-1] &= ((mp_word)MP_MASK);
   }
   
-   /* copy out */
-
-   /* A = A/b^n */
+   /* copy out, A = A/b^n 
+    *
+    * The result is A/b^n but instead of converting from an array of mp_word
+    * to mp_digit than calling mp_rshd we just copy them in the right
+    * order 
+    */
   for (ix = 0; ix < m->used + 1; ix++) { 
-       a->dp[ix] = W[ix+m->used];
+       a->dp[ix] = W[ix+m->used] & ((mp_word)MP_MASK);
   }
   
+   /* set the max used */
   a->used = m->used + 1;

-   /* zero oldused digits */  
+   /* zero oldused digits, if the input a was larger than 
+    * m->used+1 we'll have to clear the digits */  
   for (; ix < olduse; ix++) {
       a->dp[ix] = 0;
   }
@ -2951,10 +2975,12 @@ static int fast_mp_montgomery_reduce(mp_int *a, mp_int *m, mp_digit mp)
   /* if A >= m then A = A - m */
   if (mp_cmp_mag(a, m) != MP_LT) {
      if ((res = s_mp_sub(a, m, a)) != MP_OKAY) {
+         DECFUNC();
         return res;
      }
   }   
   
+   DECFUNC();
   return MP_OKAY;
 }

@ -3036,7 +3062,7 @@ int mp_montgomery_reduce(mp_int *a, mp_int *m, mp_digit mp)
 */
 static int mp_exptmod_fast(mp_int *G, mp_int *X, mp_int *P, mp_int *Y)
 {
-   mp_int M[64], res;
+   mp_int M[256], res;
   mp_digit buf, mp;
   int err, bitbuf, bitcpy, bitcnt, mode, digidx, x, y, winsize;
   
@ -3048,11 +3074,13 @@ static int mp_exptmod_fast(mp_int *G, mp_int *X, mp_int *P, mp_int *Y)
   
   /* find window size */
   x = mp_count_bits(X);
-        if (x <= 18)    { winsize = 2; }
-   else if (x <= 84)    { winsize = 3; }
-   else if (x <= 300)   { winsize = 4; }
-   else if (x <= 930)   { winsize = 5; }
-   else                 { winsize = 6; }
+        if (x <= 7)     { winsize = 2; }
+   else if (x <= 36)    { winsize = 3; }
+   else if (x <= 140)   { winsize = 4; }
+   else if (x <= 450)   { winsize = 5; }
+   else if (x <= 1303)  { winsize = 6; }
+   else if (x <= 3529)  { winsize = 7; }
+   else                 { winsize = 8; }

   /* init G array */
   for (x = 0; x < (1<<winsize); x++) {
@ -3072,12 +3100,11 @@ static int mp_exptmod_fast(mp_int *G, mp_int *X, mp_int *P, mp_int *Y)
   
   /* setup result */
   if ((err = mp_init(&res)) != MP_OKAY) {
-      goto __M;
+      goto __RES;
   }

   /* now we need R mod m */
-   mp_set(&res, 1);           
-   if ((err = mp_lshd(&res, P->used)) != MP_OKAY) {
+   if ((err = mp_2expt(&res, P->used * DIGIT_BIT)) != MP_OKAY) {
      goto __RES;
   }
      
@ -3092,7 +3119,6 @@ static int mp_exptmod_fast(mp_int *G, mp_int *X, mp_int *P, mp_int *Y)
    *
    * The first half of the table is not computed though accept for M[0] and M[1]
    */
-   mp_set(&M[0], 1);
   if ((err = mp_mod(G, P, &M[1])) != MP_OKAY) {
      goto __RES;
   }
@ -3236,10 +3262,9 @@ __M  :
   return err;
 }

-
 int mp_exptmod(mp_int *G, mp_int *X, mp_int *P, mp_int *Y)
 {
-   mp_int M[64], res, mu;
+   mp_int M[256], res, mu;
   mp_digit buf;
   int err, bitbuf, bitcpy, bitcnt, mode, digidx, x, y, winsize;
   
@ -3258,11 +3283,13 @@ int mp_exptmod(mp_int *G, mp_int *X, mp_int *P, mp_int *Y)

   /* find window size */
   x = mp_count_bits(X);
-        if (x <= 18)    { winsize = 2; }
-   else if (x <= 84)    { winsize = 3; }
-   else if (x <= 300)   { winsize = 4; }
-   else if (x <= 930)   { winsize = 5; }
-   else                 { winsize = 6; }
+        if (x <= 7)     { winsize = 2; }
+   else if (x <= 36)    { winsize = 3; }
+   else if (x <= 140)   { winsize = 4; }
+   else if (x <= 450)   { winsize = 5; }
+   else if (x <= 1303)  { winsize = 6; }
+   else if (x <= 3529)  { winsize = 7; }
+   else                 { winsize = 8; }
   
   /* init G array */
   for (x = 0; x < (1<<winsize); x++) {
@ -3289,7 +3316,6 @@ int mp_exptmod(mp_int *G, mp_int *X, mp_int *P, mp_int *Y)
    *
    * The first half of the table is not computed though accept for M[0] and M[1]
    */
-   mp_set(&M[0], 1);
   if ((err = mp_mod(G, P, &M[1])) != MP_OKAY) {
      goto __MU;
   }
@ -3430,6 +3456,22 @@ __M  :
   return err;
 }

+/* computes a = 2^b */
+int mp_2expt(mp_int *a, int b)
+{
+   int res;
+   
+   mp_zero(a);
+   if ((res = mp_grow(a, b/DIGIT_BIT + 1)) != MP_OKAY) {
+      return res;
+   }
+   a->used = b/DIGIT_BIT + 1;
+   a->dp[b/DIGIT_BIT] = 1 << (b % DIGIT_BIT);
+   
+   return MP_OKAY;
+}   
+   
+
 /* find the n'th root of an integer 
 *
 * Result found such that (c)^b <= a and (c+1)^b > a 
--- a/bn.h
+++ b/bn.h
@ -158,6 +158,9 @@ int mp_mul_2(mp_int *a, mp_int *b);
 /* c = a mod 2^d */
 int mp_mod_2d(mp_int *a, int b, mp_int *c);

+/* computes a = 2^b */
+int mp_2expt(mp_int *a, int b);
+
 /* ---> Basic arithmetic <--- */

 /* b = -a */
--- a/bn.pdf
+++ b/bn.pdf
--- a/bn.tex
+++ b/bn.tex
@ -1,7 +1,7 @@
 \documentclass{article}
 \begin{document}

-\title{LibTomMath v0.10 \\ A Free Multiple Precision Integer Library}
+\title{LibTomMath v0.11 \\ A Free Multiple Precision Integer Library}
 \author{Tom St Denis \\ tomstdenis@iahu.ca}
 \maketitle
 \newpage
@ -471,7 +471,7 @@ it is not.

 \subsubsection{mp\_exptmod(mp\_int *a, mp\_int *b, mp\_int *c, mp\_int *d)}
 Computes $d = a^b \mbox{ (mod }c\mbox{)}$ using a sliding window $k$-ary exponentiation algorithm.  For an $\alpha$-bit
-exponent it performs $\alpha$ squarings and at most $\lfloor \alpha/k \rfloor + k$ multiplications.  The value of $k$ is
+exponent it performs $\alpha$ squarings and at most $\lfloor \alpha/k \rfloor + 2^{k-1}$ multiplications.  The value of $k$ is
 chosen to minimize the number of multiplications required for a given value of $\alpha$.  Barrett or Montgomery 
 reductions are used to reduce the squared or multiplied temporary results modulo $c$.

@ -480,7 +480,7 @@ reductions are used to reduce the squared or multiplied temporary results modulo
 \subsubsection{mp\_reduce(mp\_int *a, mp\_int *b, mp\_int *c)}
 Computes a Barrett reduction in-place of $a$ modulo $b$ with respect to $c$.  In essence it computes 
 $a \equiv a \mbox{ (mod }b\mbox{)}$ provided $0 \le a \le b^2$.  The value of $c$ is precomputed with the 
-function mp\_reduce\_setup().
+function mp\_reduce\_setup().  The modulus $b$ must be larger than zero.

 The Barrett reduction function has been optimized to use partial multipliers which means compared to MPI it performs
 have the number of single precision multipliers (\textit{provided they have the same size digits}).  The partial
@ -490,16 +490,31 @@ can reduce a number modulo a $n-$digit modulus with approximately $2n^2$ single
 \subsubsection{mp\_montgomery\_reduce(mp\_int *a, mp\_int *m, mp\_digit mp)}
 Computes a Montgomery reduction in-place of $a$ modulo $b$ with respect to $mp$.  If $b$ is some $n-$digit modulus then
 $R = \beta^{n+1}$.  The result of this function is $aR^{-1} \mbox{ (mod }b\mbox{)}$ provided that $0 \le a \le b^2$.
-The value of $mp$ is precomputed with the function mp\_montgomery\_setup().
+The value of $mp$ is precomputed with the function mp\_montgomery\_setup().  The modulus $b$ must be odd and larger
+than zero.  

 The Montgomery reduction comes in two variants.  A standard baseline and a fast comba method.  The baseline routine
 is in fact slower than the Barrett reductions, however, the comba routine is much faster.  Montomgery reduction can 
-reduce a number modulo a $n-$digit modulus with approximately $n^2 + n$ single precision multiplications.  
+reduce a number modulo a $n-$digit modulus with approximately $n^2 + n$ single precision multiplications.  Compared
+to Barrett reductions the montgomery reduction requires half as many multiplications as $n \rightarrow \infty$.  

 Note that the final result of a Montgomery reduction is not just the value reduced modulo $b$.  You have to multiply
 by $R$ modulo $b$ to get the real result.  At first that may not seem like such a worthwhile routine, however, the
 exptmod function can be made to take advantage of this such that only one normalization at the end is required.

+This stems from the fact that if $a \rightarrow aR^{-1}$ through Montgomery reduction and if $a = vR$ and $b = uR$ then
+$a^2 \rightarrow v^2R^2R^{-1} \equiv v^2R$ and $ab \rightarrow uvRRR^{-1} \equiv uvR$.  The next useful observation is 
+that through the reduction $a \rightarrow vRR^{-1} \equiv v$ which means given a final result it can be normalized with
+a single reduction.  Now a series of complicated modular operations can be optimized if all the variables are initially
+multiplied by $R$ then the final result normalized by performing an extra reduction.
+
+If many variables are to be normalized the simplest method to setup the variables is to first compute $\hat x \equiv R^2 \mbox{ mod }m$.
+Now all the variables in the system can be multiplied by $\hat x$ and reduced with Montgomery reduction.  This means that
+two long divisions would be required to setup $\hat x$ and a multiplication followed by reduction for each variable.  
+
+A very useful observation is that multiplying by $R = \beta^n$ amounts to performing a left shift by $n$ positions which
+requires no single precision multiplications.  
+
 \section{Timing Analysis}
 \subsection{Observed Timings}
 A simple test program ``demo.c'' was developed which builds with either MPI or LibTomMath (without modification).  The
--- a/changes.txt
+++ b/changes.txt
@ -1,3 +1,9 @@
+Jan 15th, 2003
+v0.11  -- More subtle fixes
+       -- Moved to gentoo linux [hurrah!] so made *nix specific fixes to the make process
+       -- Sped up the montgomery reduction code quite a bit
+       -- fixed up demo so when building timing for the x86 it assumes ELF format now
+       
 Jan 9th, 2003
 v0.10  -- Pekka Riikonen suggested fixes to the radix conversion code.  
       -- Added baseline montgomery and comba montgomery reductions, sped up exptmods
--- a/demo.c
+++ b/demo.c
@ -19,8 +19,10 @@

 #ifdef TIMER_X86
 #define TIMER
-extern ulong64 rdtsc(void);
-extern void reset(void);
+extern ulong64 _rdtsc(void);
+extern void _reset(void);
+ulong64 rdtsc(void) { return _rdtsc(); }
+void reset(void) { _reset(); }
 #endif

 #ifdef TIMER
@ -85,7 +87,6 @@ int main(void)
   mp_int a, b, c, d, e, f;
   unsigned long expt_n, add_n, sub_n, mul_n, div_n, sqr_n, mul2d_n, div2d_n, gcd_n, lcm_n, inv_n;
   int rr;
-   mp_digit tom;
   
 #ifdef TIMER
   int n;
@ -99,42 +100,33 @@ int main(void)
   mp_init(&e);
   mp_init(&f);
   
-   mp_read_radix(&a, "59994534535345535344389423", 10);
-   mp_read_radix(&b, "49993453555234234565675534", 10);
-   mp_read_radix(&c, "62398923474472948723847281", 10);
-    
-   mp_mulmod(&a, &b, &c, &f);
-   
-   /* setup mont */
-   mp_montgomery_setup(&c, &tom);
-   mp_mul(&a, &b, &a);
-   mp_montgomery_reduce(&a, &c, tom);
-   mp_montgomery_reduce(&a, &c, tom);
-   mp_lshd(&a, c.used*2);
-   mp_mod(&a, &c, &a);
-   
-   mp_toradix(&a, cmd, 10);
-   printf("%s\n\n", cmd);
-   mp_toradix(&f, cmd, 10);
-   printf("%s\n", cmd);
-   
-/*   return 0; */
-   
-   
-   mp_read_radix(&a, "V//////////////////////////////////////////////////////////////////////////////////////", 64);
-   mp_reduce_setup(&b, &a);
-   printf("\n\n----\n\n");
-   mp_toradix(&b, buf, 10);
-   printf("b == %s\n\n\n", buf);
-
-   mp_read_radix(&b, "4982748972349724892742", 10);
-   mp_sub_d(&a, 1, &c);
-   mp_exptmod(&b, &c, &a, &d);
-   mp_toradix(&d, buf, 10);
-   printf("b^p-1 == %s\n", buf);
-   
+#ifdef DEBUG
+   mp_read_radix(&a, "347743159439876626079252796797422223177535447388206607607181663903045907591201940478223621722118173270898487582987137708656414344685816179420855160986340457973820182883508387588163122354089264395604796675278966117567294812714812796820596564876450716066283126720010859041484786529056457896367683122960411136319", 10);
+   mp_read_radix(&b, "347743159439876626079252796797422223177535447388206607607181663903045907591201940478223621722118173270898487582987137708656414344685816179420855160986340457973820182883508387588163122354089264395604796675278966117567294812714812796820596564876450716066283126720010859041484786529056457896367683122960411136318", 10);
+   mp_set(&c, 1);
+   reset_timings();
+   mp_exptmod(&c, &b, &a, &d);
+   mp_exptmod(&c, &b, &a, &d);
+   mp_exptmod(&c, &b, &a, &d);
+   mp_exptmod(&c, &b, &a, &d);
+   mp_exptmod(&c, &b, &a, &d);
+   mp_exptmod(&c, &b, &a, &d);
+   mp_exptmod(&c, &b, &a, &d);
+   mp_exptmod(&c, &b, &a, &d);
+   mp_exptmod(&c, &b, &a, &d);
+   mp_exptmod(&c, &b, &a, &d);
+   mp_exptmod(&c, &b, &a, &d);
+   mp_exptmod(&c, &b, &a, &d);
+   mp_exptmod(&c, &b, &a, &d);
+   mp_exptmod(&c, &b, &a, &d);
+   mp_exptmod(&c, &b, &a, &d);
+   mp_exptmod(&c, &b, &a, &d);
+   dump_timings();
+   return 0;
+#endif   
      
 #ifdef TIMER      
+goto expt;
      mp_read_radix(&a, "340282366920938463463374607431768211455", 10);
      mp_read_radix(&b, "340282366920938463463574607431768211455", 10);
      while (a.used * DIGIT_BIT < 8192) {
@ -182,7 +174,7 @@ int main(void)
      printf("Multiplying %d-bit took %llu cycles\n", mp_count_bits(&a), tt / ((ulong64)100000));
      mp_copy(&b, &a);
   }
-
+expt:
   {
      char *primes[] = {
         "17933601194860113372237070562165128350027320072176844226673287945873370751245439587792371960615073855669274087805055507977323024886880985062002853331424203",
@ -206,7 +198,7 @@ int main(void)
      mp_mod(&b, &c, &b);
      mp_set(&c, 3);
      reset();
-      for (rr = 0; rr < 35; rr++) {
+      for (rr = 0; rr < 100; rr++) {
          mp_exptmod(&c, &b, &a, &d);
      }
      tt = rdtsc();
@ -219,7 +211,7 @@ int main(void)
         draw(&d);
         exit(0);
      }
-      printf("Exponentiating %d-bit took %llu cycles\n", mp_count_bits(&a), tt / ((ulong64)35));
+      printf("Exponentiating %d-bit took %llu cycles\n", mp_count_bits(&a), tt / ((ulong64)100));
   }
   }   

--- a/8
+++ b/8
@ -1,13 +1,13 @@
 CC = gcc
 CFLAGS  +=  -Wall -W -Wshadow -ansi -O3 -fomit-frame-pointer -funroll-loops

-VERSION=0.10
+VERSION=0.11

 default: test

 test: bn.o demo.o
 	$(CC) bn.o demo.o -o demo
-	cd mtest ; gcc $(CFLAGS) mtest.c -o mtest.exe -s
+	cd mtest ; gcc $(CFLAGS) mtest.c -o mtest -s

 # builds the x86 demo
 test86:
@ -22,9 +22,9 @@ docs:	docdvi
 	rm -f bn.log bn.aux bn.dvi
 	
 clean:
-	rm -f *.pdf *.o *.exe mtest/*.exe etc/*.exe bn.log bn.aux bn.dvi *.s 
+	rm -f *.pdf *.o *.exe demo mtest/mtest mtest/*.exe etc/*.exe bn.log bn.aux bn.dvi *.log *.s etc/pprime etc/mersenne

 zipup: clean docs
-	chdir .. ; rm -rf ltm* libtommath-$(VERSION) ; mkdir libtommath-$(VERSION) ; \
+	cd .. ; rm -rf ltm* libtommath-$(VERSION) ; mkdir libtommath-$(VERSION) ; \
 	cp -R ./libtommath/* ./libtommath-$(VERSION)/ ; tar -c libtommath-$(VERSION)/* > ltm-$(VERSION).tar ; \
 	bzip2 -9vv ltm-$(VERSION).tar ; zip -9 -r ltm-$(VERSION).zip libtommath-$(VERSION)/*