added libtommath-0.31

2004-08-09 22:15:59 +00:00 · 2004-08-09 22:15:59 +00:00 · 8eaa98807b
commit 8eaa98807b
parent 350578d400
75 changed files with 5111 additions and 5218 deletions
--- a/bn.pdf
+++ b/bn.pdf
--- a/bn.tex
+++ b/bn.tex
@ -49,7 +49,7 @@
 \begin{document}
 \frontmatter
 \pagestyle{empty}
-\title{LibTomMath User Manual \\ v0.30}
+\title{LibTomMath User Manual \\ v0.31}
 \author{Tom St Denis \\ tomstdenis@iahu.ca}
 \maketitle
 This text, the library and the accompanying textbook are all hereby placed in the public domain.  This book has been 
--- a/bn_fast_s_mp_mul_digs.c
+++ b/bn_fast_s_mp_mul_digs.c
@ -88,7 +88,7 @@ fast_s_mp_mul_digs (mp_int * a, mp_int * b, mp_int * c, int digs)
  }

  /* setup dest */
-  olduse = c->used;
+  olduse  = c->used;
  c->used = digs;

  {
--- a/bn_mp_2expt.c
+++ b/bn_mp_2expt.c
@ -36,7 +36,7 @@ mp_2expt (mp_int * a, int b)
  a->used = b / DIGIT_BIT + 1;

  /* put the single bit in its place */
-  a->dp[b / DIGIT_BIT] = 1 << (b % DIGIT_BIT);
+  a->dp[b / DIGIT_BIT] = ((mp_digit)1) << (b % DIGIT_BIT);

  return MP_OKAY;
 }
--- a/bn_mp_clear.c
+++ b/bn_mp_clear.c
@ -18,10 +18,14 @@
 void
 mp_clear (mp_int * a)
 {
+  int i;
+
  /* only do anything if a hasn't been freed previously */
  if (a->dp != NULL) {
    /* first zero the digits */
-    memset (a->dp, 0, sizeof (mp_digit) * a->used);
+    for (i = 0; i < a->used; i++) {
+        a->dp[i] = 0;
+    }

    /* free ram */
    XFREE(a->dp);
--- a/bn_mp_div.c
+++ b/bn_mp_div.c
@ -187,7 +187,7 @@ int mp_div (mp_int * a, mp_int * b, mp_int * c, mp_int * d)
   */
  
  /* get sign before writing to c */
-  x.sign = a->sign;
+  x.sign = x.used == 0 ? MP_ZPOS : a->sign;

  if (c != NULL) {
    mp_clamp (&q);
--- a/bn_mp_init.c
+++ b/bn_mp_init.c
@ -14,15 +14,22 @@
 */
 #include <tommath.h>

-/* init a new bigint */
+/* init a new mp_int */
 int mp_init (mp_int * a)
 {
+  int i;
+
  /* allocate memory required and clear it */
-  a->dp = OPT_CAST(mp_digit) XCALLOC (sizeof (mp_digit), MP_PREC);
+  a->dp = OPT_CAST(mp_digit) XMALLOC (sizeof (mp_digit) * MP_PREC);
  if (a->dp == NULL) {
    return MP_MEM;
  }

+  /* set the digits to zero */
+  for (i = 0; i < MP_PREC; i++) {
+      a->dp[i] = 0;
+  }
+
  /* set the used to zero, allocated digits to the default precision
   * and sign to positive */
  a->used  = 0;
--- a/bn_mp_karatsuba_mul.c
+++ b/bn_mp_karatsuba_mul.c
@ -76,9 +76,6 @@ int mp_karatsuba_mul (mp_int * a, mp_int * b, mp_int * c)
    goto X0Y0;

  /* now shift the digits */
-  x0.sign = x1.sign = a->sign;
-  y0.sign = y1.sign = b->sign;
-
  x0.used = y0.used = B;
  x1.used = a->used - B;
  y1.used = b->used - B;
--- a/bn_mp_mul.c
+++ b/bn_mp_mul.c
@ -43,6 +43,6 @@ int mp_mul (mp_int * a, mp_int * b, mp_int * c)
      res = s_mp_mul (a, b, c);
    }
  }
-  c->sign = neg;
+  c->sign = (c->used > 0) ? neg : MP_ZPOS;
  return res;
 }
--- a/bn_mp_reduce_is_2k.c
+++ b/bn_mp_reduce_is_2k.c
@ -17,7 +17,8 @@
 /* determines if mp_reduce_2k can be used */
 int mp_reduce_is_2k(mp_int *a)
 {
-   int ix, iy, iz, iw;
+   int ix, iy, iw;
+   mp_digit iz;
   
   if (a->used == 0) {
      return 0;
@ -34,7 +35,7 @@ int mp_reduce_is_2k(mp_int *a)
             return 0;
          }
          iz <<= 1;
-          if (iz > (int)MP_MASK) {
+          if (iz > (mp_digit)MP_MASK) {
             ++iw;
             iz = 1;
          }
--- a/bncore.c
+++ b/bncore.c
@ -18,14 +18,16 @@

 CPU                    /Compiler     /MUL CUTOFF/SQR CUTOFF
 -------------------------------------------------------------
- Intel P4               /GCC v3.2     /        70/       108
- AMD Athlon XP          /GCC v3.2     /       109/       127
-
+ Intel P4 Northwood     /GCC v3.3.3   /        59/        81/profiled build
+ Intel P4 Northwood     /GCC v3.3.3   /        59/        80/profiled_single build
+ Intel P4 Northwood     /ICC v8.0     /        57/        70/profiled build
+ Intel P4 Northwood     /ICC v8.0     /        54/        76/profiled_single build
+ AMD Athlon XP          /GCC v3.2     /       109/       127/
+ 
 */

-/* configured for a AMD XP Thoroughbred core with etc/tune.c */
-int     KARATSUBA_MUL_CUTOFF = 109,      /* Min. number of digits before Karatsuba multiplication is used. */
-        KARATSUBA_SQR_CUTOFF = 127,      /* Min. number of digits before Karatsuba squaring is used. */
+int     KARATSUBA_MUL_CUTOFF = 57,      /* Min. number of digits before Karatsuba multiplication is used. */
+        KARATSUBA_SQR_CUTOFF = 70,      /* Min. number of digits before Karatsuba squaring is used. */
        
        TOOM_MUL_CUTOFF      = 350,      /* no optimal values of these are known yet so set em high */
        TOOM_SQR_CUTOFF      = 400; 
--- a/booker.pl
+++ b/booker.pl
@ -84,6 +84,7 @@ while (<IN>) {
            $text[$line++] = $_;
            last if ($_ =~ /tommath\.h/);
         }
+         <SRC>;   
      }
      
      $inline = 0;
--- a/changes.txt
+++ b/changes.txt
@ -1,3 +1,12 @@
+August 9th, 2004
+v0.31  -- "profiled" builds now :-) new timings for Intel Northwoods
+       -- Added "pretty" build target
+       -- Updated mp_init() to explicitly assign zeros instead of relying on calloc()
+       -- "Wolfgang Ehrhardt" <Wolfgang.Ehrhardt@munich.netsurf.de> found a bug in mp_mul() where if
+          you multiply a negative by zero you get negative zero as the result.  Oops.
+       -- J Harper from PeerSec let me toy with his AMD64 and I got 60-bit digits working properly
+          [this also means that I fixed a bug that showed up when sizeof(int) < sizeof(mp_digit)]
+
 April 11th, 2004
 v0.30  -- Added "mp_toradix_n" which stores upto "n-1" least significant digits of an mp_int
       -- Johan Lindh sent a patch so MSVC wouldn't whine about redefining malloc [in weird dll modes]
--- a/demo/demo.c
+++ b/demo/demo.c
@ -1,7 +1,5 @@
 #include <time.h>

-#define TESTING
-
 #ifdef IOWNANATHLON
 #include <unistd.h>
 #define SLEEP sleep(4)
@ -11,49 +9,6 @@

 #include "tommath.h"

-#ifdef TIMER
-ulong64 _tt;
-
-#if defined(__i386__) || defined(_M_IX86) || defined(_M_AMD64)
-/* RDTSC from Scott Duplichan */
-static ulong64 TIMFUNC (void)
-   {
-   #if defined __GNUC__
-      #ifdef __i386__
-         ulong64 a;
-         __asm__ __volatile__ ("rdtsc ":"=A" (a));
-         return a;
-      #else /* gcc-IA64 version */
-         unsigned long result;
-         __asm__ __volatile__("mov %0=ar.itc" : "=r"(result) :: "memory");
-         while (__builtin_expect ((int) result == -1, 0))
-         __asm__ __volatile__("mov %0=ar.itc" : "=r"(result) :: "memory");
-         return result;
-      #endif
-
-   // Microsoft and Intel Windows compilers
-   #elif defined _M_IX86
-     __asm rdtsc
-   #elif defined _M_AMD64
-     return __rdtsc ();
-   #elif defined _M_IA64
-     #if defined __INTEL_COMPILER
-       #include <ia64intrin.h>
-     #endif
-      return __getReg (3116);
-   #else
-     #error need rdtsc function for this build
-   #endif
-   }
-#else
-#define TIMFUNC clock
-#endif
-
-ulong64 rdtsc(void) { return TIMFUNC() - _tt; }
-void reset(void) { _tt = TIMFUNC(); }
-
-#endif
-
 void ndraw(mp_int *a, char *name)
 {
   char buf[4096];
@ -89,10 +44,6 @@ int myrng(unsigned char *dst, int len, void *dat)
 }


-#define DO2(x) x; x;
-#define DO4(x) DO2(x); DO2(x);
-#define DO8(x) DO4(x); DO4(x);
-#define DO(x)  DO8(x); DO8(x);

   char cmd[4096], buf[4096];
 int main(void)
@ -103,10 +54,6 @@ int main(void)
   unsigned rr;
   int i, n, err, cnt, ix, old_kara_m, old_kara_s;

-#ifdef TIMER
-   ulong64 tt, CLK_PER_SEC;
-   FILE *log, *logb, *logc;
-#endif

   mp_init(&a);
   mp_init(&b);
@ -117,11 +64,10 @@ int main(void)

   srand(time(NULL));

-#ifdef TESTING
  // test mp_get_int
  printf("Testing: mp_get_int\n");
  for(i=0;i<1000;++i) {
-    t = (unsigned long)rand()*rand()+1;
+    t = ((unsigned long)rand()*rand()+1)&0xFFFFFFFF;
    mp_set_int(&a,t);
    if (t!=mp_get_int(&a)) { 
      printf("mp_get_int() bad result!\n");
@ -141,7 +87,7 @@ int main(void)

  // test mp_sqrt
  printf("Testing: mp_sqrt\n");
-  for (i=0;i<10000;++i) { 
+  for (i=0;i<1000;++i) { 
    printf("%6d\r", i); fflush(stdout);
    n = (rand()&15)+1;
    mp_rand(&a,n);
@ -157,7 +103,7 @@ int main(void)
  }

  printf("\nTesting: mp_is_square\n");
-  for (i=0;i<100000;++i) {
+  for (i=0;i<1000;++i) {
    printf("%6d\r", i); fflush(stdout);

    /* test mp_is_square false negatives */
@ -186,11 +132,9 @@ int main(void)

  }
  printf("\n\n");
-#endif

-#ifdef TESTING 
   /* test for size */
-   for (ix = 16; ix < 512; ix++) {
+   for (ix = 10; ix < 256; ix++) {
       printf("Testing (not safe-prime): %9d bits    \r", ix); fflush(stdout);
       err = mp_prime_random_ex(&a, 8, ix, (rand()&1)?LTM_PRIME_2MSB_OFF:LTM_PRIME_2MSB_ON, myrng, NULL);
       if (err != MP_OKAY) {
@ -203,7 +147,7 @@ int main(void)
       }
   }

-   for (ix = 16; ix < 512; ix++) {
+   for (ix = 16; ix < 256; ix++) {
       printf("Testing (   safe-prime): %9d bits    \r", ix); fflush(stdout);
       err = mp_prime_random_ex(&a, 8, ix, ((rand()&1)?LTM_PRIME_2MSB_OFF:LTM_PRIME_2MSB_ON)|LTM_PRIME_SAFE, myrng, NULL);
       if (err != MP_OKAY) {
@ -225,9 +169,7 @@ int main(void)
   }

   printf("\n\n");
-#endif

-#ifdef TESTING
   mp_read_radix(&a, "123456", 10);
   mp_toradix_n(&a, buf, 10, 3);
   printf("a == %s\n", buf);
@ -235,7 +177,6 @@ int main(void)
   printf("a == %s\n", buf);
   mp_toradix_n(&a, buf, 10, 30);
   printf("a == %s\n", buf);
-#endif


 #if 0
@ -248,22 +189,6 @@ int main(void)
   }
 #endif

-#if 0
-{
-   mp_word aa, bb;
-
-   for (;;) {
-       aa = abs(rand()) & MP_MASK;
-       bb = abs(rand()) & MP_MASK;
-      if (MULT(aa,bb) != (aa*bb)) {
-             printf("%llu * %llu == %llu or %llu?\n", aa, bb, (ulong64)MULT(aa,bb), (ulong64)(aa*bb));
-             return 0;
-          }
-   }
-}
-#endif
-
-#ifdef TESTING
   /* test mp_cnt_lsb */
   printf("testing mp_cnt_lsb...\n");
   mp_set(&a, 1);
@ -274,12 +199,10 @@ int main(void)
       }
       mp_mul_2(&a, &a);
   }
-#endif

 /* test mp_reduce_2k */
-#ifdef TESTING
   printf("Testing mp_reduce_2k...\n");
-   for (cnt = 3; cnt <= 384; ++cnt) {
+   for (cnt = 3; cnt <= 128; ++cnt) {
       mp_digit tmp;
       mp_2expt(&a, cnt);
       mp_sub_d(&a, 2, &a);  /* a = 2**cnt - 2 */
@ -289,7 +212,7 @@ int main(void)
       printf("(%d)", mp_reduce_is_2k(&a));
       mp_reduce_2k_setup(&a, &tmp);
       printf("(%d)", tmp);
-       for (ix = 0; ix < 10000; ix++) {
+       for (ix = 0; ix < 1000; ix++) {
           if (!(ix & 127)) {printf("."); fflush(stdout); }
           mp_rand(&b, (cnt/DIGIT_BIT  + 1) * 2);
           mp_copy(&c, &b);
@ -301,14 +224,11 @@ int main(void)
           }
        }
    }
-#endif
-

 /* test mp_div_3  */
-#ifdef TESTING
   printf("Testing mp_div_3...\n");
   mp_set(&d, 3);
-   for (cnt = 0; cnt < 1000000; ) {
+   for (cnt = 0; cnt < 10000; ) {
      mp_digit r1, r2;

      if (!(++cnt & 127)) printf("%9d\r", cnt);
@ -321,12 +241,10 @@ int main(void)
      }
   }
   printf("\n\nPassed div_3 testing\n");
-#endif

 /* test the DR reduction */
-#ifdef TESTING
   printf("testing mp_dr_reduce...\n");
-   for (cnt = 2; cnt < 128; cnt++) {
+   for (cnt = 2; cnt < 32; cnt++) {
       printf("%d digit modulus\n", cnt);
       mp_grow(&a, cnt);
       mp_zero(&a);
@ -334,7 +252,7 @@ int main(void)
           a.dp[ix] = MP_MASK;
       }
       a.used = cnt;
-       mp_prime_next_prime(&a, 3, 0);
+       a.dp[0] = 3;

       mp_rand(&b, cnt - 1);
       mp_copy(&b, &c);
@ -346,206 +264,16 @@ int main(void)
         mp_copy(&b, &c);

         mp_mod(&b, &a, &b);
-         mp_dr_reduce(&c, &a, (1<<DIGIT_BIT)-a.dp[0]);
+         mp_dr_reduce(&c, &a, (((mp_digit)1)<<DIGIT_BIT)-a.dp[0]);

         if (mp_cmp(&b, &c) != MP_EQ) {
            printf("Failed on trial %lu\n", rr); exit(-1);

         }
-      } while (++rr < 100000);
+      } while (++rr < 500);
      printf("Passed DR test for %d digits\n", cnt);
   }
-#endif

-#ifdef TIMER
-      /* temp. turn off TOOM */
-      TOOM_MUL_CUTOFF = TOOM_SQR_CUTOFF = 100000;
-
-      reset();
-      sleep(1);
-      CLK_PER_SEC = rdtsc();
-
-      printf("CLK_PER_SEC == %lu\n", CLK_PER_SEC);
-      
-
-      log = fopen("logs/add.log", "w");
-      for (cnt = 8; cnt <= 128; cnt += 8) {
-         SLEEP;
-         mp_rand(&a, cnt);
-         mp_rand(&b, cnt);
-         reset();
-         rr = 0;
-         do {
-            DO(mp_add(&a,&b,&c));
-            rr += 16;
-         } while (rdtsc() < (CLK_PER_SEC * 2));
-         tt = rdtsc();
-         printf("Adding\t\t%4d-bit => %9llu/sec, %9llu ticks\n", mp_count_bits(&a), (((ulong64)rr)*CLK_PER_SEC)/tt, tt);
-         fprintf(log, "%d %9llu\n", cnt*DIGIT_BIT, (((ulong64)rr)*CLK_PER_SEC)/tt); fflush(log);
-      }
-      fclose(log);
-
-      log = fopen("logs/sub.log", "w");
-      for (cnt = 8; cnt <= 128; cnt += 8) {
-         SLEEP;
-         mp_rand(&a, cnt);
-         mp_rand(&b, cnt);
-         reset();
-         rr = 0;
-         do {
-            DO(mp_sub(&a,&b,&c));
-            rr += 16;
-         } while (rdtsc() < (CLK_PER_SEC * 2));
-         tt = rdtsc();
-         printf("Subtracting\t\t%4d-bit => %9llu/sec, %9llu ticks\n", mp_count_bits(&a), (((ulong64)rr)*CLK_PER_SEC)/tt, tt);
-         fprintf(log, "%d %9llu\n", cnt*DIGIT_BIT, (((ulong64)rr)*CLK_PER_SEC)/tt);  fflush(log);
-      }
-      fclose(log);
-
-   /* do mult/square twice, first without karatsuba and second with */
-mult_test:   
-   old_kara_m = KARATSUBA_MUL_CUTOFF;
-   old_kara_s = KARATSUBA_SQR_CUTOFF;
-   for (ix = 0; ix < 2; ix++) {
-      printf("With%s Karatsuba\n", (ix==0)?"out":"");
-
-      KARATSUBA_MUL_CUTOFF = (ix==0)?9999:old_kara_m;
-      KARATSUBA_SQR_CUTOFF = (ix==0)?9999:old_kara_s;
-
-      log = fopen((ix==0)?"logs/mult.log":"logs/mult_kara.log", "w");
-      for (cnt = 32; cnt <= 288; cnt += 8) {
-         SLEEP;
-         mp_rand(&a, cnt);
-         mp_rand(&b, cnt);
-         reset();
-         rr = 0;
-         do {
-            DO(mp_mul(&a, &b, &c));
-            rr += 16;
-         } while (rdtsc() < (CLK_PER_SEC * 2));
-         tt = rdtsc();
-         printf("Multiplying\t%4d-bit => %9llu/sec, %9llu ticks\n", mp_count_bits(&a), (((ulong64)rr)*CLK_PER_SEC)/tt, tt);
-         fprintf(log, "%d %9llu\n", mp_count_bits(&a), (((ulong64)rr)*CLK_PER_SEC)/tt);  fflush(log);
-      }
-      fclose(log);
-
-      log = fopen((ix==0)?"logs/sqr.log":"logs/sqr_kara.log", "w");
-      for (cnt = 32; cnt <= 288; cnt += 8) {
-         SLEEP;
-         mp_rand(&a, cnt);
-         reset();
-         rr = 0;
-         do {
-            DO(mp_sqr(&a, &b));
-            rr += 16;
-         } while (rdtsc() < (CLK_PER_SEC * 2));
-         tt = rdtsc();
-         printf("Squaring\t%4d-bit => %9llu/sec, %9llu ticks\n", mp_count_bits(&a), (((ulong64)rr)*CLK_PER_SEC)/tt, tt);
-         fprintf(log, "%d %9llu\n", mp_count_bits(&a), (((ulong64)rr)*CLK_PER_SEC)/tt);  fflush(log);
-      }
-      fclose(log);
-
-   }
-expt_test:
-  {
-      char *primes[] = {
-         /* 2K moduli mersenne primes */
-         "6864797660130609714981900799081393217269435300143305409394463459185543183397656052122559640661454554977296311391480858037121987999716643812574028291115057151",
-         "531137992816767098689588206552468627329593117727031923199444138200403559860852242739162502265229285668889329486246501015346579337652707239409519978766587351943831270835393219031728127",
-         "10407932194664399081925240327364085538615262247266704805319112350403608059673360298012239441732324184842421613954281007791383566248323464908139906605677320762924129509389220345773183349661583550472959420547689811211693677147548478866962501384438260291732348885311160828538416585028255604666224831890918801847068222203140521026698435488732958028878050869736186900714720710555703168729087",
-         "1475979915214180235084898622737381736312066145333169775147771216478570297878078949377407337049389289382748507531496480477281264838760259191814463365330269540496961201113430156902396093989090226259326935025281409614983499388222831448598601834318536230923772641390209490231836446899608210795482963763094236630945410832793769905399982457186322944729636418890623372171723742105636440368218459649632948538696905872650486914434637457507280441823676813517852099348660847172579408422316678097670224011990280170474894487426924742108823536808485072502240519452587542875349976558572670229633962575212637477897785501552646522609988869914013540483809865681250419497686697771007",
-         "259117086013202627776246767922441530941818887553125427303974923161874019266586362086201209516800483406550695241733194177441689509238807017410377709597512042313066624082916353517952311186154862265604547691127595848775610568757931191017711408826252153849035830401185072116424747461823031471398340229288074545677907941037288235820705892351068433882986888616658650280927692080339605869308790500409503709875902119018371991620994002568935113136548829739112656797303241986517250116412703509705427773477972349821676443446668383119322540099648994051790241624056519054483690809616061625743042361721863339415852426431208737266591962061753535748892894599629195183082621860853400937932839420261866586142503251450773096274235376822938649407127700846077124211823080804139298087057504713825264571448379371125032081826126566649084251699453951887789613650248405739378594599444335231188280123660406262468609212150349937584782292237144339628858485938215738821232393687046160677362909315071",
-         "190797007524439073807468042969529173669356994749940177394741882673528979787005053706368049835514900244303495954950709725762186311224148828811920216904542206960744666169364221195289538436845390250168663932838805192055137154390912666527533007309292687539092257043362517857366624699975402375462954490293259233303137330643531556539739921926201438606439020075174723029056838272505051571967594608350063404495977660656269020823960825567012344189908927956646011998057988548630107637380993519826582389781888135705408653045219655801758081251164080554609057468028203308718724654081055323215860189611391296030471108443146745671967766308925858547271507311563765171008318248647110097614890313562856541784154881743146033909602737947385055355960331855614540900081456378659068370317267696980001187750995491090350108417050917991562167972281070161305972518044872048331306383715094854938415738549894606070722584737978176686422134354526989443028353644037187375385397838259511833166416134323695660367676897722287918773420968982326089026150031515424165462111337527431154890666327374921446276833564519776797633875503548665093914556482031482248883127023777039667707976559857333357013727342079099064400455741830654320379350833236245819348824064783585692924881021978332974949906122664421376034687815350484991",
-
-         /* DR moduli */
-         "14059105607947488696282932836518693308967803494693489478439861164411992439598399594747002144074658928593502845729752797260025831423419686528151609940203368612079",
-         "101745825697019260773923519755878567461315282017759829107608914364075275235254395622580447400994175578963163918967182013639660669771108475957692810857098847138903161308502419410142185759152435680068435915159402496058513611411688900243039",
-         "736335108039604595805923406147184530889923370574768772191969612422073040099331944991573923112581267542507986451953227192970402893063850485730703075899286013451337291468249027691733891486704001513279827771740183629161065194874727962517148100775228363421083691764065477590823919364012917984605619526140821797602431",
-         "38564998830736521417281865696453025806593491967131023221754800625044118265468851210705360385717536794615180260494208076605798671660719333199513807806252394423283413430106003596332513246682903994829528690198205120921557533726473585751382193953592127439965050261476810842071573684505878854588706623484573925925903505747545471088867712185004135201289273405614415899438276535626346098904241020877974002916168099951885406379295536200413493190419727789712076165162175783",
-         "542189391331696172661670440619180536749994166415993334151601745392193484590296600979602378676624808129613777993466242203025054573692562689251250471628358318743978285860720148446448885701001277560572526947619392551574490839286458454994488665744991822837769918095117129546414124448777033941223565831420390846864429504774477949153794689948747680362212954278693335653935890352619041936727463717926744868338358149568368643403037768649616778526013610493696186055899318268339432671541328195724261329606699831016666359440874843103020666106568222401047720269951530296879490444224546654729111504346660859907296364097126834834235287147",
-         "1487259134814709264092032648525971038895865645148901180585340454985524155135260217788758027400478312256339496385275012465661575576202252063145698732079880294664220579764848767704076761853197216563262660046602703973050798218246170835962005598561669706844469447435461092542265792444947706769615695252256130901271870341005768912974433684521436211263358097522726462083917939091760026658925757076733484173202927141441492573799914240222628795405623953109131594523623353044898339481494120112723445689647986475279242446083151413667587008191682564376412347964146113898565886683139407005941383669325997475076910488086663256335689181157957571445067490187939553165903773554290260531009121879044170766615232300936675369451260747671432073394867530820527479172464106442450727640226503746586340279816318821395210726268291535648506190714616083163403189943334431056876038286530365757187367147446004855912033137386225053275419626102417236133948503",
-         "1095121115716677802856811290392395128588168592409109494900178008967955253005183831872715423151551999734857184538199864469605657805519106717529655044054833197687459782636297255219742994736751541815269727940751860670268774903340296040006114013971309257028332849679096824800250742691718610670812374272414086863715763724622797509437062518082383056050144624962776302147890521249477060215148275163688301275847155316042279405557632639366066847442861422164832655874655824221577849928863023018366835675399949740429332468186340518172487073360822220449055340582568461568645259954873303616953776393853174845132081121976327462740354930744487429617202585015510744298530101547706821590188733515880733527449780963163909830077616357506845523215289297624086914545378511082534229620116563260168494523906566709418166011112754529766183554579321224940951177394088465596712620076240067370589036924024728375076210477267488679008016579588696191194060127319035195370137160936882402244399699172017835144537488486396906144217720028992863941288217185353914991583400421682751000603596655790990815525126154394344641336397793791497068253936771017031980867706707490224041075826337383538651825493679503771934836094655802776331664261631740148281763487765852746577808019633679",
-
-         /* generic unrestricted moduli */
-         "17933601194860113372237070562165128350027320072176844226673287945873370751245439587792371960615073855669274087805055507977323024886880985062002853331424203",
-         "2893527720709661239493896562339544088620375736490408468011883030469939904368086092336458298221245707898933583190713188177399401852627749210994595974791782790253946539043962213027074922559572312141181787434278708783207966459019479487",
-         "347743159439876626079252796797422223177535447388206607607181663903045907591201940478223621722118173270898487582987137708656414344685816179420855160986340457973820182883508387588163122354089264395604796675278966117567294812714812796820596564876450716066283126720010859041484786529056457896367683122960411136319",
-         "47266428956356393164697365098120418976400602706072312735924071745438532218237979333351774907308168340693326687317443721193266215155735814510792148768576498491199122744351399489453533553203833318691678263241941706256996197460424029012419012634671862283532342656309677173602509498417976091509154360039893165037637034737020327399910409885798185771003505320583967737293415979917317338985837385734747478364242020380416892056650841470869294527543597349250299539682430605173321029026555546832473048600327036845781970289288898317888427517364945316709081173840186150794397479045034008257793436817683392375274635794835245695887",
-         "436463808505957768574894870394349739623346440601945961161254440072143298152040105676491048248110146278752857839930515766167441407021501229924721335644557342265864606569000117714935185566842453630868849121480179691838399545644365571106757731317371758557990781880691336695584799313313687287468894148823761785582982549586183756806449017542622267874275103877481475534991201849912222670102069951687572917937634467778042874315463238062009202992087620963771759666448266532858079402669920025224220613419441069718482837399612644978839925207109870840278194042158748845445131729137117098529028886770063736487420613144045836803985635654192482395882603511950547826439092832800532152534003936926017612446606135655146445620623395788978726744728503058670046885876251527122350275750995227",
-         "11424167473351836398078306042624362277956429440521137061889702611766348760692206243140413411077394583180726863277012016602279290144126785129569474909173584789822341986742719230331946072730319555984484911716797058875905400999504305877245849119687509023232790273637466821052576859232452982061831009770786031785669030271542286603956118755585683996118896215213488875253101894663403069677745948305893849505434201763745232895780711972432011344857521691017896316861403206449421332243658855453435784006517202894181640562433575390821384210960117518650374602256601091379644034244332285065935413233557998331562749140202965844219336298970011513882564935538704289446968322281451907487362046511461221329799897350993370560697505809686438782036235372137015731304779072430260986460269894522159103008260495503005267165927542949439526272736586626709581721032189532726389643625590680105784844246152702670169304203783072275089194754889511973916207",
-         "1214855636816562637502584060163403830270705000634713483015101384881871978446801224798536155406895823305035467591632531067547890948695117172076954220727075688048751022421198712032848890056357845974246560748347918630050853933697792254955890439720297560693579400297062396904306270145886830719309296352765295712183040773146419022875165382778007040109957609739589875590885701126197906063620133954893216612678838507540777138437797705602453719559017633986486649523611975865005712371194067612263330335590526176087004421363598470302731349138773205901447704682181517904064735636518462452242791676541725292378925568296858010151852326316777511935037531017413910506921922450666933202278489024521263798482237150056835746454842662048692127173834433089016107854491097456725016327709663199738238442164843147132789153725513257167915555162094970853584447993125488607696008169807374736711297007473812256272245489405898470297178738029484459690836250560495461579533254473316340608217876781986188705928270735695752830825527963838355419762516246028680280988020401914551825487349990306976304093109384451438813251211051597392127491464898797406789175453067960072008590614886532333015881171367104445044718144312416815712216611576221546455968770801413440778423979",
-         NULL
-      };
-   log = fopen("logs/expt.log", "w");
-   logb = fopen("logs/expt_dr.log", "w");
-   logc = fopen("logs/expt_2k.log", "w");
-   for (n = 0; primes[n]; n++) {
-      SLEEP;
-      mp_read_radix(&a, primes[n], 10);
-      mp_zero(&b);
-      for (rr = 0; rr < mp_count_bits(&a); rr++) {
-         mp_mul_2(&b, &b);
-         b.dp[0] |= lbit();
-         b.used  += 1;
-      }
-      mp_sub_d(&a, 1, &c);
-      mp_mod(&b, &c, &b);
-      mp_set(&c, 3);
-      reset();
-      rr = 0;
-      do {
-         DO(mp_exptmod(&c, &b, &a, &d));
-         rr += 16;
-      } while (rdtsc() < (CLK_PER_SEC * 2));
-      tt = rdtsc();
-      mp_sub_d(&a, 1, &e);
-      mp_sub(&e, &b, &b);
-      mp_exptmod(&c, &b, &a, &e);  /* c^(p-1-b) mod a */
-      mp_mulmod(&e, &d, &a, &d);   /* c^b * c^(p-1-b) == c^p-1 == 1 */
-      if (mp_cmp_d(&d, 1)) {
-         printf("Different (%d)!!!\n", mp_count_bits(&a));
-         draw(&d);
-         exit(0);
-      }
-      printf("Exponentiating\t%4d-bit => %9llu/sec, %9llu ticks\n", mp_count_bits(&a), (((ulong64)rr)*CLK_PER_SEC)/tt, tt);
-      fprintf((n < 6) ? logc : (n < 13) ? logb : log, "%d %9llu\n", mp_count_bits(&a), (((ulong64)rr)*CLK_PER_SEC)/tt);
-   }
-   }
-   fclose(log);
-   fclose(logb);
-   fclose(logc);
-
-   log = fopen("logs/invmod.log", "w");
-   for (cnt = 4; cnt <= 128; cnt += 4) {
-      SLEEP;
-      mp_rand(&a, cnt);
-      mp_rand(&b, cnt);
-
-      do {
-         mp_add_d(&b, 1, &b);
-         mp_gcd(&a, &b, &c);
-      } while (mp_cmp_d(&c, 1) != MP_EQ);
-
-      reset();
-      rr = 0;
-      do {
-         DO(mp_invmod(&b, &a, &c));
-         rr += 16;
-      } while (rdtsc() < (CLK_PER_SEC * 2));
-      tt = rdtsc();
-      mp_mulmod(&b, &c, &a, &d);
-      if (mp_cmp_d(&d, 1) != MP_EQ) {
-         printf("Failed to invert\n");
-         return 0;
-      }
-      printf("Inverting mod\t%4d-bit => %9llu/sec, %9llu ticks\n", mp_count_bits(&a), (((ulong64)rr)*CLK_PER_SEC)/tt, tt);
-      fprintf(log, "%d %9llu\n", cnt*DIGIT_BIT, (((ulong64)rr)*CLK_PER_SEC)/tt);
-   }
-   fclose(log);
-
-   return 0;
-
-#endif

   div2_n = mul2_n = inv_n = expt_n = lcm_n = gcd_n = add_n =
   sub_n = mul_n = div_n = sqr_n = mul2d_n = div2d_n = cnt = add_d_n = sub_d_n= 0;
--- a/demo/timing.c
+++ b/demo/timing.c
@ -0,0 +1,291 @@
+#include <tommath.h>
+#include <time.h>
+
+ulong64 _tt;   /* NOTE(review): not referenced anywhere else in this file -- confirm whether it is still needed */
+
+#ifdef IOWNANATHLON   /* opt-in build flag: pause before each timed operand size */
+#include <unistd.h>   /* only pulled in on this branch */
+#define SLEEP sleep(4)   /* SLEEP; at the top of each benchmark iteration becomes a 4-second sleep */
+#else
+#define SLEEP   /* expands to nothing, so each SLEEP; in main() is an empty statement */
+#endif
+
+
+void ndraw(mp_int *a, char *name)   /* prints "<name>: ", then `a` as written by mp_toradix() with radix 64, then a newline */
+{
+   char buf[4096];   /* NOTE(review): fixed size, and mp_toradix() below is given no length -- confirm operands always fit */
+   printf("%s: ", name);
+   mp_toradix(a, buf, 64);   /* return value is ignored */
+   printf("%s\n", buf);
+}
+
+static void draw(mp_int *a)   /* file-local shorthand: ndraw() with an empty label, so the printed line begins with ": " */
+{
+   ndraw(a, "");
+}
+
+
+unsigned long lfsr = 0xAAAAAAAAUL;   /* shift-register state read and advanced by lbit(); nothing else in this file writes it */
+
+int lbit(void)   /* returns 1 if bit 31 of lfsr is set, else 0, and advances the state by one left shift */
+{
+   if (lfsr & 0x80000000UL) {
+      lfsr = ((lfsr << 1) ^ 0x8000001BUL) & 0xFFFFFFFFUL;   /* top bit was set: shift, XOR in the feedback constant, mask back to 32 bits */
+      return 1;
+   } else {
+      lfsr <<= 1;   /* top bit clear: plain shift; no mask is applied on this path */
+      return 0;
+   }
+}
+
+#if defined(__i386__) || defined(_M_IX86) || defined(_M_AMD64)   /* these targets get a dedicated TIMFUNC(); all others use clock() via the #else at the bottom */
+/* RDTSC from Scott Duplichan */
+static ulong64 TIMFUNC (void)
+   {
+   #if defined __GNUC__
+      #ifdef __i386__
+         ulong64 a;
+         __asm__ __volatile__ ("rdtsc ":"=A" (a));   /* result of the instruction arrives in `a` through the "=A" output operand */
+         return a;
+      #else /* gcc-IA64 version. NOTE(review): reached only when __GNUC__ is set, __i386__ is not, and the outer #if passed via _M_IX86/_M_AMD64 -- confirm this is ever compiled */
+         unsigned long result;
+         __asm__ __volatile__("mov %0=ar.itc" : "=r"(result) :: "memory");
+         while (__builtin_expect ((int) result == -1, 0))   /* re-read for as long as the value truncated to int equals -1 */
+         __asm__ __volatile__("mov %0=ar.itc" : "=r"(result) :: "memory");
+         return result;
+      #endif
+
+   // Microsoft and Intel Windows compilers. NOTE(review): the _M_IX86 branch below contains no return statement; presumably it relies on rdtsc leaving its result in the return registers -- confirm
+   #elif defined _M_IX86
+     __asm rdtsc
+   #elif defined _M_AMD64
+     return __rdtsc ();
+   #elif defined _M_IA64
+     #if defined __INTEL_COMPILER
+       #include <ia64intrin.h>
+     #endif
+      return __getReg (3116);
+   #else
+     #error need rdtsc function for this build
+   #endif
+   }
+#else
+#define TIMFUNC clock   /* fallback: TIMFUNC() becomes clock(), so every "cycles" figure printed by main() is in clock() ticks */
+#endif
+
+#define DO(x) x; x;   /* expands to the statement twice; main() shifts each measured delta right by 1 to compensate. Not wrapped in do/while(0), so use it only inside braces */
+//#define DO4(x) DO2(x); DO2(x);
+//#define DO8(x) DO4(x); DO4(x);
+//#define DO(x)  DO8(x); DO8(x);
+
+int main(void)   /* benchmark driver: times add, sub, mul, sqr, exptmod and invmod, printing each result and writing one log file per operation under logs/ */
+{
+   ulong64 tt, gg, CLK_PER_SEC;   /* tt = smallest sample so far, gg = current sample, CLK_PER_SEC = TIMFUNC ticks counted across sleep(1) */
+   FILE *log, *logb, *logc;
+   mp_int a, b, c, d, e, f;
+   int n, cnt, ix, old_kara_m, old_kara_s;
+   unsigned rr;
+
+   mp_init(&a);   /* NOTE(review): none of the mp_init() return values are checked */
+   mp_init(&b);
+   mp_init(&c);
+   mp_init(&d);
+   mp_init(&e);
+   mp_init(&f);
+
+   srand(time(NULL));
+ 
+
+      /* temp. turn off TOOM */
+      TOOM_MUL_CUTOFF = TOOM_SQR_CUTOFF = 100000;
+
+      CLK_PER_SEC = TIMFUNC();
+      sleep(1);   /* NOTE(review): <unistd.h> is only included above when IOWNANATHLON is defined -- confirm sleep() is declared in other builds */
+      CLK_PER_SEC = TIMFUNC() - CLK_PER_SEC;   /* ticks elapsed across the one-second sleep */
+
+      printf("CLK_PER_SEC == %llu\n", CLK_PER_SEC);
+      
+      log = fopen("logs/add.log", "w");   /* NOTE(review): fopen() results are used unchecked here and for every log file below */
+      for (cnt = 8; cnt <= 128; cnt += 8) {
+         SLEEP;
+         mp_rand(&a, cnt);
+         mp_rand(&b, cnt);
+         rr = 0;
+         tt = -1;   /* tt is unsigned, so this starts it at its maximum value */
+         do {
+            gg = TIMFUNC();
+            DO(mp_add(&a,&b,&c));
+            gg = (TIMFUNC() - gg)>>1;   /* DO() ran the call twice, so halve the elapsed ticks */
+            if (tt > gg) tt = gg;   /* keep the minimum over all repetitions */
+         } while (++rr < 100000);
+         printf("Adding\t\t%4d-bit => %9llu/sec, %9llu cycles\n", mp_count_bits(&a), CLK_PER_SEC/tt, tt);   /* NOTE(review): divides by tt; a 0-tick sample (possible with a coarse TIMFUNC) makes this a division by zero. Same pattern in every printf below */
+         fprintf(log, "%d %9llu\n", cnt*DIGIT_BIT, tt); fflush(log);
+      }
+      fclose(log);
+
+      log = fopen("logs/sub.log", "w");
+      for (cnt = 8; cnt <= 128; cnt += 8) {
+         SLEEP;
+         mp_rand(&a, cnt);
+         mp_rand(&b, cnt);
+         rr = 0;
+         tt = -1;
+         do {
+            gg = TIMFUNC();
+            DO(mp_sub(&a,&b,&c));
+            gg = (TIMFUNC() - gg)>>1;
+            if (tt > gg) tt = gg;
+         } while (++rr < 100000);
+
+         printf("Subtracting\t\t%4d-bit => %9llu/sec, %9llu cycles\n", mp_count_bits(&a), CLK_PER_SEC/tt, tt);
+         fprintf(log, "%d %9llu\n", cnt*DIGIT_BIT, tt);  fflush(log);
+      }
+      fclose(log);
+
+   /* do mult/square twice, first without karatsuba and second with */
+   old_kara_m = KARATSUBA_MUL_CUTOFF;
+   old_kara_s = KARATSUBA_SQR_CUTOFF;
+   for (ix = 0; ix < 1; ix++) {   /* NOTE(review): the bound is 1, so only the ix==0 ("without Karatsuba") pass runs despite the comment above -- confirm intent */
+      printf("With%s Karatsuba\n", (ix==0)?"out":"");
+
+      KARATSUBA_MUL_CUTOFF = (ix==0)?9999:old_kara_m;   /* ix==0 raises both cutoffs to 9999; otherwise the saved values are restored */
+      KARATSUBA_SQR_CUTOFF = (ix==0)?9999:old_kara_s;
+
+      log = fopen((ix==0)?"logs/mult.log":"logs/mult_kara.log", "w");
+      for (cnt = 32; cnt <= 288; cnt += 8) {
+         SLEEP;
+         mp_rand(&a, cnt);
+         mp_rand(&b, cnt);
+         rr = 0;
+         tt = -1;
+         do {
+            gg = TIMFUNC();
+            DO(mp_mul(&a, &b, &c));
+            gg = (TIMFUNC() - gg)>>1;
+            if (tt > gg) tt = gg;
+         } while (++rr < 100);
+         printf("Multiplying\t%4d-bit => %9llu/sec, %9llu cycles\n", mp_count_bits(&a), CLK_PER_SEC/tt, tt);
+         fprintf(log, "%d %9llu\n", mp_count_bits(&a), tt);  fflush(log);
+      }
+      fclose(log);
+
+      log = fopen((ix==0)?"logs/sqr.log":"logs/sqr_kara.log", "w");
+      for (cnt = 32; cnt <= 288; cnt += 8) {
+         SLEEP;
+         mp_rand(&a, cnt);
+         rr = 0;
+         tt = -1;
+         do {
+            gg = TIMFUNC();
+            DO(mp_sqr(&a, &b));
+            gg = (TIMFUNC() - gg)>>1;
+            if (tt > gg) tt = gg;
+         } while (++rr < 100);
+         printf("Squaring\t%4d-bit => %9llu/sec, %9llu cycles\n", mp_count_bits(&a), CLK_PER_SEC/tt, tt);
+         fprintf(log, "%d %9llu\n", mp_count_bits(&a), tt);  fflush(log);
+      }
+      fclose(log);
+
+   }
+
+  {
+      char *primes[] = {
+         /* 2K moduli mersenne primes */
+         "6864797660130609714981900799081393217269435300143305409394463459185543183397656052122559640661454554977296311391480858037121987999716643812574028291115057151",
+         "531137992816767098689588206552468627329593117727031923199444138200403559860852242739162502265229285668889329486246501015346579337652707239409519978766587351943831270835393219031728127",
+         "10407932194664399081925240327364085538615262247266704805319112350403608059673360298012239441732324184842421613954281007791383566248323464908139906605677320762924129509389220345773183349661583550472959420547689811211693677147548478866962501384438260291732348885311160828538416585028255604666224831890918801847068222203140521026698435488732958028878050869736186900714720710555703168729087",
+         "1475979915214180235084898622737381736312066145333169775147771216478570297878078949377407337049389289382748507531496480477281264838760259191814463365330269540496961201113430156902396093989090226259326935025281409614983499388222831448598601834318536230923772641390209490231836446899608210795482963763094236630945410832793769905399982457186322944729636418890623372171723742105636440368218459649632948538696905872650486914434637457507280441823676813517852099348660847172579408422316678097670224011990280170474894487426924742108823536808485072502240519452587542875349976558572670229633962575212637477897785501552646522609988869914013540483809865681250419497686697771007",
+         "259117086013202627776246767922441530941818887553125427303974923161874019266586362086201209516800483406550695241733194177441689509238807017410377709597512042313066624082916353517952311186154862265604547691127595848775610568757931191017711408826252153849035830401185072116424747461823031471398340229288074545677907941037288235820705892351068433882986888616658650280927692080339605869308790500409503709875902119018371991620994002568935113136548829739112656797303241986517250116412703509705427773477972349821676443446668383119322540099648994051790241624056519054483690809616061625743042361721863339415852426431208737266591962061753535748892894599629195183082621860853400937932839420261866586142503251450773096274235376822938649407127700846077124211823080804139298087057504713825264571448379371125032081826126566649084251699453951887789613650248405739378594599444335231188280123660406262468609212150349937584782292237144339628858485938215738821232393687046160677362909315071",
+         "190797007524439073807468042969529173669356994749940177394741882673528979787005053706368049835514900244303495954950709725762186311224148828811920216904542206960744666169364221195289538436845390250168663932838805192055137154390912666527533007309292687539092257043362517857366624699975402375462954490293259233303137330643531556539739921926201438606439020075174723029056838272505051571967594608350063404495977660656269020823960825567012344189908927956646011998057988548630107637380993519826582389781888135705408653045219655801758081251164080554609057468028203308718724654081055323215860189611391296030471108443146745671967766308925858547271507311563765171008318248647110097614890313562856541784154881743146033909602737947385055355960331855614540900081456378659068370317267696980001187750995491090350108417050917991562167972281070161305972518044872048331306383715094854938415738549894606070722584737978176686422134354526989443028353644037187375385397838259511833166416134323695660367676897722287918773420968982326089026150031515424165462111337527431154890666327374921446276833564519776797633875503548665093914556482031482248883127023777039667707976559857333357013727342079099064400455741830654320379350833236245819348824064783585692924881021978332974949906122664421376034687815350484991",
+
+         /* DR moduli */
+         "14059105607947488696282932836518693308967803494693489478439861164411992439598399594747002144074658928593502845729752797260025831423419686528151609940203368612079",
+         "101745825697019260773923519755878567461315282017759829107608914364075275235254395622580447400994175578963163918967182013639660669771108475957692810857098847138903161308502419410142185759152435680068435915159402496058513611411688900243039",
+         "736335108039604595805923406147184530889923370574768772191969612422073040099331944991573923112581267542507986451953227192970402893063850485730703075899286013451337291468249027691733891486704001513279827771740183629161065194874727962517148100775228363421083691764065477590823919364012917984605619526140821797602431",
+         "38564998830736521417281865696453025806593491967131023221754800625044118265468851210705360385717536794615180260494208076605798671660719333199513807806252394423283413430106003596332513246682903994829528690198205120921557533726473585751382193953592127439965050261476810842071573684505878854588706623484573925925903505747545471088867712185004135201289273405614415899438276535626346098904241020877974002916168099951885406379295536200413493190419727789712076165162175783",
+         "542189391331696172661670440619180536749994166415993334151601745392193484590296600979602378676624808129613777993466242203025054573692562689251250471628358318743978285860720148446448885701001277560572526947619392551574490839286458454994488665744991822837769918095117129546414124448777033941223565831420390846864429504774477949153794689948747680362212954278693335653935890352619041936727463717926744868338358149568368643403037768649616778526013610493696186055899318268339432671541328195724261329606699831016666359440874843103020666106568222401047720269951530296879490444224546654729111504346660859907296364097126834834235287147",
+         "1487259134814709264092032648525971038895865645148901180585340454985524155135260217788758027400478312256339496385275012465661575576202252063145698732079880294664220579764848767704076761853197216563262660046602703973050798218246170835962005598561669706844469447435461092542265792444947706769615695252256130901271870341005768912974433684521436211263358097522726462083917939091760026658925757076733484173202927141441492573799914240222628795405623953109131594523623353044898339481494120112723445689647986475279242446083151413667587008191682564376412347964146113898565886683139407005941383669325997475076910488086663256335689181157957571445067490187939553165903773554290260531009121879044170766615232300936675369451260747671432073394867530820527479172464106442450727640226503746586340279816318821395210726268291535648506190714616083163403189943334431056876038286530365757187367147446004855912033137386225053275419626102417236133948503",
+         "1095121115716677802856811290392395128588168592409109494900178008967955253005183831872715423151551999734857184538199864469605657805519106717529655044054833197687459782636297255219742994736751541815269727940751860670268774903340296040006114013971309257028332849679096824800250742691718610670812374272414086863715763724622797509437062518082383056050144624962776302147890521249477060215148275163688301275847155316042279405557632639366066847442861422164832655874655824221577849928863023018366835675399949740429332468186340518172487073360822220449055340582568461568645259954873303616953776393853174845132081121976327462740354930744487429617202585015510744298530101547706821590188733515880733527449780963163909830077616357506845523215289297624086914545378511082534229620116563260168494523906566709418166011112754529766183554579321224940951177394088465596712620076240067370589036924024728375076210477267488679008016579588696191194060127319035195370137160936882402244399699172017835144537488486396906144217720028992863941288217185353914991583400421682751000603596655790990815525126154394344641336397793791497068253936771017031980867706707490224041075826337383538651825493679503771934836094655802776331664261631740148281763487765852746577808019633679",
+
+         /* generic unrestricted moduli */
+         "17933601194860113372237070562165128350027320072176844226673287945873370751245439587792371960615073855669274087805055507977323024886880985062002853331424203",
+         "2893527720709661239493896562339544088620375736490408468011883030469939904368086092336458298221245707898933583190713188177399401852627749210994595974791782790253946539043962213027074922559572312141181787434278708783207966459019479487",
+         "347743159439876626079252796797422223177535447388206607607181663903045907591201940478223621722118173270898487582987137708656414344685816179420855160986340457973820182883508387588163122354089264395604796675278966117567294812714812796820596564876450716066283126720010859041484786529056457896367683122960411136319",
+         "47266428956356393164697365098120418976400602706072312735924071745438532218237979333351774907308168340693326687317443721193266215155735814510792148768576498491199122744351399489453533553203833318691678263241941706256996197460424029012419012634671862283532342656309677173602509498417976091509154360039893165037637034737020327399910409885798185771003505320583967737293415979917317338985837385734747478364242020380416892056650841470869294527543597349250299539682430605173321029026555546832473048600327036845781970289288898317888427517364945316709081173840186150794397479045034008257793436817683392375274635794835245695887",
+         "436463808505957768574894870394349739623346440601945961161254440072143298152040105676491048248110146278752857839930515766167441407021501229924721335644557342265864606569000117714935185566842453630868849121480179691838399545644365571106757731317371758557990781880691336695584799313313687287468894148823761785582982549586183756806449017542622267874275103877481475534991201849912222670102069951687572917937634467778042874315463238062009202992087620963771759666448266532858079402669920025224220613419441069718482837399612644978839925207109870840278194042158748845445131729137117098529028886770063736487420613144045836803985635654192482395882603511950547826439092832800532152534003936926017612446606135655146445620623395788978726744728503058670046885876251527122350275750995227",
+         "11424167473351836398078306042624362277956429440521137061889702611766348760692206243140413411077394583180726863277012016602279290144126785129569474909173584789822341986742719230331946072730319555984484911716797058875905400999504305877245849119687509023232790273637466821052576859232452982061831009770786031785669030271542286603956118755585683996118896215213488875253101894663403069677745948305893849505434201763745232895780711972432011344857521691017896316861403206449421332243658855453435784006517202894181640562433575390821384210960117518650374602256601091379644034244332285065935413233557998331562749140202965844219336298970011513882564935538704289446968322281451907487362046511461221329799897350993370560697505809686438782036235372137015731304779072430260986460269894522159103008260495503005267165927542949439526272736586626709581721032189532726389643625590680105784844246152702670169304203783072275089194754889511973916207",
+         "1214855636816562637502584060163403830270705000634713483015101384881871978446801224798536155406895823305035467591632531067547890948695117172076954220727075688048751022421198712032848890056357845974246560748347918630050853933697792254955890439720297560693579400297062396904306270145886830719309296352765295712183040773146419022875165382778007040109957609739589875590885701126197906063620133954893216612678838507540777138437797705602453719559017633986486649523611975865005712371194067612263330335590526176087004421363598470302731349138773205901447704682181517904064735636518462452242791676541725292378925568296858010151852326316777511935037531017413910506921922450666933202278489024521263798482237150056835746454842662048692127173834433089016107854491097456725016327709663199738238442164843147132789153725513257167915555162094970853584447993125488607696008169807374736711297007473812256272245489405898470297178738029484459690836250560495461579533254473316340608217876781986188705928270735695752830825527963838355419762516246028680280988020401914551825487349990306976304093109384451438813251211051597392127491464898797406789175453067960072008590614886532333015881171367104445044718144312416815712216611576221546455968770801413440778423979",
+         NULL
+      };
+   log = fopen("logs/expt.log", "w");
+   logb = fopen("logs/expt_dr.log", "w");
+   logc = fopen("logs/expt_2k.log", "w");
+   for (n = 0; primes[n]; n++) {   /* walk the table until its NULL terminator */
+      SLEEP;
+      mp_read_radix(&a, primes[n], 10);
+      mp_zero(&b);
+      for (rr = 0; rr < (unsigned)mp_count_bits(&a); rr++) {   /* build exponent b one lbit() at a time, one bit per bit of the modulus */
+         mp_mul_2(&b, &b);
+         b.dp[0] |= lbit();
+         b.used  += 1;   /* NOTE(review): bumps the digit count by hand on every iteration; the intent is not evident from this file -- confirm */
+      }
+      mp_sub_d(&a, 1, &c);
+      mp_mod(&b, &c, &b);   /* reduce the exponent modulo a-1 */
+      mp_set(&c, 3);   /* base for the timed exponentiation */
+         rr = 0;
+         tt = -1;
+         do {
+            gg = TIMFUNC();
+            DO(mp_exptmod(&c, &b, &a, &d));
+            gg = (TIMFUNC() - gg)>>1;
+            if (tt > gg) tt = gg;
+         } while (++rr < 10);
+      mp_sub_d(&a, 1, &e);
+      mp_sub(&e, &b, &b);
+      mp_exptmod(&c, &b, &a, &e);  /* c^(p-1-b) mod a */
+      mp_mulmod(&e, &d, &a, &d);   /* c^b * c^(p-1-b) == c^p-1 == 1 */
+      if (mp_cmp_d(&d, 1)) {   /* any nonzero comparison result means the product was not 1 */
+         printf("Different (%d)!!!\n", mp_count_bits(&a));
+         draw(&d);
+         exit(0);   /* NOTE(review): exits with status 0 even though the self-check failed; the three log files are not closed first */
+      }
+      printf("Exponentiating\t%4d-bit => %9llu/sec, %9llu cycles\n", mp_count_bits(&a), CLK_PER_SEC/tt, tt);
+      fprintf((n < 6) ? logc : (n < 13) ? logb : log, "%d %9llu\n", mp_count_bits(&a), tt);   /* table entries 0-5 go to expt_2k.log, 6-12 to expt_dr.log, the rest to expt.log */
+   }
+   }
+   fclose(log);
+   fclose(logb);
+   fclose(logc);
+
+   log = fopen("logs/invmod.log", "w");
+   for (cnt = 4; cnt <= 128; cnt += 4) {
+      SLEEP;
+      mp_rand(&a, cnt);
+      mp_rand(&b, cnt);
+
+      do {   /* step b upward until gcd(a, b) compares equal to 1 */
+         mp_add_d(&b, 1, &b);
+         mp_gcd(&a, &b, &c);
+      } while (mp_cmp_d(&c, 1) != MP_EQ);
+
+         rr = 0;
+         tt = -1;
+      do {
+         gg = TIMFUNC();
+         DO(mp_invmod(&b, &a, &c));
+         gg = (TIMFUNC() - gg)>>1;
+         if (tt > gg) tt = gg;
+      } while (++rr < 1000);
+      mp_mulmod(&b, &c, &a, &d);   /* self-check: b * c mod a is compared against 1 below */
+      if (mp_cmp_d(&d, 1) != MP_EQ) {
+         printf("Failed to invert\n");
+         return 0;   /* NOTE(review): returns 0 on failure and leaves `log` open */
+      }
+      printf("Inverting mod\t%4d-bit => %9llu/sec, %9llu cycles\n", mp_count_bits(&a), CLK_PER_SEC/tt, tt);
+      fprintf(log, "%d %9llu\n", cnt*DIGIT_BIT, tt);
+   }
+   fclose(log);
+
+   return 0;
+}
+
--- a/etc/makefile
+++ b/etc/makefile
@ -46,4 +46,5 @@ mont: mont.o

        
 clean:
-	rm -f *.log *.o *.obj *.exe pprime tune mersenne drprime tune86 tune86l mont 2kprime pprime.dat
+	rm -f *.log *.o *.obj *.exe pprime tune mersenne drprime tune86 tune86l mont 2kprime pprime.dat \
+         *.da *.dyn *.dpi *~
--- a/etc/makefile.icc
+++ b/etc/makefile.icc
@ -0,0 +1,67 @@
+CC = icc
+
+CFLAGS += -I../
+
+# optimize for SPEED
+#
+# -mcpu= can be pentium, pentiumpro (covers PII through PIII) or pentium4
+# -ax?   specifies make code specifically for ? but compatible with IA-32
+# -x?    specifies compile solely for ? [not specifically IA-32 compatible]
+#
+# where ? is 
+#   K - PIII
+#   W - first P4 [Willamette]
+#   N - P4 Northwood
+#   P - P4 Prescott
+#   B - Blend of P4 and PM [mobile]
+#
+# Default to just generic max opts
+CFLAGS += -O3 -xN -ip
+
+# default lib name (requires install with root)
+# LIBNAME=-ltommath
+
+# libname when you can't install the lib with install
+LIBNAME=../libtommath.a
+
+#provable primes
+pprime: pprime.o
+	$(CC) pprime.o $(LIBNAME) -o pprime
+
+# portable [well requires clock()] tuning app
+tune: tune.o
+	$(CC) tune.o $(LIBNAME) -o tune
+	
+# same app but using RDTSC for higher precision [requires 80586+], coff based gcc installs [e.g. ming, cygwin, djgpp]
+tune86: tune.c
+	nasm -f coff timer.asm
+	$(CC) -DX86_TIMER $(CFLAGS) tune.c timer.o  $(LIBNAME) -o tune86
+	
+# for cygwin
+tune86c: tune.c
+	nasm -f gnuwin32 timer.asm
+	$(CC) -DX86_TIMER $(CFLAGS) tune.c timer.o  $(LIBNAME) -o tune86
+
+#make tune86 for linux or any ELF format
+tune86l: tune.c
+	nasm -f elf -DUSE_ELF timer.asm
+	$(CC) -DX86_TIMER $(CFLAGS) tune.c timer.o $(LIBNAME) -o tune86l
+        
+# spits out mersenne primes
+mersenne: mersenne.o
+	$(CC) mersenne.o $(LIBNAME) -o mersenne
+
+# finds DR safe primes for the given config
+drprime: drprime.o
+	$(CC) drprime.o $(LIBNAME) -o drprime
+	
+# finds 2k safe primes for the given config
+2kprime: 2kprime.o
+	$(CC) 2kprime.o $(LIBNAME) -o 2kprime
+
+mont: mont.o
+	$(CC) mont.o $(LIBNAME) -o mont
+
+        
+clean:
+	rm -f *.log *.o *.obj *.exe pprime tune mersenne drprime tune86 tune86l mont 2kprime pprime.dat *.il
--- a/logs/add.log
+++ b/logs/add.log
@ -1,16 +1,16 @@
-224  20297071
-448  15151383
-672  13088682
-896  11111587
-1120   9240621
-1344   8221878
-1568   7227434
-1792   6718051
-2016   6042524
-2240   5685200
-2464   5240465
-2688   4818032
-2912   4412794
-3136   4155883
-3360   3927078
-3584   3722138
+224      1572
+448      1740
+672      1902
+896      2116
+1120      2324
+1344      2484
+1568      2548
+1792      2772
+2016      2958
+2240      3058
+2464      3276
+2688      3436
+2912      3542
+3136      3702
+3360      3926
+3584      4074
--- a/logs/addsub.png
+++ b/logs/addsub.png
--- a/logs/expt.log
+++ b/logs/expt.log
@ -1,7 +1,7 @@
-513       745
-769       282
-1025       130
-2049        20
-2561        11
-3073         6
-4097         2
+513  19933908
+769  55707832
+1025 119872576
+2049 856114218
+2561 1602741360
+3073 2718192748
+4097 6264335828
--- a/logs/expt.png
+++ b/logs/expt.png
--- a/logs/expt_2k.log
+++ b/logs/expt_2k.log
@ -1,6 +1,6 @@
-521       783
-607       585
-1279       138
-2203        39
-3217        15
-4253         6
+521  18847776
+607  24665920
+1279 110036220
+2203 414562036
+3217 1108350966
+4253 2286079370
--- a/logs/expt_dr.log
+++ b/logs/expt_dr.log
@ -1,7 +1,7 @@
-532      1296
-784       551
-1036       283
-1540       109
-2072        52
-3080        18
-4116         7
+532   9656134
+784  23022274
+1036  45227854
+1540 129652848
+2072 280625626
+3080 845619480
+4116 1866206400
--- a/logs/graphs.dem
+++ b/logs/graphs.dem
@ -1,17 +1,17 @@
-set terminal png
-set size 1.75
-set ylabel "Operations per Second"
-set xlabel "Operand size (bits)"
-
-set output "addsub.png"
-plot 'add.log' smooth bezier title "Addition", 'sub.log' smooth bezier title "Subtraction"
-
-set output "mult.png"
-plot 'sqr.log' smooth bezier title "Squaring (without Karatsuba)", 'sqr_kara.log' smooth bezier title "Squaring (Karatsuba)", 'mult.log' smooth bezier title "Multiplication (without Karatsuba)", 'mult_kara.log' smooth bezier title "Multiplication (Karatsuba)"
-
-set output "expt.png"
-plot 'expt.log' smooth bezier title "Exptmod (Montgomery)", 'expt_dr.log' smooth bezier title "Exptmod (Dimminished Radix)", 'expt_2k.log' smooth bezier title "Exptmod (2k Reduction)"
-
-set output "invmod.png"
-plot 'invmod.log' smooth bezier title "Modular Inverse"
-
+set terminal png
+set size 1.75
+set ylabel "Cycles per Operation"
+set xlabel "Operand size (bits)"
+
+set output "addsub.png"
+plot 'add.log' smooth bezier title "Addition", 'sub.log' smooth bezier title "Subtraction"
+
+set output "mult.png"
+plot 'sqr.log' smooth bezier title "Squaring (without Karatsuba)", 'sqr_kara.log' smooth bezier title "Squaring (Karatsuba)", 'mult.log' smooth bezier title "Multiplication (without Karatsuba)", 'mult_kara.log' smooth bezier title "Multiplication (Karatsuba)"
+
+set output "expt.png"
+plot 'expt.log' smooth bezier title "Exptmod (Montgomery)", 'expt_dr.log' smooth bezier title "Exptmod (Dimminished Radix)", 'expt_2k.log' smooth bezier title "Exptmod (2k Reduction)"
+
+set output "invmod.png"
+plot 'invmod.log' smooth bezier title "Modular Inverse"
+
--- a/logs/invmod.log
+++ b/logs/invmod.log
@ -1,32 +0,0 @@
-112     17364
-224      8643
-336      8867
-448      6228
-560      4737
-672      2259
-784      2899
-896      1497
-1008      1238
-1120      1010
-1232       870
-1344      1265
-1456      1102
-1568       981
-1680       539
-1792       484
-1904       722
-2016       392
-2128       604
-2240       551
-2352       511
-2464       469
-2576       263
-2688       247
-2800       227
-2912       354
-3024       336
-3136       312
-3248       296
-3360       166
-3472       155
-3584       248
--- a/logs/invmod.png
+++ b/logs/invmod.png
--- a/logs/k7/README
+++ b/logs/k7/README
@ -1,13 +0,0 @@
-To use the pretty graphs you have to first build/run the ltmtest from the root directory of the package.  
-Todo this type 
-
-make timing ; ltmtest
-
-in the root.  It will run for a while [about ten minutes on most PCs] and produce a series of .log files in logs/.
-
-After doing that run "gnuplot graphs.dem" to make the PNGs.  If you managed todo that all so far just open index.html to view
-them all :-)
-
-Have fun
-
-Tom
--- a/logs/k7/add.log
+++ b/logs/k7/add.log
@ -1,16 +0,0 @@
-224  11069160
-448   9156136
-672   8089755
-896   7399424
-1120   6389352
-1344   5818648
-1568   5257112
-1792   4982160
-2016   4527856
-2240   4325312
-2464   4051760
-2688   3767640
-2912   3612520
-3136   3415208
-3360   3258656
-3584   3113360
--- a/logs/k7/addsub.png
+++ b/logs/k7/addsub.png
--- a/logs/k7/expt.log
+++ b/logs/k7/expt.log
@ -1,7 +0,0 @@
-513       664
-769       256
-1025       117
-2049        17
-2561         9
-3073         5
-4097         2
--- a/logs/k7/expt.png
+++ b/logs/k7/expt.png
--- a/logs/k7/expt_dr.log
+++ b/logs/k7/expt_dr.log
@ -1,7 +0,0 @@
-532      1088
-784       460
-1036       240
-1540        92
-2072        43
-3080        15
-4116         6
--- a/logs/k7/graphs.dem
+++ b/logs/k7/graphs.dem
@ -1,17 +0,0 @@
-set terminal png color
-set size 1.75
-set ylabel "Operations per Second"
-set xlabel "Operand size (bits)"
-
-set output "addsub.png"
-plot 'add.log' smooth bezier title "Addition", 'sub.log' smooth bezier title "Subtraction"
-
-set output "mult.png"
-plot 'sqr.log' smooth bezier title "Squaring (without Karatsuba)", 'sqr_kara.log' smooth bezier title "Squaring (Karatsuba)", 'mult.log' smooth bezier title "Multiplication (without Karatsuba)", 'mult_kara.log' smooth bezier title "Multiplication (Karatsuba)"
-
-set output "expt.png"
-plot 'expt.log' smooth bezier title "Exptmod (Montgomery)", 'expt_dr.log' smooth bezier title "Exptmod (Dimminished Radix)"
-
-set output "invmod.png"
-plot 'invmod.log' smooth bezier title "Modular Inverse"
-
--- a/logs/k7/index.html
+++ b/logs/k7/index.html
@ -1,24 +0,0 @@
-<html>
-<head>
-<title>LibTomMath Log Plots</title>
-</head>
-<body>
-
-<h1>Addition and Subtraction</h1>
-<center><img src=addsub.png></center>
-<hr>
-
-<h1>Multipliers</h1>
-<center><img src=mult.png></center>
-<hr>
-
-<h1>Exptmod</h1>
-<center><img src=expt.png></center>
-<hr>
-
-<h1>Modular Inverse</h1>
-<center><img src=invmod.png></center>
-<hr>
-
-</body>
-</html>
--- a/logs/k7/invmod.log
+++ b/logs/k7/invmod.log
@ -1,32 +0,0 @@
-112     16248
-224      8192
-336      5320
-448      3560
-560      2728
-672      2064
-784      1704
-896      2176
-1008      1184
-1120       976
-1232      1280
-1344      1176
-1456       624
-1568       912
-1680       504
-1792       452
-1904       658
-2016       608
-2128       336
-2240       312
-2352       288
-2464       264
-2576       408
-2688       376
-2800       354
-2912       198
-3024       307
-3136       173
-3248       162
-3360       256
-3472       145
-3584       226
--- a/logs/k7/invmod.png
+++ b/logs/k7/invmod.png
--- a/logs/k7/mult.log
+++ b/logs/k7/mult.log
@ -1,17 +0,0 @@
-896    322904
-1344    151592
-1792     90472
-2240     59984
-2688     42624
-3136     31872
-3584     24704
-4032     19704
-4480     16096
-4928     13376
-5376     11272
-5824      9616
-6272      8360
-6720      7304
-7168      1664
-7616      1472
-8064      1328
--- a/logs/k7/mult.png
+++ b/logs/k7/mult.png
--- a/logs/k7/mult_kara.log
+++ b/logs/k7/mult_kara.log
@ -1,17 +0,0 @@
-896    322872
-1344    151688
-1792     90480
-2240     59984
-2688     42656
-3136     32144
-3584     25840
-4032     21328
-4480     17856
-4928     14928
-5376     12856
-5824     11256
-6272      9880
-6720      8984
-7168      7928
-7616      7200
-8064      6576
--- a/logs/k7/sqr.log
+++ b/logs/k7/sqr.log
@ -1,17 +0,0 @@
-896    415472
-1344    223736
-1792    141232
-2240     97624
-2688     71400
-3136     54800
-3584     16904
-4032     13528
-4480     10968
-4928      9128
-5376      7784
-5824      6672
-6272      5760
-6720      5056
-7168      4440
-7616      3952
-8064      3512
--- a/logs/k7/sqr_kara.log
+++ b/logs/k7/sqr_kara.log
@ -1,17 +0,0 @@
-896    420464
-1344    224800
-1792    142808
-2240     97704
-2688     71416
-3136     54504
-3584     38320
-4032     32360
-4480     27576
-4928     23840
-5376     20688
-5824     18264
-6272     16176
-6720     14440
-7168     11688
-7616     10752
-8064      9936
--- a/logs/k7/sub.log
+++ b/logs/k7/sub.log
@ -1,16 +0,0 @@
-224   9728504
-448   8573648
-672   7488096
-896   6714064
-1120   5950472
-1344   5457400
-1568   5038896
-1792   4683632
-2016   4384656
-2240   4105976
-2464   3871608
-2688   3650680
-2912   3463552
-3136   3290016
-3360   3135272
-3584   2993848
--- a/logs/mult.log
+++ b/logs/mult.log
@ -1,33 +1,33 @@
-920    374785
-1142    242737
-1371    176704
-1596    134341
-1816    105537
-2044     85089
-2268     70051
-2490     58671
-2716     49851
-2937     42881
-3162     37288
-3387     32697
-3608     28915
-3836     25759
-4057     23088
-4284     20800
-4508     18827
-4730     17164
-4956     15689
-5180     14397
-5398     13260
-5628     12249
-5852     11346
-6071     10537
-6298      9812
-6522      9161
-6742      8572
-6971      8038
-7195      2915
-7419      2744
-7644      2587
-7866      2444
-8090      2311
+923     45612
+1143     68010
+1370     94894
+1596    126514
+1820    163014
+2044    203564
+2268    249156
+2492    299226
+2716    354138
+2940    413022
+3163    477406
+3387    545876
+3612    619044
+3835    696754
+4060    779174
+4284    866216
+4508    958100
+4731   1055898
+4954   1162294
+5179   1267654
+5404   1377572
+5628   1503736
+5852   1622310
+6076   1746624
+6299   1875390
+6524   2009086
+6748   2145990
+6971   2289044
+7196   2891644
+7418   3064792
+7644   3249780
+7868   3455868
+8092   3644238
--- a/logs/mult.png
+++ b/logs/mult.png
--- a/logs/mult_kara.log
+++ b/logs/mult_kara.log
@ -1,33 +1,33 @@
-924    374171
-1147    243163
-1371    177111
-1596    134465
-1819    105619
-2044     85145
-2266     70086
-2488     58717
-2715     49869
-2939     42894
-3164     37389
-3387     33510
-3610     29993
-3836     27205
-4060     24751
-4281     22576
-4508     20670
-4732     19019
-4954     17527
-5180     16217
-5404     15044
-5624     14003
-5849     13051
-6076     12067
-6300     11438
-6524     10772
-6748     10298
-6972      9715
-7195      9330
-7416      8836
-7644      8465
-7864      8042
-8091      7735
+921     92388
+1148     61410
+1372     43799
+1594     33047
+1819     26913
+2043     21996
+2268     18453
+2492     15623
+2715     13378
+2940     11626
+3164     10252
+3385      9291
+3610      8348
+3835      7615
+4060      6928
+4283      6401
+4508      5836
+4732      5387
+4955      4985
+5178      4614
+5404      4300
+5622      4005
+5852      3742
+6073      3502
+6298      3262
+6524      3137
+6748      2967
+6971      2807
+7195      2679
+7420      2571
+7643      2442
+7867      2324
+8091      2235
--- a/logs/p4/README
+++ b/logs/p4/README
@ -1,13 +0,0 @@
-To use the pretty graphs you have to first build/run the ltmtest from the root directory of the package.  
-Todo this type 
-
-make timing ; ltmtest
-
-in the root.  It will run for a while [about ten minutes on most PCs] and produce a series of .log files in logs/.
-
-After doing that run "gnuplot graphs.dem" to make the PNGs.  If you managed todo that all so far just open index.html to view
-them all :-)
-
-Have fun
-
-Tom
--- a/logs/p4/add.log
+++ b/logs/p4/add.log
@ -1,16 +0,0 @@
-224   8113248
-448   6585584
-672   5687678
-896   4761144
-1120   4111592
-1344   3995154
-1568   3532387
-1792   3225400
-2016   2963960
-2240   2720112
-2464   2533952
-2688   2307168
-2912   2287064
-3136   2150160
-3360   2035992
-3584   1936304
--- a/logs/p4/addsub.png
+++ b/logs/p4/addsub.png
--- a/logs/p4/expt.log
+++ b/logs/p4/expt.log
@ -1,7 +0,0 @@
-513       195
-769        68
-1025        31
-2049         4
-2561         2
-3073         1
-4097         0
--- a/logs/p4/expt.png
+++ b/logs/p4/expt.png
--- a/logs/p4/expt_dr.log
+++ b/logs/p4/expt_dr.log
@ -1,7 +0,0 @@
-532       393
-784       158
-1036        79
-1540        27
-2072        12
-3080         4
-4116         1
--- a/logs/p4/graphs.dem
+++ b/logs/p4/graphs.dem
@ -1,17 +0,0 @@
-set terminal png color
-set size 1.75
-set ylabel "Operations per Second"
-set xlabel "Operand size (bits)"
-
-set output "addsub.png"
-plot 'add.log' smooth bezier title "Addition", 'sub.log' smooth bezier title "Subtraction"
-
-set output "mult.png"
-plot 'sqr.log' smooth bezier title "Squaring (without Karatsuba)", 'sqr_kara.log' smooth bezier title "Squaring (Karatsuba)", 'mult.log' smooth bezier title "Multiplication (without Karatsuba)", 'mult_kara.log' smooth bezier title "Multiplication (Karatsuba)"
-
-set output "expt.png"
-plot 'expt.log' smooth bezier title "Exptmod (Montgomery)", 'expt_dr.log' smooth bezier title "Exptmod (Dimminished Radix)"
-
-set output "invmod.png"
-plot 'invmod.log' smooth bezier title "Modular Inverse"
-
--- a/logs/p4/index.html
+++ b/logs/p4/index.html
@ -1,24 +0,0 @@
-<html>
-<head>
-<title>LibTomMath Log Plots</title>
-</head>
-<body>
-
-<h1>Addition and Subtraction</h1>
-<center><img src=addsub.png></center>
-<hr>
-
-<h1>Multipliers</h1>
-<center><img src=mult.png></center>
-<hr>
-
-<h1>Exptmod</h1>
-<center><img src=expt.png></center>
-<hr>
-
-<h1>Modular Inverse</h1>
-<center><img src=invmod.png></center>
-<hr>
-
-</body>
-</html>
--- a/logs/p4/invmod.log
+++ b/logs/p4/invmod.log
@ -1,32 +0,0 @@
-112     13608
-224      6872
-336      4264
-448      2792
-560      2144
-672      1560
-784      1296
-896      1672
-1008       896
-1120       736
-1232      1024
-1344       888
-1456       472
-1568       680
-1680       373
-1792       328
-1904       484
-2016       436
-2128       232
-2240       211
-2352       200
-2464       177
-2576       293
-2688       262
-2800       251
-2912       137
-3024       216
-3136       117
-3248       113
-3360       181
-3472        98
-3584       158
--- a/logs/p4/invmod.png
+++ b/logs/p4/invmod.png
--- a/logs/p4/mult.log
+++ b/logs/p4/mult.log
@ -1,17 +0,0 @@
-896     77600
-1344     35776
-1792     19688
-2240     13248
-2688      9424
-3136      7056
-3584      5464
-4032      4368
-4480      3568
-4928      2976
-5376      2520
-5824      2152
-6272      1872
-6720      1632
-7168       650
-7616       576
-8064       515
--- a/logs/p4/mult.png
+++ b/logs/p4/mult.png
--- a/logs/p4/mult_kara.log
+++ b/logs/p4/mult_kara.log
@ -1,17 +0,0 @@
-896     77752
-1344     35832
-1792     19688
-2240     14704
-2688     10832
-3136      8336
-3584      6600
-4032      5424
-4480      4648
-4928      3976
-5376      3448
-5824      3016
-6272      2664
-6720      2384
-7168      2120
-7616      1912
-8064      1752
--- a/logs/p4/sqr.log
+++ b/logs/p4/sqr.log
@ -1,17 +0,0 @@
-896    128088
-1344     63640
-1792     37968
-2240     25488
-2688     18176
-3136     13672
-3584      4920
-4032      3912
-4480      3160
-4928      2616
-5376      2216
-5824      1896
-6272      1624
-6720      1408
-7168      1240
-7616      1096
-8064       984
--- a/logs/p4/sqr_kara.log
+++ b/logs/p4/sqr_kara.log
@ -1,17 +0,0 @@
-896    127456
-1344     63752
-1792     37920
-2240     25440
-2688     18200
-3136     13728
-3584     10968
-4032      9072
-4480      7608
-4928      6440
-5376      5528
-5824      4768
-6272      4328
-6720      3888
-7168      3504
-7616      3176
-8064      2896
--- a/logs/p4/sub.log
+++ b/logs/p4/sub.log
@ -1,16 +0,0 @@
-224   7355896
-448   6162880
-672   5218984
-896   4622776
-1120   3999320
-1344   3629480
-1568   3290384
-1792   2954752
-2016   2737056
-2240   2563320
-2464   2451928
-2688   2310920
-2912   2139048
-3136   2034080
-3360   1890800
-3584   1808624
--- a/logs/sqr.log
+++ b/logs/sqr.log
@ -1,33 +1,33 @@
-922    471095
-1147    337137
-1366    254327
-1596    199732
-1819    161225
-2044    132852
-2268    111493
-2490     94864
-2715     81745
-2940     71187
-3162     62575
-3387     55418
-3612     14540
-3836     12944
-4060     11627
-4281     10546
-4508      9502
-4730      8688
-4954      7937
-5180      7273
-5402      6701
-5627      6189
-5850      5733
-6076      5310
-6300      4933
-6522      4631
-6748      4313
-6971      4064
-7196      3801
-7420      3576
-7642      3388
-7868      3191
-8092      3020
+924     26026
+1146     37682
+1370     51714
+1595     68130
+1820     86850
+2043    107880
+2267    131236
+2490    156828
+2716    184704
+2940    214934
+3162    247424
+3388    282494
+3608    308390
+3834    345978
+4060    386156
+4282    427648
+4505    471556
+4731    517948
+4954    566396
+5180    618292
+5402    670130
+5628    725674
+5852    783310
+6076    843480
+6300    905136
+6524    969132
+6748   1033680
+6971   1100912
+7195   1170954
+7420   1252576
+7643   1325038
+7867   1413890
+8091   1493140
--- a/logs/sqr_kara.log
+++ b/logs/sqr_kara.log
@ -1,33 +1,33 @@
-922    470930
-1148    337217
-1372    254433
-1596    199827
-1820    161204
-2043    132871
-2267    111522
-2488     94932
-2714     81814
-2939     71231
-3164     62616
-3385     55467
-3611     44426
-3836     40695
-4060     37391
-4283     34371
-4508     31779
-4732     29499
-4956     27426
-5177     25598
-5403     23944
-5628     22416
-5851     21052
-6076     19781
-6299     18588
-6523     17539
-6746     16618
-6972     15705
-7196     13582
-7420     13004
-7643     12496
-7868     11963
-8092     11497
+923    165854
+1146    112539
+1372     80388
+1595     60051
+1820     47498
+2044     38017
+2268     31935
+2492     27373
+2714     23798
+2939     20630
+3164     18198
+3388     16191
+3612     14538
+3836     13038
+4058     11683
+4284     10915
+4508      9998
+4731      9271
+4954      8555
+5180      7910
+5404      7383
+5628      7012
+5852      6527
+6075      6175
+6299      5737
+6524      5398
+6744      5110
+6971      4864
+7196      4567
+7420      4371
+7644      4182
+7868      3981
+8092      3758
--- a/logs/sub.log
+++ b/logs/sub.log
@ -1,16 +1,16 @@
-224  16370431
-448  13327848
-672  11009401
-896   9125342
-1120   7930419
-1344   7114040
-1568   6506998
-1792   5899346
-2016   5435327
-2240   5038931
-2464   4696364
-2688   4425678
-2912   4134476
-3136   3913280
-3360   3692536
-3584   3505219
+224      2012
+448      2208
+672      2366
+896      2532
+1120      2682
+1344      2838
+1568      3016
+1792      3146
+2016      3318
+2240      3538
+2464      3756
+2688      3914
+2912      4060
+3136      4216
+3360      4392
+3584      4550
--- a/40
+++ b/40
@ -12,7 +12,10 @@ CFLAGS += -O3 -funroll-loops
 #x86 optimizations [should be valid for any GCC install though]
 CFLAGS  += -fomit-frame-pointer

-VERSION=0.30
+#debug
+#CFLAGS += -g3
+
+VERSION=0.31

 default: libtommath.a

@ -20,7 +23,7 @@ default: libtommath.a
 LIBNAME=libtommath.a
 HEADERS=tommath.h

-#LIBPATH-The directory for libtomcrypt to be installed to.
+#LIBPATH-The directory for libtommath to be installed to.
 #INCPATH-The directory to install the header files for libtommath.
 #DATAPATH-The directory to install the pdf docs.
 DESTDIR=
@ -58,6 +61,30 @@ libtommath.a:  $(OBJECTS)
 	$(AR) $(ARFLAGS) libtommath.a $(OBJECTS)
 	ranlib libtommath.a

+
+#make a profiled library (takes a while!!!)
+#
+# This will build the library with profile generation
+# then run the test demo and rebuild the library.
+# 
+# So far I've seen improvements in the MP math
+profiled:
+	make CFLAGS="$(CFLAGS) -fprofile-arcs -DTESTING" timing
+	./ltmtest
+	rm -f *.a *.o ltmtest
+	make CFLAGS="$(CFLAGS) -fbranch-probabilities"
+
+#make a single object profiled library 
+profiled_single:
+	perl gen.pl
+	$(CC) $(CFLAGS) -fprofile-arcs -DTESTING -c mpi.c -o mpi.o
+	$(CC) $(CFLAGS) -DTESTING -DTIMER demo/timing.c mpi.o -o ltmtest
+	./ltmtest
+	rm -f *.o ltmtest
+	$(CC) $(CFLAGS) -fbranch-probabilities -DTESTING -c mpi.c -o mpi.o
+	$(AR) $(ARFLAGS) libtommath.a mpi.o
+	ranlib libtommath.a	
+
 install: libtommath.a
 	install -d -g root -o root $(DESTDIR)$(LIBPATH)
 	install -d -g root -o root $(DESTDIR)$(INCPATH)
@ -71,7 +98,7 @@ mtest: test
 	cd mtest ; $(CC) $(CFLAGS) mtest.c -o mtest -s
        
 timing: libtommath.a
-	$(CC) $(CFLAGS) -DTIMER demo/demo.c libtommath.a -o ltmtest -s
+	$(CC) $(CFLAGS) -DTIMER demo/timing.c libtommath.a -o ltmtest -s

 # makes the LTM book DVI file, requires tetex, perl and makeindex [part of tetex I think]
 docdvi: tommath.src
@ -106,10 +133,13 @@ mandvi: bn.tex
 manual:	mandvi
 	pdflatex bn >/dev/null
 	rm -f bn.aux bn.dvi bn.log bn.idx bn.lof bn.out bn.toc
-	
+
+pretty: 
+	perl pretty.build
+
 clean:
 	rm -f *.bat *.pdf *.o *.a *.obj *.lib *.exe *.dll etclib/*.o demo/demo.o test ltmtest mpitest mtest/mtest mtest/mtest.exe \
-        *.idx *.toc *.log *.aux *.dvi *.lof *.ind *.ilg *.ps *.log *.s mpi.c 
+        *.idx *.toc *.log *.aux *.dvi *.lof *.ind *.ilg *.ps *.log *.s mpi.c *.da *.dyn *.dpi tommath.tex *~ demo/*~ etc/*~
 	cd etc ; make clean
 	cd pics ; make clean

--- a/makefile.bcc
+++ b/makefile.bcc
@ -30,7 +30,8 @@ bn_mp_reduce_2k.obj bn_mp_reduce_is_2k.obj bn_mp_reduce_2k_setup.obj \
 bn_mp_radix_smap.obj bn_mp_read_radix.obj bn_mp_toradix.obj bn_mp_radix_size.obj \
 bn_mp_fread.obj bn_mp_fwrite.obj bn_mp_cnt_lsb.obj bn_error.obj \
 bn_mp_init_multi.obj bn_mp_clear_multi.obj bn_prime_sizes_tab.obj bn_mp_exteuclid.obj bn_mp_toradix_n.obj \
-bn_mp_prime_random_ex.obj bn_mp_get_int.obj bn_mp_sqrt.obj bn_mp_is_square.obj
+bn_mp_prime_random_ex.obj bn_mp_get_int.obj bn_mp_sqrt.obj bn_mp_is_square.obj \
+bn_mp_init_set.obj bn_mp_init_set_int.obj 

 TARGET = libtommath.lib

--- a/makefile.cygwin_dll
+++ b/makefile.cygwin_dll
@ -35,7 +35,8 @@ bn_mp_reduce_2k.o bn_mp_reduce_is_2k.o bn_mp_reduce_2k_setup.o \
 bn_mp_radix_smap.o bn_mp_read_radix.o bn_mp_toradix.o bn_mp_radix_size.o \
 bn_mp_fread.o bn_mp_fwrite.o bn_mp_cnt_lsb.o bn_error.o \
 bn_mp_init_multi.o bn_mp_clear_multi.o bn_prime_sizes_tab.o bn_mp_exteuclid.o bn_mp_toradix_n.o \
-bn_mp_prime_random_ex.o bn_mp_get_int.o bn_mp_sqrt.o bn_mp_is_square.o
+bn_mp_prime_random_ex.o bn_mp_get_int.o bn_mp_sqrt.o bn_mp_is_square.o bn_mp_init_set.o \
+bn_mp_init_set_int.o

 # make a Windows DLL via Cygwin
 windll:  $(OBJECTS)
--- a/makefile.icc
+++ b/makefile.icc
@ -0,0 +1,110 @@
+#Makefile for ICC
+#
+#Tom St Denis
+CC=icc
+
+CFLAGS  +=  -I./
+
+# optimize for SPEED
+#
+# -mcpu= can be pentium, pentiumpro (covers PII through PIII) or pentium4
+# -ax?   specifies make code specifically for ? but compatible with IA-32
+# -x?    specifies compile solely for ? [not specifically IA-32 compatible]
+#
+# where ? is 
+#   K - PIII
+#   W - first P4 [Willamette]
+#   N - P4 Northwood
+#   P - P4 Prescott
+#   B - Blend of P4 and PM [mobile]
+#
+# Default to just generic max opts
+CFLAGS += -O3 -xN
+
+default: libtommath.a
+
+#default files to install
+LIBNAME=libtommath.a
+HEADERS=tommath.h
+
+#LIBPATH-The directory for libtommath to be installed to.
+#INCPATH-The directory to install the header files for libtommath.
+#DATAPATH-The directory to install the pdf docs.
+DESTDIR=
+LIBPATH=/usr/lib
+INCPATH=/usr/include
+DATAPATH=/usr/share/doc/libtommath/pdf
+
+OBJECTS=bncore.o bn_mp_init.o bn_mp_clear.o bn_mp_exch.o bn_mp_grow.o bn_mp_shrink.o \
+bn_mp_clamp.o bn_mp_zero.o  bn_mp_set.o bn_mp_set_int.o bn_mp_init_size.o bn_mp_copy.o \
+bn_mp_init_copy.o bn_mp_abs.o bn_mp_neg.o bn_mp_cmp_mag.o bn_mp_cmp.o bn_mp_cmp_d.o \
+bn_mp_rshd.o bn_mp_lshd.o bn_mp_mod_2d.o bn_mp_div_2d.o bn_mp_mul_2d.o bn_mp_div_2.o \
+bn_mp_mul_2.o bn_s_mp_add.o bn_s_mp_sub.o bn_fast_s_mp_mul_digs.o bn_s_mp_mul_digs.o \
+bn_fast_s_mp_mul_high_digs.o bn_s_mp_mul_high_digs.o bn_fast_s_mp_sqr.o bn_s_mp_sqr.o \
+bn_mp_add.o bn_mp_sub.o bn_mp_karatsuba_mul.o bn_mp_mul.o bn_mp_karatsuba_sqr.o \
+bn_mp_sqr.o bn_mp_div.o bn_mp_mod.o bn_mp_add_d.o bn_mp_sub_d.o bn_mp_mul_d.o \
+bn_mp_div_d.o bn_mp_mod_d.o bn_mp_expt_d.o bn_mp_addmod.o bn_mp_submod.o \
+bn_mp_mulmod.o bn_mp_sqrmod.o bn_mp_gcd.o bn_mp_lcm.o bn_fast_mp_invmod.o bn_mp_invmod.o \
+bn_mp_reduce.o bn_mp_montgomery_setup.o bn_fast_mp_montgomery_reduce.o bn_mp_montgomery_reduce.o \
+bn_mp_exptmod_fast.o bn_mp_exptmod.o bn_mp_2expt.o bn_mp_n_root.o bn_mp_jacobi.o bn_reverse.o \
+bn_mp_count_bits.o bn_mp_read_unsigned_bin.o bn_mp_read_signed_bin.o bn_mp_to_unsigned_bin.o \
+bn_mp_to_signed_bin.o bn_mp_unsigned_bin_size.o bn_mp_signed_bin_size.o  \
+bn_mp_xor.o bn_mp_and.o bn_mp_or.o bn_mp_rand.o bn_mp_montgomery_calc_normalization.o \
+bn_mp_prime_is_divisible.o bn_prime_tab.o bn_mp_prime_fermat.o bn_mp_prime_miller_rabin.o \
+bn_mp_prime_is_prime.o bn_mp_prime_next_prime.o bn_mp_dr_reduce.o \
+bn_mp_dr_is_modulus.o bn_mp_dr_setup.o bn_mp_reduce_setup.o \
+bn_mp_toom_mul.o bn_mp_toom_sqr.o bn_mp_div_3.o bn_s_mp_exptmod.o \
+bn_mp_reduce_2k.o bn_mp_reduce_is_2k.o bn_mp_reduce_2k_setup.o \
+bn_mp_radix_smap.o bn_mp_read_radix.o bn_mp_toradix.o bn_mp_radix_size.o \
+bn_mp_fread.o bn_mp_fwrite.o bn_mp_cnt_lsb.o bn_error.o \
+bn_mp_init_multi.o bn_mp_clear_multi.o bn_prime_sizes_tab.o bn_mp_exteuclid.o bn_mp_toradix_n.o \
+bn_mp_prime_random_ex.o bn_mp_get_int.o bn_mp_sqrt.o bn_mp_is_square.o bn_mp_init_set.o \
+bn_mp_init_set_int.o
+
+libtommath.a:  $(OBJECTS)
+	$(AR) $(ARFLAGS) libtommath.a $(OBJECTS)
+	ranlib libtommath.a
+
+#make a profiled library (takes a while!!!)
+#
+# This will build the library with profile generation
+# then run the test demo and rebuild the library.
+# 
+# So far I've seen improvements in the MP math
+profiled:
+	make -f makefile.icc CFLAGS="$(CFLAGS) -prof_gen -DTESTING" timing
+	./ltmtest
+	rm -f *.a *.o ltmtest
+	make -f makefile.icc CFLAGS="$(CFLAGS) -prof_use"
+
+#make a single object profiled library (profile data comes from running the -DTIMER build of demo/timing.c)
+profiled_single:
+	perl gen.pl
+	$(CC) $(CFLAGS) -prof_gen -DTESTING -c mpi.c -o mpi.o
+	$(CC) $(CFLAGS) -DTESTING -DTIMER demo/timing.c mpi.o -o ltmtest
+	./ltmtest
+	rm -f *.o ltmtest
+	$(CC) $(CFLAGS) -prof_use -ip -DTESTING -c mpi.c -o mpi.o
+	$(AR) $(ARFLAGS) libtommath.a mpi.o
+	ranlib libtommath.a	
+
+install: libtommath.a
+	install -d -g root -o root $(DESTDIR)$(LIBPATH)
+	install -d -g root -o root $(DESTDIR)$(INCPATH)
+	install -g root -o root $(LIBNAME) $(DESTDIR)$(LIBPATH)
+	install -g root -o root $(HEADERS) $(DESTDIR)$(INCPATH)
+
+test: libtommath.a demo/demo.o
+	$(CC) demo/demo.o libtommath.a -o test
+	
+mtest: test	
+	cd mtest ; $(CC) $(CFLAGS) mtest.c -o mtest
+        
+timing: libtommath.a
+	$(CC) $(CFLAGS) -DTIMER demo/timing.c libtommath.a -o ltmtest
+
+clean:
+	rm -f *.bat *.pdf *.o *.a *.obj *.lib *.exe *.dll etclib/*.o demo/demo.o test ltmtest mpitest mtest/mtest mtest/mtest.exe \
+        *.idx *.toc *.log *.aux *.dvi *.lof *.ind *.ilg *.ps *.log *.s mpi.c *.il etc/*.il *.dyn
+	cd etc ; make clean
+	cd pics ; make clean
--- a/makefile.msvc
+++ b/makefile.msvc
@ -29,7 +29,8 @@ bn_mp_reduce_2k.obj bn_mp_reduce_is_2k.obj bn_mp_reduce_2k_setup.obj \
 bn_mp_radix_smap.obj bn_mp_read_radix.obj bn_mp_toradix.obj bn_mp_radix_size.obj \
 bn_mp_fread.obj bn_mp_fwrite.obj bn_mp_cnt_lsb.obj bn_error.obj \
 bn_mp_init_multi.obj bn_mp_clear_multi.obj bn_prime_sizes_tab.obj bn_mp_exteuclid.obj bn_mp_toradix_n.obj \
-bn_mp_prime_random_ex.obj bn_mp_get_int.obj bn_mp_sqrt.obj bn_mp_is_square.obj
+bn_mp_prime_random_ex.obj bn_mp_get_int.obj bn_mp_sqrt.obj bn_mp_is_square.obj \
+bn_mp_init_set.obj bn_mp_init_set_int.obj

 library: $(OBJECTS)
 	lib /out:tommath.lib $(OBJECTS)
--- a/poster.pdf
+++ b/poster.pdf
--- a/pre_gen/mpi.c
+++ b/pre_gen/mpi.c
@ -452,7 +452,7 @@ fast_s_mp_mul_digs (mp_int * a, mp_int * b, mp_int * c, int digs)
  }

  /* setup dest */
-  olduse = c->used;
+  olduse  = c->used;
  c->used = digs;

  {
@ -779,7 +779,7 @@ mp_2expt (mp_int * a, int b)
  a->used = b / DIGIT_BIT + 1;

  /* put the single bit in its place */
-  a->dp[b / DIGIT_BIT] = 1 << (b % DIGIT_BIT);
+  a->dp[b / DIGIT_BIT] = ((mp_digit)1) << (b % DIGIT_BIT);

  return MP_OKAY;
 }
@ -1142,10 +1142,14 @@ mp_clamp (mp_int * a)
 void
 mp_clear (mp_int * a)
 {
+  int i;
+
  /* only do anything if a hasn't been freed previously */
  if (a->dp != NULL) {
    /* first zero the digits */
-    memset (a->dp, 0, sizeof (mp_digit) * a->used);
+    for (i = 0; i < a->used; i++) {
+        a->dp[i] = 0;
+    }

    /* free ram */
    XFREE(a->dp);
@ -1677,7 +1681,7 @@ int mp_div (mp_int * a, mp_int * b, mp_int * c, mp_int * d)
   */
  
  /* get sign before writing to c */
-  x.sign = a->sign;
+  x.sign = x.used == 0 ? MP_ZPOS : a->sign;

  if (c != NULL) {
    mp_clamp (&q);
@ -3083,15 +3087,22 @@ int mp_grow (mp_int * a, int size)
 */
 #include <tommath.h>

-/* init a new bigint */
+/* init a new mp_int */
 int mp_init (mp_int * a)
 {
+  int i;
+
  /* allocate memory required and clear it */
-  a->dp = OPT_CAST(mp_digit) XCALLOC (sizeof (mp_digit), MP_PREC);
+  a->dp = OPT_CAST(mp_digit) XMALLOC (sizeof (mp_digit) * MP_PREC);
  if (a->dp == NULL) {
    return MP_MEM;
  }

+  /* set the digits to zero */
+  for (i = 0; i < MP_PREC; i++) {
+      a->dp[i] = 0;
+  }
+
  /* set the used to zero, allocated digits to the default precision
   * and sign to positive */
  a->used  = 0;
@ -3753,9 +3764,6 @@ int mp_karatsuba_mul (mp_int * a, mp_int * b, mp_int * c)
    goto X0Y0;

  /* now shift the digits */
-  x0.sign = x1.sign = a->sign;
-  y0.sign = y1.sign = b->sign;
-
  x0.used = y0.used = B;
  x1.used = a->used - B;
  y1.used = b->used - B;
@ -4484,7 +4492,7 @@ int mp_mul (mp_int * a, mp_int * b, mp_int * c)
      res = s_mp_mul (a, b, c);
    }
  }
-  c->sign = neg;
+  c->sign = (c->used > 0) ? neg : MP_ZPOS;
  return res;
 }

@ -6090,7 +6098,8 @@ mp_reduce_2k_setup(mp_int *a, mp_digit *d)
 /* determines if mp_reduce_2k can be used */
 int mp_reduce_is_2k(mp_int *a)
 {
-   int ix, iy, iz, iw;
+   int ix, iy, iw;
+   mp_digit iz;
   
   if (a->used == 0) {
      return 0;
@ -6107,7 +6116,7 @@ int mp_reduce_is_2k(mp_int *a)
             return 0;
          }
          iz <<= 1;
-          if (iz > (int)MP_MASK) {
+          if (iz > (mp_digit)MP_MASK) {
             ++iw;
             iz = 1;
          }
@ -8396,14 +8405,16 @@ s_mp_sub (mp_int * a, mp_int * b, mp_int * c)

 CPU                    /Compiler     /MUL CUTOFF/SQR CUTOFF
 -------------------------------------------------------------
- Intel P4               /GCC v3.2     /        70/       108
- AMD Athlon XP          /GCC v3.2     /       109/       127
-
+ Intel P4 Northwood     /GCC v3.3.3   /        59/        81/profiled build
+ Intel P4 Northwood     /GCC v3.3.3   /        59/        80/profiled_single build
+ Intel P4 Northwood     /ICC v8.0     /        57/        70/profiled build
+ Intel P4 Northwood     /ICC v8.0     /        54/        76/profiled_single build
+ AMD Athlon XP          /GCC v3.2     /       109/       127/
+ 
 */

-/* configured for a AMD XP Thoroughbred core with etc/tune.c */
-int     KARATSUBA_MUL_CUTOFF = 109,      /* Min. number of digits before Karatsuba multiplication is used. */
-        KARATSUBA_SQR_CUTOFF = 127,      /* Min. number of digits before Karatsuba squaring is used. */
+int     KARATSUBA_MUL_CUTOFF = 57,      /* Min. number of digits before Karatsuba multiplication is used. */
+        KARATSUBA_SQR_CUTOFF = 70,      /* Min. number of digits before Karatsuba squaring is used. */
        
        TOOM_MUL_CUTOFF      = 350,      /* no optimal values of these are known yet so set em high */
        TOOM_SQR_CUTOFF      = 400; 
--- a/pretty.build
+++ b/pretty.build
@ -0,0 +1,66 @@
+#!/bin/perl -w
+#
+# Cute little builder for perl 
+# Total waste of development time...
+#
+# This will build all the object files and then the archive .a file
+# requires GCC, GNU make and a sense of humour.
+#
+# Tom St Denis
+use strict;
+
+my $count = 0;
+my $starttime = time;
+my $rate  = 0;
+print "Scanning for source files...\n";
+foreach my $filename (glob "*.c") {
+       ++$count;
+}
+print "Source files to build: $count\nBuilding...\n";
+my $i = 0;
+my $lines = 0;
+my $filesbuilt = 0;
+foreach my $filename (glob "*.c") {       # one pass per source file: draw progress, then build its object
+       printf("Building %3.2f%%, ", (++$i/$count)*100.0);
+       if ($i % 4 == 0) { print "/, "; }  # four-frame spinner keyed on the file index
+       if ($i % 4 == 1) { print "-, "; }
+       if ($i % 4 == 2) { print "\\, "; }
+       if ($i % 4 == 3) { print "|, "; }
+       if ($rate > 0) {                   # an ETA is only printed once a files-per-second rate exists
+           my $tleft = ($count - $i) / $rate;
+           my $tsec  = $tleft%60;
+           my $tmin  = ($tleft/60)%60;
+           my $thour = int($tleft/3600);  # hours are the top unit, so they must not wrap at 60
+           printf("%2d:%02d:%02d left, ", $thour, $tmin, $tsec);
+       }
+       my $cnt = ($i/$count)*30.0;        # filled portion of the 30-column progress bar
+       my $x   = 0;
+       print "[";
+       for (; $x < $cnt; $x++) { print "#"; }
+       for (; $x < 30; $x++)   { print " "; }
+       print "]\r";                       # carriage return so the next status line overwrites this one
+       my $tmp = $filename;
+       $tmp =~ s/\.c$/.o/;                # foo.c -> foo.o; anchored so only the extension is rewritten
+       if (open(SRC, "<$tmp")) {          # object file can already be opened: nothing to build
+          close SRC;
+       } else {
+          !system("make $tmp > /dev/null 2>/dev/null") or die "\nERROR: Failed to make $tmp!!!\n";
+          open( SRC, "<$filename" ) or die "Couldn't open $filename for reading: $!";
+          ++$lines while (<SRC>);         # count source lines for the final summary
+          close SRC or die "Error closing $filename after reading: $!";
+          ++$filesbuilt;
+       }      
+
+       # refresh the files-per-second rate; skipped while no whole second has elapsed (avoids dividing by zero)
+       if (time != $starttime) {
+          my $delay = time - $starttime;
+          $rate = $i/$delay;
+       }
+}
+
+# finish building the library 
+printf("\nFinished building source (%d seconds, %3.2f files per second).\n", time - $starttime, $rate);
+print "Compiled approximately $filesbuilt files and $lines lines of code.\n";
+print "Doing final make (building archive...)\n";
+!system("make > /dev/null 2>/dev/null") or die "\nERROR: Failed to perform last make command!!!\n";
+print "done.\n";
--- a/tommath.pdf
+++ b/tommath.pdf
--- a/tommath.src
+++ b/tommath.src
@ -258,7 +258,7 @@ floating point is meant to be implemented in hardware the precision of the manti
 a mantissa of much larger precision than hardware alone can efficiently support.  This approach could be useful where 
 scientific applications must minimize the total output error over long calculations.

-Another use for large integers is within arithmetic on polynomials of large characteristic (i.e. $GF(p)[x]$ for large $p$).
+Yet another use for large integers is within arithmetic on polynomials of large characteristic (i.e. $GF(p)[x]$ for large $p$).
 In fact the library discussed within this text has already been used to form a polynomial basis library\footnote{See \url{http://poly.libtomcrypt.org} for more details.}.

 \subsection{Benefits of Multiple Precision Arithmetic}
@ -316,7 +316,7 @@ the reader how the algorithms fit together as well as where to start on various

 \section{Discussion and Notation}
 \subsection{Notation}
-A multiple precision integer of $n$-digits shall be denoted as $x = (x_{n-1} ... x_1 x_0)_{ \beta }$ and represent
+A multiple precision integer of $n$-digits shall be denoted as $x = (x_{n-1}, \ldots, x_1, x_0)_{ \beta }$ and represent
 the integer $x \equiv \sum_{i=0}^{n-1} x_i\beta^i$.  The elements of the array $x$ are said to be the radix $\beta$ digits 
 of the integer.  For example, $x = (1,2,3)_{10}$ would represent the integer 
 $1\cdot 10^2 + 2\cdot10^1 + 3\cdot10^0 = 123$.  
@ -339,12 +339,11 @@ algorithms will be used to establish the relevant theory which will subsequently
 precision algorithm to solve the same problem.  

 \subsection{Precision Notation}
-For the purposes of this text a single precision variable must be able to represent integers in the range 
-$0 \le x < q \beta$ while a double precision variable must be able to represent integers in the range 
-$0 \le x < q \beta^2$.  The variable $\beta$ represents the radix of a single digit of a multiple precision integer and 
-must be of the form $q^p$ for $q, p \in \Z^+$.  The extra radix-$q$ factor allows additions and subtractions to proceed 
-without truncation of the carry.  Since all modern computers are binary, it is assumed that $q$ is two, for all intents 
-and purposes.
+The variable $\beta$ represents the radix of a single digit of a multiple precision integer and 
+must be of the form $q^p$ for $q, p \in \Z^+$.  A single precision variable must be able to represent integers in 
+the range $0 \le x < q \beta$ while a double precision variable must be able to represent integers in the range 
+$0 \le x < q \beta^2$.  The extra radix-$q$ factor allows additions and subtractions to proceed without truncation of the 
+carry.  Since all modern computers are binary, it is assumed that $q$ is two.

 \index{mp\_digit} \index{mp\_word}
 Within the source code that will be presented for each algorithm, the data type \textbf{mp\_digit} will represent 
@ -376,7 +375,7 @@ the $/$ division symbol is used the intention is to perform an integer division
 $5/2 = 2$ which will often be written as $\lfloor 5/2 \rfloor = 2$ for clarity.  When an expression is written as a 
 fraction a real value division is implied, for example ${5 \over 2} = 2.5$.  

-The norm of a multiple precision integer, for example, $\vert \vert x \vert \vert$ will be used to represent the number of digits in the representation
+The norm of a multiple precision integer, for example $\vert \vert x \vert \vert$, will be used to represent the number of digits in the representation
 of the integer.  For example, $\vert \vert 123 \vert \vert = 3$ and $\vert \vert 79452 \vert \vert = 5$.  

 \subsection{Work Effort}
@ -569,7 +568,7 @@ By building outwards from a base foundation instead of using a parallel design m
 highly modular.  Being highly modular is a desirable property of any project as it often means the resulting product
 has a small footprint and updates are easy to perform.  

-Usually when I start a project I will begin with the header file.  I define the data types I think I will need and 
+Usually when I start a project I will begin with the header files.  I define the data types I think I will need and 
 prototype the initial functions that are not dependent on other functions (within the library).  After I 
 implement these base functions I prototype more dependent functions and implement them.   The process repeats until
 I implement all of the functions I require.  For example, in the case of LibTomMath I implemented functions such as 
@ -619,14 +618,26 @@ any such data type but it does provide for making composite data types known as
 used within LibTomMath.

 \index{mp\_int}
-\begin{verbatim}
-typedef struct  {
-    int used, alloc, sign;
-    mp_digit *dp;
-} mp_int;
-\end{verbatim}
+\begin{figure}[here]
+\begin{center}
+\begin{small}
+%\begin{verbatim}
+\begin{tabular}{|l|}
+\hline
+typedef struct \{ \\
+\hspace{3mm}int used, alloc, sign;\\
+\hspace{3mm}mp\_digit *dp;\\
+\} \textbf{mp\_int}; \\
+\hline
+\end{tabular}
+%\end{verbatim}
+\end{small}
+\caption{The mp\_int Structure}
+\label{fig:mpint}
+\end{center}
+\end{figure}

-The mp\_int structure can be broken down as follows.
+The mp\_int structure (fig. \ref{fig:mpint}) can be broken down as follows.

 \begin{enumerate}
 \item The \textbf{used} parameter denotes how many digits of the array \textbf{dp} contain the digits used to represent
@ -701,9 +712,10 @@ fault by dereferencing memory not owned by the application.
 In the case of LibTomMath the only errors that are checked for are related to inappropriate inputs (division by zero for 
 instance) and memory allocation errors.  It will not check that the mp\_int passed to any function is valid nor 
 will it check pointers for validity.  Any function that can cause a runtime error will return an error code as an 
-\textbf{int} data type with one of the following values.
+\textbf{int} data type with one of the following values (fig. \ref{fig:errcodes}).

 \index{MP\_OKAY} \index{MP\_VAL} \index{MP\_MEM}
+\begin{figure}[here]
 \begin{center}
 \begin{tabular}{|l|l|}
 \hline \textbf{Value} & \textbf{Meaning} \\
@ -713,6 +725,9 @@ will it check pointers for validity.  Any function that can cause a runtime erro
 \hline
 \end{tabular}
 \end{center}
+\caption{LibTomMath Error Codes}
+\label{fig:errcodes}
+\end{figure}

 When an error is detected within a function it should free any memory it allocated, often during the initialization of
 temporary mp\_ints, and return as soon as possible.  The goal is to leave the system in the same state it was when the 
@ -748,6 +763,7 @@ to zero.  The \textbf{used} count set to zero and \textbf{sign} set to \textbf{M
 An mp\_int is said to be initialized if it is set to a valid, preferably default, state such that all of the members of the
 structure are set to valid values.  The mp\_init algorithm will perform such an action.

+\index{mp\_init}
 \begin{figure}[here]
 \begin{center}
 \begin{tabular}{l}
@ -770,17 +786,23 @@ structure are set to valid values.  The mp\_init algorithm will perform such an
 \end{figure}

 \textbf{Algorithm mp\_init.}
-The \textbf{MP\_PREC} name represents a constant\footnote{Defined in the ``tommath.h'' header file within LibTomMath.} 
-used to dictate the minimum precision of allocated mp\_int integers.  Ideally, it is at least equal to $32$ since for most
-purposes that will be more than enough.
+The purpose of this function is to initialize an mp\_int structure so that the rest of the library can properly
+manipulate it.  It is assumed that the input may not have had any of its members previously initialized which is certainly
+a valid assumption if the input resides on the stack.  

-Memory for the default number of digits is allocated first.  If the allocation fails the algorithm returns immediately
-with the \textbf{MP\_MEM} error code.  If the allocation succeeds the remaining members of the mp\_int structure
-must be initialized to reflect the default initial state.
+Before any of the members such as \textbf{sign}, \textbf{used} or \textbf{alloc} are initialized the memory for
+the digits is allocated.  If this fails the function returns before setting any of the other members.  The \textbf{MP\_PREC} 
+name represents a constant\footnote{Defined in the ``tommath.h'' header file within LibTomMath.} 
+used to dictate the minimum precision of newly initialized mp\_int integers.  Ideally, it is at least equal to the smallest
+precision number you'll be working with.

-The allocated digits are all set to zero (step three) to ensure they are in a known state.  The \textbf{sign}, \textbf{used}
-and \textbf{alloc} are subsequently initialized to represent the zero integer.  By step seven the algorithm returns a success 
-code and the mp\_int $a$ has been successfully initialized to a valid state representing the integer zero.  
+Allocating a block of digits at first instead of a single digit has the benefit of lowering the number of usually slow
+heap operations later functions will have to perform in the future.  If \textbf{MP\_PREC} is set correctly the slack 
+memory and the number of heap operations will be trivial.
+
+Once the allocation has been made the digits have to be set to zero as well as the \textbf{used}, \textbf{sign} and
+\textbf{alloc} members initialized.  This ensures that the mp\_int will always represent the default state of zero regardless
+of the original condition of the input.

 \textbf{Remark.}
 This function introduces the idiosyncrasy that all iterative loops, commonly initiated with the ``for'' keyword, iterate incrementally
@ -796,19 +818,21 @@ One immediate observation of this initializtion function is that it does not ret
 is assumed that the caller has already allocated memory for the mp\_int structure, typically on the application stack.  The 
 call to mp\_init() is used only to initialize the members of the structure to a known default state.  

-Before any of the other members of the structure are initialized memory from the application heap is allocated with
-the calloc() function (line @22,calloc@).  The size of the allocated memory is large enough to hold \textbf{MP\_PREC} 
-mp\_digit variables.  The calloc() function is used instead\footnote{calloc() will allocate memory in the same
-manner as malloc() except that it also sets the contents to zero upon successfully allocating the memory.} of malloc() 
-since digits have to be set to zero for the function to finish correctly.  The \textbf{OPT\_CAST} token is a macro 
-definition which will turn into a cast from void * to mp\_digit * for C++ compilers.  It is not required for C compilers.
+Here we see (line @23,XMALLOC@) the memory allocation is performed first.  This allows us to exit cleanly and quickly
+if there is an error.  If the allocation fails the routine will return \textbf{MP\_MEM} to the caller to indicate there
+was a memory error.  The function XMALLOC is what actually allocates the memory.  Technically XMALLOC is not a function
+but a macro defined in ``tommath.h''.  By default, XMALLOC will evaluate to malloc() which is the C library's built--in
+memory allocation routine.

-After the memory has been successfully allocated the remainder of the members are initialized 
+In order to assure the mp\_int is in a known state the digits must be set to zero.  On most platforms this could have been
+accomplished by using calloc() instead of malloc().  However,  to correctly initialize an integer type to a given value in a 
+portable fashion you have to actually assign the value.  The for loop (line @28,for@) performs this required
+operation.
+
+After the memory has been successfully initialized the remainder of the members are initialized 
 (lines @29,used@ through @31,sign@) to their respective default states.  At this point the algorithm has succeeded and
-a success code is returned to the calling function.
-
-If this function returns \textbf{MP\_OKAY} it is safe to assume the mp\_int structure has been properly initialized and
-is safe to use with other functions within the library.  
+a success code is returned to the calling function.  If this function returns \textbf{MP\_OKAY} it is safe to assume the 
+mp\_int structure has been properly initialized and is safe to use with other functions within the library.  

 \subsection{Clearing an mp\_int}
 When an mp\_int is no longer required by the application, the memory that has been allocated for its digits must be 
@ -819,7 +843,7 @@ returned to the application's memory pool with the mp\_clear algorithm.
 \begin{tabular}{l}
 \hline Algorithm \textbf{mp\_clear}. \\
 \textbf{Input}.   An mp\_int $a$ \\
-\textbf{Output}.  The memory for $a$ is freed for reuse.  \\
+\textbf{Output}.  The memory for $a$ shall be deallocated.  \\
 \hline \\
 1.  If $a$ has been previously freed then return(\textit{MP\_OKAY}). \\
 2.  for $n$ from 0 to $a.used - 1$ do \\
@ -836,32 +860,31 @@ returned to the application's memory pool with the mp\_clear algorithm.
 \end{figure}

 \textbf{Algorithm mp\_clear.}
-This algorithm releases the memory allocated for an mp\_int back into the memory pool for reuse.  It is designed
-such that a given mp\_int structure can be cleared multiple times between initializations without attempting to 
-free the memory twice\footnote{In ISO C for example, calling free() twice on the same memory block causes undefinied
-behaviour.}.  
+This algorithm accomplishes two goals.  First, it clears the digits and the other mp\_int members.  This ensures that 
+if a developer accidentally re-uses a cleared structure it is less likely to cause problems.  The second goal
+is to free the allocated memory.

-The first step determines if the mp\_int structure has been marked as free already.  If it has, the algorithm returns
-success immediately as no further actions are required.  Otherwise, the algorithm will proceed to put the structure 
-in a known empty and otherwise invalid state.  First the digits of the mp\_int are set to zero.  The memory that has been allocated for the 
-digits is then freed.  The \textbf{used} and \textbf{alloc} counts are both set to zero and the \textbf{sign} set to 
-\textbf{MP\_ZPOS}.  This known fixed state for cleared mp\_int structures will make debuging easier for the end 
-developer.  That is, if they spot (via their debugger) an mp\_int they are using that is in this state it will be 
-obvious that they erroneously and prematurely cleared the mp\_int structure.
+The logic behind the algorithm is extended by marking cleared mp\_int structures so that subsequent calls to this
+algorithm will not try to free the memory multiple times.  Cleared mp\_ints are detectable by having a pre-defined invalid 
+digit pointer \textbf{dp} setting.

-Note that once an mp\_int has been cleared the mp\_int structure is no longer in a valid state for any other algorithm
+Once an mp\_int has been cleared the mp\_int structure is no longer in a valid state for any other algorithm
 with the exception of algorithms mp\_init, mp\_init\_copy, mp\_init\_size and mp\_clear.

 EXAM,bn_mp_clear.c

-The ``if'' statement (line @21,a->dp != NULL@) prevents the heap from being corrupted if a user double-frees an 
-mp\_int.  This is because once the memory is freed the pointer is set to \textbf{NULL} (line @30,NULL@).  
+The algorithm only operates on the mp\_int if it hasn't been previously cleared.  The if statement (line @23,a->dp != NULL@)
+checks to see if the \textbf{dp} member is not \textbf{NULL}.  If the mp\_int is a valid mp\_int then \textbf{dp} cannot be
+\textbf{NULL} in which case the if statement will evaluate to true.

-Without the check, code that accidentally calls mp\_clear twice for a given mp\_int structure would try to free the memory 
-allocated for the digits twice.  This may cause some C libraries to signal a fault.  By setting the pointer to 
-\textbf{NULL} it helps debug code that may inadvertently free the mp\_int before it is truly not needed, because attempts 
-to reference digits should fail immediately. The allocated digits are set to zero before being freed (line @24,memset@).  
-This is ideal for cryptographic situations where the integer that the mp\_int represents might need to be kept a secret.
+The digits of the mp\_int are cleared by the for loop (line @25,for@) which assigns a zero to every digit.  Similar to mp\_init()
+the digits are assigned zero instead of using block memory operations (such as memset()) since this is more portable.  
+
+The digits are deallocated off the heap via the XFREE macro.  Similar to XMALLOC the XFREE macro actually evaluates to
+a standard C library function, in this case the free() function.  Since free() only deallocates the memory the pointer
+still has to be reset to \textbf{NULL} manually (line @33,NULL@).  
+
+Now that the digits have been cleared and deallocated the other members are set to their final values (lines @34,= 0@ and @35,ZPOS@).

 \section{Maintenance Algorithms}

@ -889,7 +912,7 @@ must be re-sized appropriately to accomodate the result.  The mp\_grow algorithm
 1.  if $a.alloc \ge b$ then return(\textit{MP\_OKAY}) \\
 2.  $u \leftarrow b\mbox{ (mod }MP\_PREC\mbox{)}$ \\
 3.  $v \leftarrow b + 2 \cdot MP\_PREC - u$ \\
-4.  Re-Allocate the array of digits $a$ to size $v$ \\
+4.  Re-allocate the array of digits $a$ to size $v$ \\
 5.  If the allocation failed then return(\textit{MP\_MEM}). \\
 6.  for n from a.alloc to $v - 1$ do  \\
 \hspace{+3mm}6.1  $a_n \leftarrow 0$ \\
@ -914,15 +937,19 @@ assumed to contain undefined values they are initially set to zero.

 EXAM,bn_mp_grow.c

-The first step is to see if we actually need to perform a re-allocation at all (line @24,a->alloc < size@).  If a reallocation
-must occur the digit count is padded upwards to help prevent many trivial reallocations (line @28,size@).  Next the reallocation is performed
-and the return of realloc() is stored in a temporary pointer named $tmp$ (line @36,realloc@).  The return is stored in a temporary
-instead of $a.dp$ to prevent the code from losing the original pointer in case the reallocation fails.  Had the return been stored 
-in $a.dp$ instead there would be no way to reclaim the heap originally used.
+A quick optimization is to first determine if a memory re-allocation is required at all.  The if statement (line @23,if@) checks
+if the \textbf{alloc} member of the mp\_int is smaller than the requested digit count.  If the count is not larger than \textbf{alloc}
+the function skips the re-allocation part thus saving time.

-If the reallocation fails the function will return \textbf{MP\_MEM} (line @39,return@), otherwise, the value of $tmp$ is assigned
-to the pointer $a.dp$ and the function continues.  A simple for loop from line @48,a->alloc@ to line @50,}@ will zero all digits 
-that were above the old \textbf{alloc} limit to make sure the integer is in a known state.
+When a re-allocation is performed it is turned into an optimal request to save time in the future.  The requested digit count is
+padded upwards to the 2nd multiple of \textbf{MP\_PREC} larger than \textbf{alloc} (line @25, size@).  The XREALLOC function is used
+to re-allocate the memory.  As per the other functions XREALLOC is actually a macro which evaluates to realloc by default.  The realloc
+function leaves the base of the allocation intact which means the first \textbf{alloc} digits of the mp\_int are the same as before
+the re-allocation.  All that is left is to clear the newly allocated digits and return.
+
+Note that the re-allocation result is actually stored in a temporary pointer $tmp$.  This is to allow this function to return
+an error with a valid pointer.  Earlier releases of the library stored the result of XREALLOC into the mp\_int $a$.  That would
+result in a memory leak if XREALLOC ever failed.  

 \subsection{Initializing Variable Precision mp\_ints}
 Occasionally the number of digits required will be known in advance of an initialization, based on, for example, the size 
@ -970,7 +997,7 @@ The number of digits $b$ requested is padded (line @22,MP_PREC@) by first augmen
 mp\_int is placed in a default state representing the integer zero.  Otherwise, the error code \textbf{MP\_MEM} will be 
 returned (line @27,return@).  

-The digits are allocated and set to zero at the same time with the calloc() function (line @25,calloc@).  The 
+The digits are allocated and set to zero at the same time with the calloc() function (line @25,XCALLOC@).  The 
 \textbf{used} count is set to zero, the \textbf{alloc} count set to the padded digit count and the \textbf{sign} flag set 
 to \textbf{MP\_ZPOS} to achieve a default valid mp\_int state (lines @29,used@, @30,alloc@ and @31,sign@).  If the function 
 returns successfully then it is correct to assume that the mp\_int structure is in a valid state for the remainder of the 
--- a/tommath.tex
+++ b/tommath.tex