added libtommath-0.31

2004-08-09 22:15:59 +00:00 · 2004-08-09 22:15:59 +00:00 · 8eaa98807b
parent 350578d400
commit 8eaa98807b
75 changed files with 5111 additions and 5218 deletions
--- a/bn.pdf
+++ b/bn.pdf
--- a/bn.tex
+++ b/bn.tex
@ -49,7 +49,7 @@
 \begin{document}
 \frontmatter
 \pagestyle{empty}
-\title{LibTomMath User Manual \\ v0.30}
+\title{LibTomMath User Manual \\ v0.31}
 \author{Tom St Denis \\ tomstdenis@iahu.ca}
 \maketitle
 This text, the library and the accompanying textbook are all hereby placed in the public domain.  This book has been 
--- a/bn_fast_s_mp_mul_digs.c
+++ b/bn_fast_s_mp_mul_digs.c
@ -88,7 +88,7 @@ fast_s_mp_mul_digs (mp_int * a, mp_int * b, mp_int * c, int digs)
  }
  /* setup dest */
-  olduse = c->used;
+  olduse  = c->used;
  c->used = digs;
  {
--- a/bn_mp_2expt.c
+++ b/bn_mp_2expt.c
@ -36,7 +36,7 @@ mp_2expt (mp_int * a, int b)
  a->used = b / DIGIT_BIT + 1;
  /* put the single bit in its place */
-  a->dp[b / DIGIT_BIT] = 1 << (b % DIGIT_BIT);
+  a->dp[b / DIGIT_BIT] = ((mp_digit)1) << (b % DIGIT_BIT);
  return MP_OKAY;
 }
--- a/bn_mp_clear.c
+++ b/bn_mp_clear.c
@ -18,10 +18,14 @@
 void
 mp_clear (mp_int * a)
 {
  int i;
  /* only do anything if a hasn't been freed previously */
  if (a->dp != NULL) {
    /* first zero the digits */
-    memset (a->dp, 0, sizeof (mp_digit) * a->used);
+    for (i = 0; i < a->used; i++) {
        a->dp[i] = 0;
    }
    /* free ram */
    XFREE(a->dp);
--- a/bn_mp_div.c
+++ b/bn_mp_div.c
@ -187,7 +187,7 @@ int mp_div (mp_int * a, mp_int * b, mp_int * c, mp_int * d)
   */
  /* get sign before writing to c */
-  x.sign = a->sign;
+  x.sign = x.used == 0 ? MP_ZPOS : a->sign;
  if (c != NULL) {
    mp_clamp (&q);
--- a/bn_mp_init.c
+++ b/bn_mp_init.c
@ -14,15 +14,22 @@
 */
 #include <tommath.h>
-/* init a new bigint */
+/* init a new mp_int */
 int mp_init (mp_int * a)
 {
  int i;
  /* allocate memory required and clear it */
-  a->dp = OPT_CAST(mp_digit) XCALLOC (sizeof (mp_digit), MP_PREC);
+  a->dp = OPT_CAST(mp_digit) XMALLOC (sizeof (mp_digit) * MP_PREC);
  if (a->dp == NULL) {
    return MP_MEM;
  }
  /* set the digits to zero */
  for (i = 0; i < MP_PREC; i++) {
      a->dp[i] = 0;
  }
  /* set the used to zero, allocated digits to the default precision
   * and sign to positive */
  a->used  = 0;
--- a/bn_mp_karatsuba_mul.c
+++ b/bn_mp_karatsuba_mul.c
@ -76,9 +76,6 @@ int mp_karatsuba_mul (mp_int * a, mp_int * b, mp_int * c)
    goto X0Y0;
  /* now shift the digits */
  x0.sign = x1.sign = a->sign;
  y0.sign = y1.sign = b->sign;
  x0.used = y0.used = B;
  x1.used = a->used - B;
  y1.used = b->used - B;
--- a/bn_mp_mul.c
+++ b/bn_mp_mul.c
@ -43,6 +43,6 @@ int mp_mul (mp_int * a, mp_int * b, mp_int * c)
      res = s_mp_mul (a, b, c);
    }
  }
-  c->sign = neg;
+  c->sign = (c->used > 0) ? neg : MP_ZPOS;
  return res;
 }
--- a/bn_mp_reduce_is_2k.c
+++ b/bn_mp_reduce_is_2k.c
@ -17,7 +17,8 @@
 /* determines if mp_reduce_2k can be used */
 int mp_reduce_is_2k(mp_int *a)
 {
-   int ix, iy, iz, iw;
+   int ix, iy, iw;
   mp_digit iz;
   if (a->used == 0) {
      return 0;
@ -34,7 +35,7 @@ int mp_reduce_is_2k(mp_int *a)
             return 0;
          }
          iz <<= 1;
-          if (iz > (int)MP_MASK) {
+          if (iz > (mp_digit)MP_MASK) {
             ++iw;
             iz = 1;
          }
--- a/bncore.c
+++ b/bncore.c
@ -18,14 +18,16 @@
 CPU                    /Compiler     /MUL CUTOFF/SQR CUTOFF
 -------------------------------------------------------------
- Intel P4               /GCC v3.2     /        70/       108
+ Intel P4 Northwood     /GCC v3.3.3   /        59/        81/profiled build
- AMD Athlon XP          /GCC v3.2     /       109/       127
+ Intel P4 Northwood     /GCC v3.3.3   /        59/        80/profiled_single build
-
+ Intel P4 Northwood     /ICC v8.0     /        57/        70/profiled build
 Intel P4 Northwood     /ICC v8.0     /        54/        76/profiled_single build
 AMD Athlon XP          /GCC v3.2     /       109/       127/
 */
-/* configured for a AMD XP Thoroughbred core with etc/tune.c */
+int     KARATSUBA_MUL_CUTOFF = 57,      /* Min. number of digits before Karatsuba multiplication is used. */
-int     KARATSUBA_MUL_CUTOFF = 109,      /* Min. number of digits before Karatsuba multiplication is used. */
+        KARATSUBA_SQR_CUTOFF = 70,      /* Min. number of digits before Karatsuba squaring is used. */
        KARATSUBA_SQR_CUTOFF = 127,      /* Min. number of digits before Karatsuba squaring is used. */
        TOOM_MUL_CUTOFF      = 350,      /* no optimal values of these are known yet so set em high */
        TOOM_SQR_CUTOFF      = 400; 
--- a/booker.pl
+++ b/booker.pl
@ -84,6 +84,7 @@ while (<IN>) {
            $text[$line++] = $_;
            last if ($_ =~ /tommath\.h/);
         }
         <SRC>;   
      }
      $inline = 0;
--- a/changes.txt
+++ b/changes.txt
@ -1,3 +1,12 @@
 August 9th, 2004
 v0.31  -- "profiled" builds now :-) new timings for Intel Northwoods
       -- Added "pretty" build target
       -- Update mp_init() to actually assign 0's instead of relying on calloc()
       -- "Wolfgang Ehrhardt" <Wolfgang.Ehrhardt@munich.netsurf.de> found a bug in mp_mul() where if
          you multiply a negative by zero you get negative zero as the result.  Oops.
       -- J Harper from PeerSec let me toy with his AMD64 and I got 60-bit digits working properly
          [this also means that I fixed a bug that showed up whenever sizeof(int) < sizeof(mp_digit)]
 April 11th, 2004
 v0.30  -- Added "mp_toradix_n" which stores up to "n-1" least significant digits of an mp_int
       -- Johan Lindh sent a patch so MSVC wouldn't whine about redefining malloc [in weird dll modes]
--- a/demo/demo.c
+++ b/demo/demo.c
@ -1,7 +1,5 @@
 #include <time.h>
 #define TESTING
 #ifdef IOWNANATHLON
 #include <unistd.h>
 #define SLEEP sleep(4)
@ -11,49 +9,6 @@
 #include "tommath.h"
 #ifdef TIMER
 ulong64 _tt;
 #if defined(__i386__) || defined(_M_IX86) || defined(_M_AMD64)
 /* RDTSC from Scott Duplichan */
 static ulong64 TIMFUNC (void)
   {
   #if defined __GNUC__
      #ifdef __i386__
         ulong64 a;
         __asm__ __volatile__ ("rdtsc ":"=A" (a));
         return a;
      #else /* gcc-IA64 version */
         unsigned long result;
         __asm__ __volatile__("mov %0=ar.itc" : "=r"(result) :: "memory");
         while (__builtin_expect ((int) result == -1, 0))
         __asm__ __volatile__("mov %0=ar.itc" : "=r"(result) :: "memory");
         return result;
      #endif
   // Microsoft and Intel Windows compilers
   #elif defined _M_IX86
     __asm rdtsc
   #elif defined _M_AMD64
     return __rdtsc ();
   #elif defined _M_IA64
     #if defined __INTEL_COMPILER
       #include <ia64intrin.h>
     #endif
      return __getReg (3116);
   #else
     #error need rdtsc function for this build
   #endif
   }
 #else
 #define TIMFUNC clock
 #endif
 ulong64 rdtsc(void) { return TIMFUNC() - _tt; }
 void reset(void) { _tt = TIMFUNC(); }
 #endif
 void ndraw(mp_int *a, char *name)
 {
   char buf[4096];
@ -89,10 +44,6 @@ int myrng(unsigned char *dst, int len, void *dat)
 }
 #define DO2(x) x; x;
 #define DO4(x) DO2(x); DO2(x);
 #define DO8(x) DO4(x); DO4(x);
 #define DO(x)  DO8(x); DO8(x);
   char cmd[4096], buf[4096];
 int main(void)
@ -103,10 +54,6 @@ int main(void)
   unsigned rr;
   int i, n, err, cnt, ix, old_kara_m, old_kara_s;
 #ifdef TIMER
   ulong64 tt, CLK_PER_SEC;
   FILE *log, *logb, *logc;
 #endif
   mp_init(&a);
   mp_init(&b);
@ -117,11 +64,10 @@ int main(void)
   srand(time(NULL));
 #ifdef TESTING
  // test mp_get_int
  printf("Testing: mp_get_int\n");
  for(i=0;i<1000;++i) {
-    t = (unsigned long)rand()*rand()+1;
+    t = ((unsigned long)rand()*rand()+1)&0xFFFFFFFF;
    mp_set_int(&a,t);
    if (t!=mp_get_int(&a)) { 
      printf("mp_get_int() bad result!\n");
@ -141,7 +87,7 @@ int main(void)
  // test mp_sqrt
  printf("Testing: mp_sqrt\n");
-  for (i=0;i<10000;++i) { 
+  for (i=0;i<1000;++i) { 
    printf("%6d\r", i); fflush(stdout);
    n = (rand()&15)+1;
    mp_rand(&a,n);
@ -157,7 +103,7 @@ int main(void)
  }
  printf("\nTesting: mp_is_square\n");
-  for (i=0;i<100000;++i) {
+  for (i=0;i<1000;++i) {
    printf("%6d\r", i); fflush(stdout);
    /* test mp_is_square false negatives */
@ -186,11 +132,9 @@ int main(void)
  }
  printf("\n\n");
 #endif
 #ifdef TESTING 
   /* test for size */
-   for (ix = 16; ix < 512; ix++) {
+   for (ix = 10; ix < 256; ix++) {
       printf("Testing (not safe-prime): %9d bits    \r", ix); fflush(stdout);
       err = mp_prime_random_ex(&a, 8, ix, (rand()&1)?LTM_PRIME_2MSB_OFF:LTM_PRIME_2MSB_ON, myrng, NULL);
       if (err != MP_OKAY) {
@ -203,7 +147,7 @@ int main(void)
       }
   }
-   for (ix = 16; ix < 512; ix++) {
+   for (ix = 16; ix < 256; ix++) {
       printf("Testing (   safe-prime): %9d bits    \r", ix); fflush(stdout);
       err = mp_prime_random_ex(&a, 8, ix, ((rand()&1)?LTM_PRIME_2MSB_OFF:LTM_PRIME_2MSB_ON)|LTM_PRIME_SAFE, myrng, NULL);
       if (err != MP_OKAY) {
@ -225,9 +169,7 @@ int main(void)
   }
   printf("\n\n");
 #endif
 #ifdef TESTING
   mp_read_radix(&a, "123456", 10);
   mp_toradix_n(&a, buf, 10, 3);
   printf("a == %s\n", buf);
@ -235,7 +177,6 @@ int main(void)
   printf("a == %s\n", buf);
   mp_toradix_n(&a, buf, 10, 30);
   printf("a == %s\n", buf);
 #endif
 #if 0
@ -248,22 +189,6 @@ int main(void)
   }
 #endif
 #if 0
 {
   mp_word aa, bb;
   for (;;) {
       aa = abs(rand()) & MP_MASK;
       bb = abs(rand()) & MP_MASK;
      if (MULT(aa,bb) != (aa*bb)) {
             printf("%llu * %llu == %llu or %llu?\n", aa, bb, (ulong64)MULT(aa,bb), (ulong64)(aa*bb));
             return 0;
          }
   }
 }
 #endif
 #ifdef TESTING
   /* test mp_cnt_lsb */
   printf("testing mp_cnt_lsb...\n");
   mp_set(&a, 1);
@ -274,12 +199,10 @@ int main(void)
       }
       mp_mul_2(&a, &a);
   }
 #endif
 /* test mp_reduce_2k */
 #ifdef TESTING
   printf("Testing mp_reduce_2k...\n");
-   for (cnt = 3; cnt <= 384; ++cnt) {
+   for (cnt = 3; cnt <= 128; ++cnt) {
       mp_digit tmp;
       mp_2expt(&a, cnt);
       mp_sub_d(&a, 2, &a);  /* a = 2**cnt - 2 */
@ -289,7 +212,7 @@ int main(void)
       printf("(%d)", mp_reduce_is_2k(&a));
       mp_reduce_2k_setup(&a, &tmp);
       printf("(%d)", tmp);
-       for (ix = 0; ix < 10000; ix++) {
+       for (ix = 0; ix < 1000; ix++) {
           if (!(ix & 127)) {printf("."); fflush(stdout); }
           mp_rand(&b, (cnt/DIGIT_BIT  + 1) * 2);
           mp_copy(&c, &b);
@ -301,14 +224,11 @@ int main(void)
           }
        }
    }
 #endif
 /* test mp_div_3  */
 #ifdef TESTING
   printf("Testing mp_div_3...\n");
   mp_set(&d, 3);
-   for (cnt = 0; cnt < 1000000; ) {
+   for (cnt = 0; cnt < 10000; ) {
      mp_digit r1, r2;
      if (!(++cnt & 127)) printf("%9d\r", cnt);
@ -321,12 +241,10 @@ int main(void)
      }
   }
   printf("\n\nPassed div_3 testing\n");
 #endif
 /* test the DR reduction */
 #ifdef TESTING
   printf("testing mp_dr_reduce...\n");
-   for (cnt = 2; cnt < 128; cnt++) {
+   for (cnt = 2; cnt < 32; cnt++) {
       printf("%d digit modulus\n", cnt);
       mp_grow(&a, cnt);
       mp_zero(&a);
@ -334,7 +252,7 @@ int main(void)
           a.dp[ix] = MP_MASK;
       }
       a.used = cnt;
-       mp_prime_next_prime(&a, 3, 0);
+       a.dp[0] = 3;
       mp_rand(&b, cnt - 1);
       mp_copy(&b, &c);
@ -346,206 +264,16 @@ int main(void)
         mp_copy(&b, &c);
         mp_mod(&b, &a, &b);
-         mp_dr_reduce(&c, &a, (1<<DIGIT_BIT)-a.dp[0]);
+         mp_dr_reduce(&c, &a, (((mp_digit)1)<<DIGIT_BIT)-a.dp[0]);
         if (mp_cmp(&b, &c) != MP_EQ) {
            printf("Failed on trial %lu\n", rr); exit(-1);
         }
-      } while (++rr < 100000);
+      } while (++rr < 500);
      printf("Passed DR test for %d digits\n", cnt);
   }
 #endif
 #ifdef TIMER
      /* temp. turn off TOOM */
      TOOM_MUL_CUTOFF = TOOM_SQR_CUTOFF = 100000;
      reset();
      sleep(1);
      CLK_PER_SEC = rdtsc();
      printf("CLK_PER_SEC == %lu\n", CLK_PER_SEC);
      log = fopen("logs/add.log", "w");
      for (cnt = 8; cnt <= 128; cnt += 8) {
         SLEEP;
         mp_rand(&a, cnt);
         mp_rand(&b, cnt);
         reset();
         rr = 0;
         do {
            DO(mp_add(&a,&b,&c));
            rr += 16;
         } while (rdtsc() < (CLK_PER_SEC * 2));
         tt = rdtsc();
         printf("Adding\t\t%4d-bit => %9llu/sec, %9llu ticks\n", mp_count_bits(&a), (((ulong64)rr)*CLK_PER_SEC)/tt, tt);
         fprintf(log, "%d %9llu\n", cnt*DIGIT_BIT, (((ulong64)rr)*CLK_PER_SEC)/tt); fflush(log);
      }
      fclose(log);
      log = fopen("logs/sub.log", "w");
      for (cnt = 8; cnt <= 128; cnt += 8) {
         SLEEP;
         mp_rand(&a, cnt);
         mp_rand(&b, cnt);
         reset();
         rr = 0;
         do {
            DO(mp_sub(&a,&b,&c));
            rr += 16;
         } while (rdtsc() < (CLK_PER_SEC * 2));
         tt = rdtsc();
         printf("Subtracting\t\t%4d-bit => %9llu/sec, %9llu ticks\n", mp_count_bits(&a), (((ulong64)rr)*CLK_PER_SEC)/tt, tt);
         fprintf(log, "%d %9llu\n", cnt*DIGIT_BIT, (((ulong64)rr)*CLK_PER_SEC)/tt);  fflush(log);
      }
      fclose(log);
   /* do mult/square twice, first without karatsuba and second with */
 mult_test:   
   old_kara_m = KARATSUBA_MUL_CUTOFF;
   old_kara_s = KARATSUBA_SQR_CUTOFF;
   for (ix = 0; ix < 2; ix++) {
      printf("With%s Karatsuba\n", (ix==0)?"out":"");
      KARATSUBA_MUL_CUTOFF = (ix==0)?9999:old_kara_m;
      KARATSUBA_SQR_CUTOFF = (ix==0)?9999:old_kara_s;
      log = fopen((ix==0)?"logs/mult.log":"logs/mult_kara.log", "w");
      for (cnt = 32; cnt <= 288; cnt += 8) {
         SLEEP;
         mp_rand(&a, cnt);
         mp_rand(&b, cnt);
         reset();
         rr = 0;
         do {
            DO(mp_mul(&a, &b, &c));
            rr += 16;
         } while (rdtsc() < (CLK_PER_SEC * 2));
         tt = rdtsc();
         printf("Multiplying\t%4d-bit => %9llu/sec, %9llu ticks\n", mp_count_bits(&a), (((ulong64)rr)*CLK_PER_SEC)/tt, tt);
         fprintf(log, "%d %9llu\n", mp_count_bits(&a), (((ulong64)rr)*CLK_PER_SEC)/tt);  fflush(log);
      }
      fclose(log);
      log = fopen((ix==0)?"logs/sqr.log":"logs/sqr_kara.log", "w");
      for (cnt = 32; cnt <= 288; cnt += 8) {
         SLEEP;
         mp_rand(&a, cnt);
         reset();
         rr = 0;
         do {
            DO(mp_sqr(&a, &b));
            rr += 16;
         } while (rdtsc() < (CLK_PER_SEC * 2));
         tt = rdtsc();
         printf("Squaring\t%4d-bit => %9llu/sec, %9llu ticks\n", mp_count_bits(&a), (((ulong64)rr)*CLK_PER_SEC)/tt, tt);
         fprintf(log, "%d %9llu\n", mp_count_bits(&a), (((ulong64)rr)*CLK_PER_SEC)/tt);  fflush(log);
      }
      fclose(log);
   }
 expt_test:
  {
      char *primes[] = {
         /* 2K moduli mersenne primes */
         "6864797660130609714981900799081393217269435300143305409394463459185543183397656052122559640661454554977296311391480858037121987999716643812574028291115057151",
         "531137992816767098689588206552468627329593117727031923199444138200403559860852242739162502265229285668889329486246501015346579337652707239409519978766587351943831270835393219031728127",
         "10407932194664399081925240327364085538615262247266704805319112350403608059673360298012239441732324184842421613954281007791383566248323464908139906605677320762924129509389220345773183349661583550472959420547689811211693677147548478866962501384438260291732348885311160828538416585028255604666224831890918801847068222203140521026698435488732958028878050869736186900714720710555703168729087",
         "1475979915214180235084898622737381736312066145333169775147771216478570297878078949377407337049389289382748507531496480477281264838760259191814463365330269540496961201113430156902396093989090226259326935025281409614983499388222831448598601834318536230923772641390209490231836446899608210795482963763094236630945410832793769905399982457186322944729636418890623372171723742105636440368218459649632948538696905872650486914434637457507280441823676813517852099348660847172579408422316678097670224011990280170474894487426924742108823536808485072502240519452587542875349976558572670229633962575212637477897785501552646522609988869914013540483809865681250419497686697771007",
         "259117086013202627776246767922441530941818887553125427303974923161874019266586362086201209516800483406550695241733194177441689509238807017410377709597512042313066624082916353517952311186154862265604547691127595848775610568757931191017711408826252153849035830401185072116424747461823031471398340229288074545677907941037288235820705892351068433882986888616658650280927692080339605869308790500409503709875902119018371991620994002568935113136548829739112656797303241986517250116412703509705427773477972349821676443446668383119322540099648994051790241624056519054483690809616061625743042361721863339415852426431208737266591962061753535748892894599629195183082621860853400937932839420261866586142503251450773096274235376822938649407127700846077124211823080804139298087057504713825264571448379371125032081826126566649084251699453951887789613650248405739378594599444335231188280123660406262468609212150349937584782292237144339628858485938215738821232393687046160677362909315071",
         "190797007524439073807468042969529173669356994749940177394741882673528979787005053706368049835514900244303495954950709725762186311224148828811920216904542206960744666169364221195289538436845390250168663932838805192055137154390912666527533007309292687539092257043362517857366624699975402375462954490293259233303137330643531556539739921926201438606439020075174723029056838272505051571967594608350063404495977660656269020823960825567012344189908927956646011998057988548630107637380993519826582389781888135705408653045219655801758081251164080554609057468028203308718724654081055323215860189611391296030471108443146745671967766308925858547271507311563765171008318248647110097614890313562856541784154881743146033909602737947385055355960331855614540900081456378659068370317267696980001187750995491090350108417050917991562167972281070161305972518044872048331306383715094854938415738549894606070722584737978176686422134354526989443028353644037187375385397838259511833166416134323695660367676897722287918773420968982326089026150031515424165462111337527431154890666327374921446276833564519776797633875503548665093914556482031482248883127023777039667707976559857333357013727342079099064400455741830654320379350833236245819348824064783585692924881021978332974949906122664421376034687815350484991",
         /* DR moduli */
         "14059105607947488696282932836518693308967803494693489478439861164411992439598399594747002144074658928593502845729752797260025831423419686528151609940203368612079",
         "101745825697019260773923519755878567461315282017759829107608914364075275235254395622580447400994175578963163918967182013639660669771108475957692810857098847138903161308502419410142185759152435680068435915159402496058513611411688900243039",
         "736335108039604595805923406147184530889923370574768772191969612422073040099331944991573923112581267542507986451953227192970402893063850485730703075899286013451337291468249027691733891486704001513279827771740183629161065194874727962517148100775228363421083691764065477590823919364012917984605619526140821797602431",
         "38564998830736521417281865696453025806593491967131023221754800625044118265468851210705360385717536794615180260494208076605798671660719333199513807806252394423283413430106003596332513246682903994829528690198205120921557533726473585751382193953592127439965050261476810842071573684505878854588706623484573925925903505747545471088867712185004135201289273405614415899438276535626346098904241020877974002916168099951885406379295536200413493190419727789712076165162175783",
         "542189391331696172661670440619180536749994166415993334151601745392193484590296600979602378676624808129613777993466242203025054573692562689251250471628358318743978285860720148446448885701001277560572526947619392551574490839286458454994488665744991822837769918095117129546414124448777033941223565831420390846864429504774477949153794689948747680362212954278693335653935890352619041936727463717926744868338358149568368643403037768649616778526013610493696186055899318268339432671541328195724261329606699831016666359440874843103020666106568222401047720269951530296879490444224546654729111504346660859907296364097126834834235287147",
         "1487259134814709264092032648525971038895865645148901180585340454985524155135260217788758027400478312256339496385275012465661575576202252063145698732079880294664220579764848767704076761853197216563262660046602703973050798218246170835962005598561669706844469447435461092542265792444947706769615695252256130901271870341005768912974433684521436211263358097522726462083917939091760026658925757076733484173202927141441492573799914240222628795405623953109131594523623353044898339481494120112723445689647986475279242446083151413667587008191682564376412347964146113898565886683139407005941383669325997475076910488086663256335689181157957571445067490187939553165903773554290260531009121879044170766615232300936675369451260747671432073394867530820527479172464106442450727640226503746586340279816318821395210726268291535648506190714616083163403189943334431056876038286530365757187367147446004855912033137386225053275419626102417236133948503",
         "1095121115716677802856811290392395128588168592409109494900178008967955253005183831872715423151551999734857184538199864469605657805519106717529655044054833197687459782636297255219742994736751541815269727940751860670268774903340296040006114013971309257028332849679096824800250742691718610670812374272414086863715763724622797509437062518082383056050144624962776302147890521249477060215148275163688301275847155316042279405557632639366066847442861422164832655874655824221577849928863023018366835675399949740429332468186340518172487073360822220449055340582568461568645259954873303616953776393853174845132081121976327462740354930744487429617202585015510744298530101547706821590188733515880733527449780963163909830077616357506845523215289297624086914545378511082534229620116563260168494523906566709418166011112754529766183554579321224940951177394088465596712620076240067370589036924024728375076210477267488679008016579588696191194060127319035195370137160936882402244399699172017835144537488486396906144217720028992863941288217185353914991583400421682751000603596655790990815525126154394344641336397793791497068253936771017031980867706707490224041075826337383538651825493679503771934836094655802776331664261631740148281763487765852746577808019633679",
         /* generic unrestricted moduli */
         "17933601194860113372237070562165128350027320072176844226673287945873370751245439587792371960615073855669274087805055507977323024886880985062002853331424203",
         "2893527720709661239493896562339544088620375736490408468011883030469939904368086092336458298221245707898933583190713188177399401852627749210994595974791782790253946539043962213027074922559572312141181787434278708783207966459019479487",
         "347743159439876626079252796797422223177535447388206607607181663903045907591201940478223621722118173270898487582987137708656414344685816179420855160986340457973820182883508387588163122354089264395604796675278966117567294812714812796820596564876450716066283126720010859041484786529056457896367683122960411136319",
         "47266428956356393164697365098120418976400602706072312735924071745438532218237979333351774907308168340693326687317443721193266215155735814510792148768576498491199122744351399489453533553203833318691678263241941706256996197460424029012419012634671862283532342656309677173602509498417976091509154360039893165037637034737020327399910409885798185771003505320583967737293415979917317338985837385734747478364242020380416892056650841470869294527543597349250299539682430605173321029026555546832473048600327036845781970289288898317888427517364945316709081173840186150794397479045034008257793436817683392375274635794835245695887",
         "436463808505957768574894870394349739623346440601945961161254440072143298152040105676491048248110146278752857839930515766167441407021501229924721335644557342265864606569000117714935185566842453630868849121480179691838399545644365571106757731317371758557990781880691336695584799313313687287468894148823761785582982549586183756806449017542622267874275103877481475534991201849912222670102069951687572917937634467778042874315463238062009202992087620963771759666448266532858079402669920025224220613419441069718482837399612644978839925207109870840278194042158748845445131729137117098529028886770063736487420613144045836803985635654192482395882603511950547826439092832800532152534003936926017612446606135655146445620623395788978726744728503058670046885876251527122350275750995227",
         "11424167473351836398078306042624362277956429440521137061889702611766348760692206243140413411077394583180726863277012016602279290144126785129569474909173584789822341986742719230331946072730319555984484911716797058875905400999504305877245849119687509023232790273637466821052576859232452982061831009770786031785669030271542286603956118755585683996118896215213488875253101894663403069677745948305893849505434201763745232895780711972432011344857521691017896316861403206449421332243658855453435784006517202894181640562433575390821384210960117518650374602256601091379644034244332285065935413233557998331562749140202965844219336298970011513882564935538704289446968322281451907487362046511461221329799897350993370560697505809686438782036235372137015731304779072430260986460269894522159103008260495503005267165927542949439526272736586626709581721032189532726389643625590680105784844246152702670169304203783072275089194754889511973916207",
         "1214855636816562637502584060163403830270705000634713483015101384881871978446801224798536155406895823305035467591632531067547890948695117172076954220727075688048751022421198712032848890056357845974246560748347918630050853933697792254955890439720297560693579400297062396904306270145886830719309296352765295712183040773146419022875165382778007040109957609739589875590885701126197906063620133954893216612678838507540777138437797705602453719559017633986486649523611975865005712371194067612263330335590526176087004421363598470302731349138773205901447704682181517904064735636518462452242791676541725292378925568296858010151852326316777511935037531017413910506921922450666933202278489024521263798482237150056835746454842662048692127173834433089016107854491097456725016327709663199738238442164843147132789153725513257167915555162094970853584447993125488607696008169807374736711297007473812256272245489405898470297178738029484459690836250560495461579533254473316340608217876781986188705928270735695752830825527963838355419762516246028680280988020401914551825487349990306976304093109384451438813251211051597392127491464898797406789175453067960072008590614886532333015881171367104445044718144312416815712216611576221546455968770801413440778423979",
         NULL
      };
   log = fopen("logs/expt.log", "w");
   logb = fopen("logs/expt_dr.log", "w");
   logc = fopen("logs/expt_2k.log", "w");
   for (n = 0; primes[n]; n++) {
      SLEEP;
      mp_read_radix(&a, primes[n], 10);
      mp_zero(&b);
      for (rr = 0; rr < mp_count_bits(&a); rr++) {
         mp_mul_2(&b, &b);
         b.dp[0] |= lbit();
         b.used  += 1;
      }
      mp_sub_d(&a, 1, &c);
      mp_mod(&b, &c, &b);
      mp_set(&c, 3);
      reset();
      rr = 0;
      do {
         DO(mp_exptmod(&c, &b, &a, &d));
         rr += 16;
      } while (rdtsc() < (CLK_PER_SEC * 2));
      tt = rdtsc();
      mp_sub_d(&a, 1, &e);
      mp_sub(&e, &b, &b);
      mp_exptmod(&c, &b, &a, &e);  /* c^(p-1-b) mod a */
      mp_mulmod(&e, &d, &a, &d);   /* c^b * c^(p-1-b) == c^p-1 == 1 */
      if (mp_cmp_d(&d, 1)) {
         printf("Different (%d)!!!\n", mp_count_bits(&a));
         draw(&d);
         exit(0);
      }
      printf("Exponentiating\t%4d-bit => %9llu/sec, %9llu ticks\n", mp_count_bits(&a), (((ulong64)rr)*CLK_PER_SEC)/tt, tt);
      fprintf((n < 6) ? logc : (n < 13) ? logb : log, "%d %9llu\n", mp_count_bits(&a), (((ulong64)rr)*CLK_PER_SEC)/tt);
   }
   }
   fclose(log);
   fclose(logb);
   fclose(logc);
   log = fopen("logs/invmod.log", "w");
   for (cnt = 4; cnt <= 128; cnt += 4) {
      SLEEP;
      mp_rand(&a, cnt);
      mp_rand(&b, cnt);
      do {
         mp_add_d(&b, 1, &b);
         mp_gcd(&a, &b, &c);
      } while (mp_cmp_d(&c, 1) != MP_EQ);
      reset();
      rr = 0;
      do {
         DO(mp_invmod(&b, &a, &c));
         rr += 16;
      } while (rdtsc() < (CLK_PER_SEC * 2));
      tt = rdtsc();
      mp_mulmod(&b, &c, &a, &d);
      if (mp_cmp_d(&d, 1) != MP_EQ) {
         printf("Failed to invert\n");
         return 0;
      }
      printf("Inverting mod\t%4d-bit => %9llu/sec, %9llu ticks\n", mp_count_bits(&a), (((ulong64)rr)*CLK_PER_SEC)/tt, tt);
      fprintf(log, "%d %9llu\n", cnt*DIGIT_BIT, (((ulong64)rr)*CLK_PER_SEC)/tt);
   }
   fclose(log);
   return 0;
 #endif
   div2_n = mul2_n = inv_n = expt_n = lcm_n = gcd_n = add_n =
   sub_n = mul_n = div_n = sqr_n = mul2d_n = div2d_n = cnt = add_d_n = sub_d_n= 0;
--- a/demo/timing.c
+++ b/demo/timing.c
@ -0,0 +1,291 @@
 #include <tommath.h>
 #include <time.h>
 ulong64 _tt;
 #ifdef IOWNANATHLON
 #include <unistd.h>
 #define SLEEP sleep(4)
 #else
 #define SLEEP
 #endif
 /* Print "<name>: <value>" on one line, with the value of `a` rendered in
  * radix 64.
  *
  * a    - the integer to display (not modified by this function itself)
  * name - label printed before the value
  *
  * The text form is built in a fixed stack buffer, so the required size is
  * queried first and a placeholder is printed instead of overrunning it.
  */
 void ndraw(mp_int *a, char *name)
 {
   char buf[4096];
   int  needed;

   printf("%s: ", name);

   /* mp_radix_size reports the byte count needed for the string form
    * (terminator included, per the LibTomMath manual) */
   if (mp_radix_size(a, 64, &needed) != MP_OKAY || needed > (int)sizeof(buf)) {
      printf("<value too large to display>\n");
      return;
   }
   if (mp_toradix(a, buf, 64) != MP_OKAY) {
      printf("<radix conversion failed>\n");
      return;
   }
   printf("%s\n", buf);
 }
 /* Convenience wrapper: dump `a` through ndraw() with an empty label. */
 static void draw(mp_int *a)
 {
   char no_label[] = "";

   ndraw(a, no_label);
 }
 /* state of the 32-bit linear feedback shift register used as a bit source */
 unsigned long lfsr = 0xAAAAAAAAUL;

 /* Step the LFSR once and return the bit that was shifted out of position 31.
  * When that bit is set the feedback constant 0x8000001B is folded in and the
  * state is masked back down to 32 bits.
  */
 int lbit(void)
 {
   const int shifted_out = (lfsr & 0x80000000UL) != 0;

   lfsr <<= 1;
   if (shifted_out) {
      lfsr = (lfsr ^ 0x8000001BUL) & 0xFFFFFFFFUL;
   }
   return shifted_out;
 }
 /*
  * TIMFUNC: the tick source used by every benchmark loop in this file.
  *
  * When __i386__, _M_IX86 or _M_AMD64 is defined it is a function returning a
  * ulong64 counter value read with compiler-specific code; on every other
  * build it is simply an alias for clock().
  */
 #if defined(__i386__) || defined(_M_IX86) || defined(_M_AMD64)
 /* RDTSC from Scott Duplichan */
 static ulong64 TIMFUNC (void)
   {
   #if defined __GNUC__
      #ifdef __i386__
         /* "=A" asks GCC for the edx:eax register pair, which is where rdtsc
          * leaves its 64-bit result on 32-bit x86 */
         ulong64 a;
         __asm__ __volatile__ ("rdtsc ":"=A" (a));
         return a;
      #else /* gcc-IA64 version */
         /* NOTE(review): this branch is selected for any GCC build that passed
          * the outer guard via _M_IX86/_M_AMD64 without __i386__ being defined,
          * yet the asm below reads the Itanium ar.itc register -- confirm that
          * such a configuration is really intended to land here. */
         unsigned long result;
         __asm__ __volatile__("mov %0=ar.itc" : "=r"(result) :: "memory");
         /* re-read while the value (truncated to int) equals -1 */
         while (__builtin_expect ((int) result == -1, 0))
         __asm__ __volatile__("mov %0=ar.itc" : "=r"(result) :: "memory");
         return result;
      #endif
   // Microsoft and Intel Windows compilers
   #elif defined _M_IX86
     /* no explicit return statement: presumably relies on the compiler
      * returning whatever rdtsc left in edx:eax -- TODO confirm */
     __asm rdtsc
   #elif defined _M_AMD64
     return __rdtsc ();
   #elif defined _M_IA64
     /* NOTE(review): the outer #if does not test _M_IA64, so this branch is
      * only reachable if one of the x86 macros is defined as well -- verify */
     #if defined __INTEL_COMPILER
       #include <ia64intrin.h>
     #endif
      return __getReg (3116);
   #else
     #error need rdtsc function for this build
   #endif
   }
 #else
 /* no cycle counter available: fall back to the C library clock() */
 #define TIMFUNC clock
 #endif
 /* DO(x): run `x` twice back to back, so one pair of timer reads brackets two
  * operations (the callers halve the measured interval accordingly).
  *
  * Wrapped in do { } while (0) so the expansion is a single statement and
  * stays correct under a brace-less if/else; invoke it as `DO(op);`.
  */
 #define DO(x) do { x; x; } while (0)
 /* heavier unrolling, left disabled (note that DO2 is not defined in this file) */
 //#define DO4(x) DO2(x); DO2(x);
 //#define DO8(x) DO4(x); DO4(x);
 //#define DO(x)  DO8(x); DO8(x);
 int main(void)
 {
   ulong64 tt, gg, CLK_PER_SEC;
   FILE *log, *logb, *logc;
   mp_int a, b, c, d, e, f;
   int n, cnt, ix, old_kara_m, old_kara_s;
   unsigned rr;
   mp_init(&a);
   mp_init(&b);
   mp_init(&c);
   mp_init(&d);
   mp_init(&e);
   mp_init(&f);
   srand(time(NULL));
      /* temp. turn off TOOM */
      TOOM_MUL_CUTOFF = TOOM_SQR_CUTOFF = 100000;
      CLK_PER_SEC = TIMFUNC();
      sleep(1);
      CLK_PER_SEC = TIMFUNC() - CLK_PER_SEC;
      printf("CLK_PER_SEC == %llu\n", CLK_PER_SEC);
      log = fopen("logs/add.log", "w");
      for (cnt = 8; cnt <= 128; cnt += 8) {
         SLEEP;
         mp_rand(&a, cnt);
         mp_rand(&b, cnt);
         rr = 0;
         tt = -1;
         do {
            gg = TIMFUNC();
            DO(mp_add(&a,&b,&c));
            gg = (TIMFUNC() - gg)>>1;
            if (tt > gg) tt = gg;
         } while (++rr < 100000);
         printf("Adding\t\t%4d-bit => %9llu/sec, %9llu cycles\n", mp_count_bits(&a), CLK_PER_SEC/tt, tt);
         fprintf(log, "%d %9llu\n", cnt*DIGIT_BIT, tt); fflush(log);
      }
      fclose(log);
      log = fopen("logs/sub.log", "w");
      for (cnt = 8; cnt <= 128; cnt += 8) {
         SLEEP;
         mp_rand(&a, cnt);
         mp_rand(&b, cnt);
         rr = 0;
         tt = -1;
         do {
            gg = TIMFUNC();
            DO(mp_sub(&a,&b,&c));
            gg = (TIMFUNC() - gg)>>1;
            if (tt > gg) tt = gg;
         } while (++rr < 100000);
         printf("Subtracting\t\t%4d-bit => %9llu/sec, %9llu cycles\n", mp_count_bits(&a), CLK_PER_SEC/tt, tt);
         fprintf(log, "%d %9llu\n", cnt*DIGIT_BIT, tt);  fflush(log);
      }
      fclose(log);
   /* do mult/square twice, first without karatsuba and second with */
   old_kara_m = KARATSUBA_MUL_CUTOFF;
   old_kara_s = KARATSUBA_SQR_CUTOFF;
   for (ix = 0; ix < 1; ix++) {
      printf("With%s Karatsuba\n", (ix==0)?"out":"");
      KARATSUBA_MUL_CUTOFF = (ix==0)?9999:old_kara_m;
      KARATSUBA_SQR_CUTOFF = (ix==0)?9999:old_kara_s;
      log = fopen((ix==0)?"logs/mult.log":"logs/mult_kara.log", "w");
      for (cnt = 32; cnt <= 288; cnt += 8) {
         SLEEP;
         mp_rand(&a, cnt);
         mp_rand(&b, cnt);
         rr = 0;
         tt = -1;
         do {
            gg = TIMFUNC();
            DO(mp_mul(&a, &b, &c));
            gg = (TIMFUNC() - gg)>>1;
            if (tt > gg) tt = gg;
         } while (++rr < 100);
         printf("Multiplying\t%4d-bit => %9llu/sec, %9llu cycles\n", mp_count_bits(&a), CLK_PER_SEC/tt, tt);
         fprintf(log, "%d %9llu\n", mp_count_bits(&a), tt);  fflush(log);
      }
      fclose(log);
      log = fopen((ix==0)?"logs/sqr.log":"logs/sqr_kara.log", "w");
      for (cnt = 32; cnt <= 288; cnt += 8) {
         SLEEP;
         mp_rand(&a, cnt);
         rr = 0;
         tt = -1;
         do {
            gg = TIMFUNC();
            DO(mp_sqr(&a, &b));
            gg = (TIMFUNC() - gg)>>1;
            if (tt > gg) tt = gg;
         } while (++rr < 100);
         printf("Squaring\t%4d-bit => %9llu/sec, %9llu cycles\n", mp_count_bits(&a), CLK_PER_SEC/tt, tt);
         fprintf(log, "%d %9llu\n", mp_count_bits(&a), tt);  fflush(log);
      }
      fclose(log);
   }
  {
      char *primes[] = {
         /* 2K moduli mersenne primes */
         "6864797660130609714981900799081393217269435300143305409394463459185543183397656052122559640661454554977296311391480858037121987999716643812574028291115057151",
         "531137992816767098689588206552468627329593117727031923199444138200403559860852242739162502265229285668889329486246501015346579337652707239409519978766587351943831270835393219031728127",
         "10407932194664399081925240327364085538615262247266704805319112350403608059673360298012239441732324184842421613954281007791383566248323464908139906605677320762924129509389220345773183349661583550472959420547689811211693677147548478866962501384438260291732348885311160828538416585028255604666224831890918801847068222203140521026698435488732958028878050869736186900714720710555703168729087",
         "1475979915214180235084898622737381736312066145333169775147771216478570297878078949377407337049389289382748507531496480477281264838760259191814463365330269540496961201113430156902396093989090226259326935025281409614983499388222831448598601834318536230923772641390209490231836446899608210795482963763094236630945410832793769905399982457186322944729636418890623372171723742105636440368218459649632948538696905872650486914434637457507280441823676813517852099348660847172579408422316678097670224011990280170474894487426924742108823536808485072502240519452587542875349976558572670229633962575212637477897785501552646522609988869914013540483809865681250419497686697771007",
         "259117086013202627776246767922441530941818887553125427303974923161874019266586362086201209516800483406550695241733194177441689509238807017410377709597512042313066624082916353517952311186154862265604547691127595848775610568757931191017711408826252153849035830401185072116424747461823031471398340229288074545677907941037288235820705892351068433882986888616658650280927692080339605869308790500409503709875902119018371991620994002568935113136548829739112656797303241986517250116412703509705427773477972349821676443446668383119322540099648994051790241624056519054483690809616061625743042361721863339415852426431208737266591962061753535748892894599629195183082621860853400937932839420261866586142503251450773096274235376822938649407127700846077124211823080804139298087057504713825264571448379371125032081826126566649084251699453951887789613650248405739378594599444335231188280123660406262468609212150349937584782292237144339628858485938215738821232393687046160677362909315071",
         "190797007524439073807468042969529173669356994749940177394741882673528979787005053706368049835514900244303495954950709725762186311224148828811920216904542206960744666169364221195289538436845390250168663932838805192055137154390912666527533007309292687539092257043362517857366624699975402375462954490293259233303137330643531556539739921926201438606439020075174723029056838272505051571967594608350063404495977660656269020823960825567012344189908927956646011998057988548630107637380993519826582389781888135705408653045219655801758081251164080554609057468028203308718724654081055323215860189611391296030471108443146745671967766308925858547271507311563765171008318248647110097614890313562856541784154881743146033909602737947385055355960331855614540900081456378659068370317267696980001187750995491090350108417050917991562167972281070161305972518044872048331306383715094854938415738549894606070722584737978176686422134354526989443028353644037187375385397838259511833166416134323695660367676897722287918773420968982326089026150031515424165462111337527431154890666327374921446276833564519776797633875503548665093914556482031482248883127023777039667707976559857333357013727342079099064400455741830654320379350833236245819348824064783585692924881021978332974949906122664421376034687815350484991",
         /* DR moduli */
         "14059105607947488696282932836518693308967803494693489478439861164411992439598399594747002144074658928593502845729752797260025831423419686528151609940203368612079",
         "101745825697019260773923519755878567461315282017759829107608914364075275235254395622580447400994175578963163918967182013639660669771108475957692810857098847138903161308502419410142185759152435680068435915159402496058513611411688900243039",
         "736335108039604595805923406147184530889923370574768772191969612422073040099331944991573923112581267542507986451953227192970402893063850485730703075899286013451337291468249027691733891486704001513279827771740183629161065194874727962517148100775228363421083691764065477590823919364012917984605619526140821797602431",
         "38564998830736521417281865696453025806593491967131023221754800625044118265468851210705360385717536794615180260494208076605798671660719333199513807806252394423283413430106003596332513246682903994829528690198205120921557533726473585751382193953592127439965050261476810842071573684505878854588706623484573925925903505747545471088867712185004135201289273405614415899438276535626346098904241020877974002916168099951885406379295536200413493190419727789712076165162175783",
         "542189391331696172661670440619180536749994166415993334151601745392193484590296600979602378676624808129613777993466242203025054573692562689251250471628358318743978285860720148446448885701001277560572526947619392551574490839286458454994488665744991822837769918095117129546414124448777033941223565831420390846864429504774477949153794689948747680362212954278693335653935890352619041936727463717926744868338358149568368643403037768649616778526013610493696186055899318268339432671541328195724261329606699831016666359440874843103020666106568222401047720269951530296879490444224546654729111504346660859907296364097126834834235287147",
         "1487259134814709264092032648525971038895865645148901180585340454985524155135260217788758027400478312256339496385275012465661575576202252063145698732079880294664220579764848767704076761853197216563262660046602703973050798218246170835962005598561669706844469447435461092542265792444947706769615695252256130901271870341005768912974433684521436211263358097522726462083917939091760026658925757076733484173202927141441492573799914240222628795405623953109131594523623353044898339481494120112723445689647986475279242446083151413667587008191682564376412347964146113898565886683139407005941383669325997475076910488086663256335689181157957571445067490187939553165903773554290260531009121879044170766615232300936675369451260747671432073394867530820527479172464106442450727640226503746586340279816318821395210726268291535648506190714616083163403189943334431056876038286530365757187367147446004855912033137386225053275419626102417236133948503",
         "1095121115716677802856811290392395128588168592409109494900178008967955253005183831872715423151551999734857184538199864469605657805519106717529655044054833197687459782636297255219742994736751541815269727940751860670268774903340296040006114013971309257028332849679096824800250742691718610670812374272414086863715763724622797509437062518082383056050144624962776302147890521249477060215148275163688301275847155316042279405557632639366066847442861422164832655874655824221577849928863023018366835675399949740429332468186340518172487073360822220449055340582568461568645259954873303616953776393853174845132081121976327462740354930744487429617202585015510744298530101547706821590188733515880733527449780963163909830077616357506845523215289297624086914545378511082534229620116563260168494523906566709418166011112754529766183554579321224940951177394088465596712620076240067370589036924024728375076210477267488679008016579588696191194060127319035195370137160936882402244399699172017835144537488486396906144217720028992863941288217185353914991583400421682751000603596655790990815525126154394344641336397793791497068253936771017031980867706707490224041075826337383538651825493679503771934836094655802776331664261631740148281763487765852746577808019633679",
         /* generic unrestricted moduli */
         "17933601194860113372237070562165128350027320072176844226673287945873370751245439587792371960615073855669274087805055507977323024886880985062002853331424203",
         "2893527720709661239493896562339544088620375736490408468011883030469939904368086092336458298221245707898933583190713188177399401852627749210994595974791782790253946539043962213027074922559572312141181787434278708783207966459019479487",
         "347743159439876626079252796797422223177535447388206607607181663903045907591201940478223621722118173270898487582987137708656414344685816179420855160986340457973820182883508387588163122354089264395604796675278966117567294812714812796820596564876450716066283126720010859041484786529056457896367683122960411136319",
         "47266428956356393164697365098120418976400602706072312735924071745438532218237979333351774907308168340693326687317443721193266215155735814510792148768576498491199122744351399489453533553203833318691678263241941706256996197460424029012419012634671862283532342656309677173602509498417976091509154360039893165037637034737020327399910409885798185771003505320583967737293415979917317338985837385734747478364242020380416892056650841470869294527543597349250299539682430605173321029026555546832473048600327036845781970289288898317888427517364945316709081173840186150794397479045034008257793436817683392375274635794835245695887",
         "436463808505957768574894870394349739623346440601945961161254440072143298152040105676491048248110146278752857839930515766167441407021501229924721335644557342265864606569000117714935185566842453630868849121480179691838399545644365571106757731317371758557990781880691336695584799313313687287468894148823761785582982549586183756806449017542622267874275103877481475534991201849912222670102069951687572917937634467778042874315463238062009202992087620963771759666448266532858079402669920025224220613419441069718482837399612644978839925207109870840278194042158748845445131729137117098529028886770063736487420613144045836803985635654192482395882603511950547826439092832800532152534003936926017612446606135655146445620623395788978726744728503058670046885876251527122350275750995227",
         "11424167473351836398078306042624362277956429440521137061889702611766348760692206243140413411077394583180726863277012016602279290144126785129569474909173584789822341986742719230331946072730319555984484911716797058875905400999504305877245849119687509023232790273637466821052576859232452982061831009770786031785669030271542286603956118755585683996118896215213488875253101894663403069677745948305893849505434201763745232895780711972432011344857521691017896316861403206449421332243658855453435784006517202894181640562433575390821384210960117518650374602256601091379644034244332285065935413233557998331562749140202965844219336298970011513882564935538704289446968322281451907487362046511461221329799897350993370560697505809686438782036235372137015731304779072430260986460269894522159103008260495503005267165927542949439526272736586626709581721032189532726389643625590680105784844246152702670169304203783072275089194754889511973916207",
         "1214855636816562637502584060163403830270705000634713483015101384881871978446801224798536155406895823305035467591632531067547890948695117172076954220727075688048751022421198712032848890056357845974246560748347918630050853933697792254955890439720297560693579400297062396904306270145886830719309296352765295712183040773146419022875165382778007040109957609739589875590885701126197906063620133954893216612678838507540777138437797705602453719559017633986486649523611975865005712371194067612263330335590526176087004421363598470302731349138773205901447704682181517904064735636518462452242791676541725292378925568296858010151852326316777511935037531017413910506921922450666933202278489024521263798482237150056835746454842662048692127173834433089016107854491097456725016327709663199738238442164843147132789153725513257167915555162094970853584447993125488607696008169807374736711297007473812256272245489405898470297178738029484459690836250560495461579533254473316340608217876781986188705928270735695752830825527963838355419762516246028680280988020401914551825487349990306976304093109384451438813251211051597392127491464898797406789175453067960072008590614886532333015881171367104445044718144312416815712216611576221546455968770801413440778423979",
         NULL
      };
   log = fopen("logs/expt.log", "w");
   logb = fopen("logs/expt_dr.log", "w");
   logc = fopen("logs/expt_2k.log", "w");
   for (n = 0; primes[n]; n++) {
      SLEEP;
      mp_read_radix(&a, primes[n], 10);
      mp_zero(&b);
      for (rr = 0; rr < (unsigned)mp_count_bits(&a); rr++) {
         mp_mul_2(&b, &b);
         b.dp[0] |= lbit();
         b.used  += 1;
      }
      mp_sub_d(&a, 1, &c);
      mp_mod(&b, &c, &b);
      mp_set(&c, 3);
         rr = 0;
         tt = -1;
         do {
            gg = TIMFUNC();
            DO(mp_exptmod(&c, &b, &a, &d));
            gg = (TIMFUNC() - gg)>>1;
            if (tt > gg) tt = gg;
         } while (++rr < 10);
      mp_sub_d(&a, 1, &e);
      mp_sub(&e, &b, &b);
      mp_exptmod(&c, &b, &a, &e);  /* c^(p-1-b) mod a */
      mp_mulmod(&e, &d, &a, &d);   /* c^b * c^(p-1-b) == c^p-1 == 1 */
      if (mp_cmp_d(&d, 1)) {
         printf("Different (%d)!!!\n", mp_count_bits(&a));
         draw(&d);
         exit(0);
      }
      printf("Exponentiating\t%4d-bit => %9llu/sec, %9llu cycles\n", mp_count_bits(&a), CLK_PER_SEC/tt, tt);
      fprintf((n < 6) ? logc : (n < 13) ? logb : log, "%d %9llu\n", mp_count_bits(&a), tt);
   }
   }
   fclose(log);
   fclose(logb);
   fclose(logc);
   log = fopen("logs/invmod.log", "w");
   for (cnt = 4; cnt <= 128; cnt += 4) {
      SLEEP;
      mp_rand(&a, cnt);
      mp_rand(&b, cnt);
      do {
         mp_add_d(&b, 1, &b);
         mp_gcd(&a, &b, &c);
      } while (mp_cmp_d(&c, 1) != MP_EQ);
         rr = 0;
         tt = -1;
      do {
         gg = TIMFUNC();
         DO(mp_invmod(&b, &a, &c));
         gg = (TIMFUNC() - gg)>>1;
         if (tt > gg) tt = gg;
      } while (++rr < 1000);
      mp_mulmod(&b, &c, &a, &d);
      if (mp_cmp_d(&d, 1) != MP_EQ) {
         printf("Failed to invert\n");
         return 0;
      }
      printf("Inverting mod\t%4d-bit => %9llu/sec, %9llu cycles\n", mp_count_bits(&a), CLK_PER_SEC/tt, tt);
      fprintf(log, "%d %9llu\n", cnt*DIGIT_BIT, tt);
   }
   fclose(log);
   return 0;
 }
--- a/etc/makefile
+++ b/etc/makefile
@ -46,4 +46,5 @@ mont: mont.o
 clean:
-	rm -f *.log *.o *.obj *.exe pprime tune mersenne drprime tune86 tune86l mont 2kprime pprime.dat
+	rm -f *.log *.o *.obj *.exe pprime tune mersenne drprime tune86 tune86l mont 2kprime pprime.dat \
         *.da *.dyn *.dpi *~
--- a/etc/makefile.icc
+++ b/etc/makefile.icc
@ -0,0 +1,67 @@
 CC = icc
 CFLAGS += -I../
 # optimize for SPEED
 #
 # -mcpu= can be pentium, pentiumpro (covers PII through PIII) or pentium4
 # -ax?   specifies make code specifically for ? but compatible with IA-32
 # -x?    specifies compile solely for ? [not specifically IA-32 compatible]
 #
 # where ? is 
 #   K - PIII
 #   W - first P4 [Willamette]
 #   N - P4 Northwood
 #   P - P4 Prescott
 #   B - Blend of P4 and PM [mobile]
 #
 # Default to just generic max opts
 CFLAGS += -O3 -xN -ip
 # default lib name (requires install with root)
 # LIBNAME=-ltommath
 # libname when you can't install the lib with install
 LIBNAME=../libtommath.a
 #provable primes
 pprime: pprime.o
 	$(CC) pprime.o $(LIBNAME) -o pprime
 # portable [well requires clock()] tuning app
 tune: tune.o
 	$(CC) tune.o $(LIBNAME) -o tune
 # same app but using RDTSC for higher precision [requires 80586+], coff based gcc installs [e.g. ming, cygwin, djgpp]
 tune86: tune.c
 	nasm -f coff timer.asm
 	$(CC) -DX86_TIMER $(CFLAGS) tune.c timer.o  $(LIBNAME) -o tune86
 # for cygwin
 tune86c: tune.c
 	nasm -f gnuwin32 timer.asm
 	$(CC) -DX86_TIMER $(CFLAGS) tune.c timer.o  $(LIBNAME) -o tune86
 #make tune86 for linux or any ELF format
 tune86l: tune.c
 	nasm -f elf -DUSE_ELF timer.asm
 	$(CC) -DX86_TIMER $(CFLAGS) tune.c timer.o $(LIBNAME) -o tune86l
 # spits out mersenne primes
 mersenne: mersenne.o
 	$(CC) mersenne.o $(LIBNAME) -o mersenne
 # finds DR safe primes for the given config
 drprime: drprime.o
 	$(CC) drprime.o $(LIBNAME) -o drprime
 # finds 2k safe primes for the given config
 2kprime: 2kprime.o
 	$(CC) 2kprime.o $(LIBNAME) -o 2kprime
 mont: mont.o
 	$(CC) mont.o $(LIBNAME) -o mont
 clean:
 	rm -f *.log *.o *.obj *.exe pprime tune mersenne drprime tune86 tune86l mont 2kprime pprime.dat *.il
--- a/logs/add.log
+++ b/logs/add.log
@ -1,16 +1,16 @@
-224  20297071
+224      1572
-448  15151383
+448      1740
-672  13088682
+672      1902
-896  11111587
+896      2116
-1120   9240621
+1120      2324
-1344   8221878
+1344      2484
-1568   7227434
+1568      2548
-1792   6718051
+1792      2772
-2016   6042524
+2016      2958
-2240   5685200
+2240      3058
-2464   5240465
+2464      3276
-2688   4818032
+2688      3436
-2912   4412794
+2912      3542
-3136   4155883
+3136      3702
-3360   3927078
+3360      3926
-3584   3722138
+3584      4074
--- a/logs/addsub.png
+++ b/logs/addsub.png
--- a/logs/expt.log
+++ b/logs/expt.log
@ -1,7 +1,7 @@
-513       745
+513  19933908
-769       282
+769  55707832
-1025       130
+1025 119872576
-2049        20
+2049 856114218
-2561        11
+2561 1602741360
-3073         6
+3073 2718192748
-4097         2
+4097 6264335828
--- a/logs/expt.png
+++ b/logs/expt.png
--- a/logs/expt_2k.log
+++ b/logs/expt_2k.log
@ -1,6 +1,6 @@
-521       783
+521  18847776
-607       585
+607  24665920
-1279       138
+1279 110036220
-2203        39
+2203 414562036
-3217        15
+3217 1108350966
-4253         6
+4253 2286079370
--- a/logs/expt_dr.log
+++ b/logs/expt_dr.log
@ -1,7 +1,7 @@
-532      1296
+532   9656134
-784       551
+784  23022274
-1036       283
+1036  45227854
-1540       109
+1540 129652848
-2072        52
+2072 280625626
-3080        18
+3080 845619480
-4116         7
+4116 1866206400
--- a/logs/graphs.dem
+++ b/logs/graphs.dem
@ -1,17 +1,17 @@
-set terminal png
+set terminal png
-set size 1.75
+set size 1.75
-set ylabel "Operations per Second"
+set ylabel "Cycles per Operation"
-set xlabel "Operand size (bits)"
+set xlabel "Operand size (bits)"
-
+
-set output "addsub.png"
+set output "addsub.png"
-plot 'add.log' smooth bezier title "Addition", 'sub.log' smooth bezier title "Subtraction"
+plot 'add.log' smooth bezier title "Addition", 'sub.log' smooth bezier title "Subtraction"
-
+
-set output "mult.png"
+set output "mult.png"
-plot 'sqr.log' smooth bezier title "Squaring (without Karatsuba)", 'sqr_kara.log' smooth bezier title "Squaring (Karatsuba)", 'mult.log' smooth bezier title "Multiplication (without Karatsuba)", 'mult_kara.log' smooth bezier title "Multiplication (Karatsuba)"
+plot 'sqr.log' smooth bezier title "Squaring (without Karatsuba)", 'sqr_kara.log' smooth bezier title "Squaring (Karatsuba)", 'mult.log' smooth bezier title "Multiplication (without Karatsuba)", 'mult_kara.log' smooth bezier title "Multiplication (Karatsuba)"
-
+
-set output "expt.png"
+set output "expt.png"
-plot 'expt.log' smooth bezier title "Exptmod (Montgomery)", 'expt_dr.log' smooth bezier title "Exptmod (Dimminished Radix)", 'expt_2k.log' smooth bezier title "Exptmod (2k Reduction)"
+plot 'expt.log' smooth bezier title "Exptmod (Montgomery)", 'expt_dr.log' smooth bezier title "Exptmod (Dimminished Radix)", 'expt_2k.log' smooth bezier title "Exptmod (2k Reduction)"
-
+
-set output "invmod.png"
+set output "invmod.png"
-plot 'invmod.log' smooth bezier title "Modular Inverse"
+plot 'invmod.log' smooth bezier title "Modular Inverse"
-
+
--- a/logs/invmod.log
+++ b/logs/invmod.log
@ -1,32 +0,0 @@
 112     17364
 224      8643
 336      8867
 448      6228
 560      4737
 672      2259
 784      2899
 896      1497
 1008      1238
 1120      1010
 1232       870
 1344      1265
 1456      1102
 1568       981
 1680       539
 1792       484
 1904       722
 2016       392
 2128       604
 2240       551
 2352       511
 2464       469
 2576       263
 2688       247
 2800       227
 2912       354
 3024       336
 3136       312
 3248       296
 3360       166
 3472       155
 3584       248
--- a/logs/invmod.png
+++ b/logs/invmod.png
--- a/logs/k7/README
+++ b/logs/k7/README
@ -1,13 +0,0 @@
 To use the pretty graphs you have to first build/run the ltmtest from the root directory of the package.  
 Todo this type 
 make timing ; ltmtest
 in the root.  It will run for a while [about ten minutes on most PCs] and produce a series of .log files in logs/.
 After doing that run "gnuplot graphs.dem" to make the PNGs.  If you managed todo that all so far just open index.html to view
 them all :-)
 Have fun
 Tom
--- a/logs/k7/add.log
+++ b/logs/k7/add.log
@ -1,16 +0,0 @@
 224  11069160
 448   9156136
 672   8089755
 896   7399424
 1120   6389352
 1344   5818648
 1568   5257112
 1792   4982160
 2016   4527856
 2240   4325312
 2464   4051760
 2688   3767640
 2912   3612520
 3136   3415208
 3360   3258656
 3584   3113360
--- a/logs/k7/addsub.png
+++ b/logs/k7/addsub.png
--- a/logs/k7/expt.log
+++ b/logs/k7/expt.log
@ -1,7 +0,0 @@
 513       664
 769       256
 1025       117
 2049        17
 2561         9
 3073         5
 4097         2
--- a/logs/k7/expt.png
+++ b/logs/k7/expt.png
--- a/logs/k7/expt_dr.log
+++ b/logs/k7/expt_dr.log
@ -1,7 +0,0 @@
 532      1088
 784       460
 1036       240
 1540        92
 2072        43
 3080        15
 4116         6
--- a/logs/k7/graphs.dem
+++ b/logs/k7/graphs.dem
@ -1,17 +0,0 @@
 set terminal png color
 set size 1.75
 set ylabel "Operations per Second"
 set xlabel "Operand size (bits)"
 set output "addsub.png"
 plot 'add.log' smooth bezier title "Addition", 'sub.log' smooth bezier title "Subtraction"
 set output "mult.png"
 plot 'sqr.log' smooth bezier title "Squaring (without Karatsuba)", 'sqr_kara.log' smooth bezier title "Squaring (Karatsuba)", 'mult.log' smooth bezier title "Multiplication (without Karatsuba)", 'mult_kara.log' smooth bezier title "Multiplication (Karatsuba)"
 set output "expt.png"
 plot 'expt.log' smooth bezier title "Exptmod (Montgomery)", 'expt_dr.log' smooth bezier title "Exptmod (Dimminished Radix)"
 set output "invmod.png"
 plot 'invmod.log' smooth bezier title "Modular Inverse"
--- a/logs/k7/index.html
+++ b/logs/k7/index.html
@ -1,24 +0,0 @@
 <html>
 <head>
 <title>LibTomMath Log Plots</title>
 </head>
 <body>
 <h1>Addition and Subtraction</h1>
 <center><img src=addsub.png></center>
 <hr>
 <h1>Multipliers</h1>
 <center><img src=mult.png></center>
 <hr>
 <h1>Exptmod</h1>
 <center><img src=expt.png></center>
 <hr>
 <h1>Modular Inverse</h1>
 <center><img src=invmod.png></center>
 <hr>
 </body>
 </html>
--- a/logs/k7/invmod.log
+++ b/logs/k7/invmod.log
@ -1,32 +0,0 @@
 112     16248
 224      8192
 336      5320
 448      3560
 560      2728
 672      2064
 784      1704
 896      2176
 1008      1184
 1120       976
 1232      1280
 1344      1176
 1456       624
 1568       912
 1680       504
 1792       452
 1904       658
 2016       608
 2128       336
 2240       312
 2352       288
 2464       264
 2576       408
 2688       376
 2800       354
 2912       198
 3024       307
 3136       173
 3248       162
 3360       256
 3472       145
 3584       226
--- a/logs/k7/invmod.png
+++ b/logs/k7/invmod.png
--- a/logs/k7/mult.log
+++ b/logs/k7/mult.log
@ -1,17 +0,0 @@
 896    322904
 1344    151592
 1792     90472
 2240     59984
 2688     42624
 3136     31872
 3584     24704
 4032     19704
 4480     16096
 4928     13376
 5376     11272
 5824      9616
 6272      8360
 6720      7304
 7168      1664
 7616      1472
 8064      1328
--- a/logs/k7/mult.png
+++ b/logs/k7/mult.png
--- a/logs/k7/mult_kara.log
+++ b/logs/k7/mult_kara.log
@ -1,17 +0,0 @@
 896    322872
 1344    151688
 1792     90480
 2240     59984
 2688     42656
 3136     32144
 3584     25840
 4032     21328
 4480     17856
 4928     14928
 5376     12856
 5824     11256
 6272      9880
 6720      8984
 7168      7928
 7616      7200
 8064      6576
--- a/logs/k7/sqr.log
+++ b/logs/k7/sqr.log
@ -1,17 +0,0 @@
 896    415472
 1344    223736
 1792    141232
 2240     97624
 2688     71400
 3136     54800
 3584     16904
 4032     13528
 4480     10968
 4928      9128
 5376      7784
 5824      6672
 6272      5760
 6720      5056
 7168      4440
 7616      3952
 8064      3512
--- a/logs/k7/sqr_kara.log
+++ b/logs/k7/sqr_kara.log
@ -1,17 +0,0 @@
 896    420464
 1344    224800
 1792    142808
 2240     97704
 2688     71416
 3136     54504
 3584     38320
 4032     32360
 4480     27576
 4928     23840
 5376     20688
 5824     18264
 6272     16176
 6720     14440
 7168     11688
 7616     10752
 8064      9936
--- a/logs/k7/sub.log
+++ b/logs/k7/sub.log
@ -1,16 +0,0 @@
 224   9728504
 448   8573648
 672   7488096
 896   6714064
 1120   5950472
 1344   5457400
 1568   5038896
 1792   4683632
 2016   4384656
 2240   4105976
 2464   3871608
 2688   3650680
 2912   3463552
 3136   3290016
 3360   3135272
 3584   2993848
--- a/logs/mult.log
+++ b/logs/mult.log
@ -1,33 +1,33 @@
-920    374785
+923     45612
-1142    242737
+1143     68010
-1371    176704
+1370     94894
-1596    134341
+1596    126514
-1816    105537
+1820    163014
-2044     85089
+2044    203564
-2268     70051
+2268    249156
-2490     58671
+2492    299226
-2716     49851
+2716    354138
-2937     42881
+2940    413022
-3162     37288
+3163    477406
-3387     32697
+3387    545876
-3608     28915
+3612    619044
-3836     25759
+3835    696754
-4057     23088
+4060    779174
-4284     20800
+4284    866216
-4508     18827
+4508    958100
-4730     17164
+4731   1055898
-4956     15689
+4954   1162294
-5180     14397
+5179   1267654
-5398     13260
+5404   1377572
-5628     12249
+5628   1503736
-5852     11346
+5852   1622310
-6071     10537
+6076   1746624
-6298      9812
+6299   1875390
-6522      9161
+6524   2009086
-6742      8572
+6748   2145990
-6971      8038
+6971   2289044
-7195      2915
+7196   2891644
-7419      2744
+7418   3064792
-7644      2587
+7644   3249780
-7866      2444
+7868   3455868
-8090      2311
+8092   3644238
--- a/logs/mult.png
+++ b/logs/mult.png
--- a/logs/mult_kara.log
+++ b/logs/mult_kara.log
@ -1,33 +1,33 @@
-924    374171
+921     92388
-1147    243163
+1148     61410
-1371    177111
+1372     43799
-1596    134465
+1594     33047
-1819    105619
+1819     26913
-2044     85145
+2043     21996
-2266     70086
+2268     18453
-2488     58717
+2492     15623
-2715     49869
+2715     13378
-2939     42894
+2940     11626
-3164     37389
+3164     10252
-3387     33510
+3385      9291
-3610     29993
+3610      8348
-3836     27205
+3835      7615
-4060     24751
+4060      6928
-4281     22576
+4283      6401
-4508     20670
+4508      5836
-4732     19019
+4732      5387
-4954     17527
+4955      4985
-5180     16217
+5178      4614
-5404     15044
+5404      4300
-5624     14003
+5622      4005
-5849     13051
+5852      3742
-6076     12067
+6073      3502
-6300     11438
+6298      3262
-6524     10772
+6524      3137
-6748     10298
+6748      2967
-6972      9715
+6971      2807
-7195      9330
+7195      2679
-7416      8836
+7420      2571
-7644      8465
+7643      2442
-7864      8042
+7867      2324
-8091      7735
+8091      2235
--- a/logs/p4/README
+++ b/logs/p4/README
@ -1,13 +0,0 @@
 To use the pretty graphs you have to first build/run the ltmtest from the root directory of the package.  
 Todo this type 
 make timing ; ltmtest
 in the root.  It will run for a while [about ten minutes on most PCs] and produce a series of .log files in logs/.
 After doing that run "gnuplot graphs.dem" to make the PNGs.  If you managed todo that all so far just open index.html to view
 them all :-)
 Have fun
 Tom
--- a/logs/p4/add.log
+++ b/logs/p4/add.log
@ -1,16 +0,0 @@
 224   8113248
 448   6585584
 672   5687678
 896   4761144
 1120   4111592
 1344   3995154
 1568   3532387
 1792   3225400
 2016   2963960
 2240   2720112
 2464   2533952
 2688   2307168
 2912   2287064
 3136   2150160
 3360   2035992
 3584   1936304
--- a/logs/p4/addsub.png
+++ b/logs/p4/addsub.png
--- a/logs/p4/expt.log
+++ b/logs/p4/expt.log
@ -1,7 +0,0 @@
 513       195
 769        68
 1025        31
 2049         4
 2561         2
 3073         1
 4097         0
--- a/logs/p4/expt.png
+++ b/logs/p4/expt.png
--- a/logs/p4/expt_dr.log
+++ b/logs/p4/expt_dr.log
@ -1,7 +0,0 @@
 532       393
 784       158
 1036        79
 1540        27
 2072        12
 3080         4
 4116         1
--- a/logs/p4/graphs.dem
+++ b/logs/p4/graphs.dem
@ -1,17 +0,0 @@
 set terminal png color
 set size 1.75
 set ylabel "Operations per Second"
 set xlabel "Operand size (bits)"
 set output "addsub.png"
 plot 'add.log' smooth bezier title "Addition", 'sub.log' smooth bezier title "Subtraction"
 set output "mult.png"
 plot 'sqr.log' smooth bezier title "Squaring (without Karatsuba)", 'sqr_kara.log' smooth bezier title "Squaring (Karatsuba)", 'mult.log' smooth bezier title "Multiplication (without Karatsuba)", 'mult_kara.log' smooth bezier title "Multiplication (Karatsuba)"
 set output "expt.png"
 plot 'expt.log' smooth bezier title "Exptmod (Montgomery)", 'expt_dr.log' smooth bezier title "Exptmod (Dimminished Radix)"
 set output "invmod.png"
 plot 'invmod.log' smooth bezier title "Modular Inverse"
--- a/logs/p4/index.html
+++ b/logs/p4/index.html
@ -1,24 +0,0 @@
 <html>
 <head>
 <title>LibTomMath Log Plots</title>
 </head>
 <body>
 <h1>Addition and Subtraction</h1>
 <center><img src=addsub.png></center>
 <hr>
 <h1>Multipliers</h1>
 <center><img src=mult.png></center>
 <hr>
 <h1>Exptmod</h1>
 <center><img src=expt.png></center>
 <hr>
 <h1>Modular Inverse</h1>
 <center><img src=invmod.png></center>
 <hr>
 </body>
 </html>
--- a/logs/p4/invmod.log
+++ b/logs/p4/invmod.log
@ -1,32 +0,0 @@
 112     13608
 224      6872
 336      4264
 448      2792
 560      2144
 672      1560
 784      1296
 896      1672
 1008       896
 1120       736
 1232      1024
 1344       888
 1456       472
 1568       680
 1680       373
 1792       328
 1904       484
 2016       436
 2128       232
 2240       211
 2352       200
 2464       177
 2576       293
 2688       262
 2800       251
 2912       137
 3024       216
 3136       117
 3248       113
 3360       181
 3472        98
 3584       158
--- a/logs/p4/invmod.png
+++ b/logs/p4/invmod.png
--- a/logs/p4/mult.log
+++ b/logs/p4/mult.log
@ -1,17 +0,0 @@
 896     77600
 1344     35776
 1792     19688
 2240     13248
 2688      9424
 3136      7056
 3584      5464
 4032      4368
 4480      3568
 4928      2976
 5376      2520
 5824      2152
 6272      1872
 6720      1632
 7168       650
 7616       576
 8064       515
--- a/logs/p4/mult.png
+++ b/logs/p4/mult.png
--- a/logs/p4/mult_kara.log
+++ b/logs/p4/mult_kara.log
@ -1,17 +0,0 @@
 896     77752
 1344     35832
 1792     19688
 2240     14704
 2688     10832
 3136      8336
 3584      6600
 4032      5424
 4480      4648
 4928      3976
 5376      3448
 5824      3016
 6272      2664
 6720      2384
 7168      2120
 7616      1912
 8064      1752
--- a/logs/p4/sqr.log
+++ b/logs/p4/sqr.log
@ -1,17 +0,0 @@
 896    128088
 1344     63640
 1792     37968
 2240     25488
 2688     18176
 3136     13672
 3584      4920
 4032      3912
 4480      3160
 4928      2616
 5376      2216
 5824      1896
 6272      1624
 6720      1408
 7168      1240
 7616      1096
 8064       984
--- a/logs/p4/sqr_kara.log
+++ b/logs/p4/sqr_kara.log
@ -1,17 +0,0 @@
 896    127456
 1344     63752
 1792     37920
 2240     25440
 2688     18200
 3136     13728
 3584     10968
 4032      9072
 4480      7608
 4928      6440
 5376      5528
 5824      4768
 6272      4328
 6720      3888
 7168      3504
 7616      3176
 8064      2896
--- a/logs/p4/sub.log
+++ b/logs/p4/sub.log
@ -1,16 +0,0 @@
 224   7355896
 448   6162880
 672   5218984
 896   4622776
 1120   3999320
 1344   3629480
 1568   3290384
 1792   2954752
 2016   2737056
 2240   2563320
 2464   2451928
 2688   2310920
 2912   2139048
 3136   2034080
 3360   1890800
 3584   1808624
--- a/logs/sqr.log
+++ b/logs/sqr.log
@ -1,33 +1,33 @@
-922    471095
+924     26026
-1147    337137
+1146     37682
-1366    254327
+1370     51714
-1596    199732
+1595     68130
-1819    161225
+1820     86850
-2044    132852
+2043    107880
-2268    111493
+2267    131236
-2490     94864
+2490    156828
-2715     81745
+2716    184704
-2940     71187
+2940    214934
-3162     62575
+3162    247424
-3387     55418
+3388    282494
-3612     14540
+3608    308390
-3836     12944
+3834    345978
-4060     11627
+4060    386156
-4281     10546
+4282    427648
-4508      9502
+4505    471556
-4730      8688
+4731    517948
-4954      7937
+4954    566396
-5180      7273
+5180    618292
-5402      6701
+5402    670130
-5627      6189
+5628    725674
-5850      5733
+5852    783310
-6076      5310
+6076    843480
-6300      4933
+6300    905136
-6522      4631
+6524    969132
-6748      4313
+6748   1033680
-6971      4064
+6971   1100912
-7196      3801
+7195   1170954
-7420      3576
+7420   1252576
-7642      3388
+7643   1325038
-7868      3191
+7867   1413890
-8092      3020
+8091   1493140
--- a/logs/sqr_kara.log
+++ b/logs/sqr_kara.log
@ -1,33 +1,33 @@
-922    470930
+923    165854
-1148    337217
+1146    112539
-1372    254433
+1372     80388
-1596    199827
+1595     60051
-1820    161204
+1820     47498
-2043    132871
+2044     38017
-2267    111522
+2268     31935
-2488     94932
+2492     27373
-2714     81814
+2714     23798
-2939     71231
+2939     20630
-3164     62616
+3164     18198
-3385     55467
+3388     16191
-3611     44426
+3612     14538
-3836     40695
+3836     13038
-4060     37391
+4058     11683
-4283     34371
+4284     10915
-4508     31779
+4508      9998
-4732     29499
+4731      9271
-4956     27426
+4954      8555
-5177     25598
+5180      7910
-5403     23944
+5404      7383
-5628     22416
+5628      7012
-5851     21052
+5852      6527
-6076     19781
+6075      6175
-6299     18588
+6299      5737
-6523     17539
+6524      5398
-6746     16618
+6744      5110
-6972     15705
+6971      4864
-7196     13582
+7196      4567
-7420     13004
+7420      4371
-7643     12496
+7644      4182
-7868     11963
+7868      3981
-8092     11497
+8092      3758
--- a/logs/sub.log
+++ b/logs/sub.log
@ -1,16 +1,16 @@
-224  16370431
+224      2012
-448  13327848
+448      2208
-672  11009401
+672      2366
-896   9125342
+896      2532
-1120   7930419
+1120      2682
-1344   7114040
+1344      2838
-1568   6506998
+1568      3016
-1792   5899346
+1792      3146
-2016   5435327
+2016      3318
-2240   5038931
+2240      3538
-2464   4696364
+2464      3756
-2688   4425678
+2688      3914
-2912   4134476
+2912      4060
-3136   3913280
+3136      4216
-3360   3692536
+3360      4392
-3584   3505219
+3584      4550
--- a/40
+++ b/40
@ -12,7 +12,10 @@ CFLAGS += -O3 -funroll-loops
 #x86 optimizations [should be valid for any GCC install though]
 CFLAGS  += -fomit-frame-pointer
-VERSION=0.30
+#debug
 #CFLAGS += -g3
 VERSION=0.31
 default: libtommath.a
@ -20,7 +23,7 @@ default: libtommath.a
 LIBNAME=libtommath.a
 HEADERS=tommath.h
-#LIBPATH-The directory for libtomcrypt to be installed to.
+#LIBPATH-The directory for libtommath to be installed to.
 #INCPATH-The directory to install the header files for libtommath.
 #DATAPATH-The directory to install the pdf docs.
 DESTDIR=
@ -58,6 +61,30 @@ libtommath.a:  $(OBJECTS)
 	$(AR) $(ARFLAGS) libtommath.a $(OBJECTS)
 	ranlib libtommath.a
 #make a profiled library (takes a while!!!)
 #
 # This will build the library with profile generation
 # then run the test demo and rebuild the library.
 # 
 # So far I've seen improvements in the MP math
 profiled:
 	make CFLAGS="$(CFLAGS) -fprofile-arcs -DTESTING" timing
 	./ltmtest
 	rm -f *.a *.o ltmtest
 	make CFLAGS="$(CFLAGS) -fbranch-probabilities"
 #make a single object profiled library 
 profiled_single:
 	perl gen.pl
 	$(CC) $(CFLAGS) -fprofile-arcs -DTESTING -c mpi.c -o mpi.o
 	$(CC) $(CFLAGS) -DTESTING -DTIMER demo/timing.c mpi.o -o ltmtest
 	./ltmtest
 	rm -f *.o ltmtest
 	$(CC) $(CFLAGS) -fbranch-probabilities -DTESTING -c mpi.c -o mpi.o
 	$(AR) $(ARFLAGS) libtommath.a mpi.o
 	ranlib libtommath.a	
 install: libtommath.a
 	install -d -g root -o root $(DESTDIR)$(LIBPATH)
 	install -d -g root -o root $(DESTDIR)$(INCPATH)
@ -71,7 +98,7 @@ mtest: test
 	cd mtest ; $(CC) $(CFLAGS) mtest.c -o mtest -s
 timing: libtommath.a
-	$(CC) $(CFLAGS) -DTIMER demo/demo.c libtommath.a -o ltmtest -s
+	$(CC) $(CFLAGS) -DTIMER demo/timing.c libtommath.a -o ltmtest -s
 # makes the LTM book DVI file, requires tetex, perl and makeindex [part of tetex I think]
 docdvi: tommath.src
@ -106,10 +133,13 @@ mandvi: bn.tex
 manual:	mandvi
 	pdflatex bn >/dev/null
 	rm -f bn.aux bn.dvi bn.log bn.idx bn.lof bn.out bn.toc
-	
+
 pretty: 
 	perl pretty.build
 clean:
 	rm -f *.bat *.pdf *.o *.a *.obj *.lib *.exe *.dll etclib/*.o demo/demo.o test ltmtest mpitest mtest/mtest mtest/mtest.exe \
-        *.idx *.toc *.log *.aux *.dvi *.lof *.ind *.ilg *.ps *.log *.s mpi.c 
+        *.idx *.toc *.log *.aux *.dvi *.lof *.ind *.ilg *.ps *.log *.s mpi.c *.da *.dyn *.dpi tommath.tex *~ demo/*~ etc/*~
 	cd etc ; make clean
 	cd pics ; make clean
--- a/makefile.bcc
+++ b/makefile.bcc
@ -30,7 +30,8 @@ bn_mp_reduce_2k.obj bn_mp_reduce_is_2k.obj bn_mp_reduce_2k_setup.obj \
 bn_mp_radix_smap.obj bn_mp_read_radix.obj bn_mp_toradix.obj bn_mp_radix_size.obj \
 bn_mp_fread.obj bn_mp_fwrite.obj bn_mp_cnt_lsb.obj bn_error.obj \
 bn_mp_init_multi.obj bn_mp_clear_multi.obj bn_prime_sizes_tab.obj bn_mp_exteuclid.obj bn_mp_toradix_n.obj \
-bn_mp_prime_random_ex.obj bn_mp_get_int.obj bn_mp_sqrt.obj bn_mp_is_square.obj
+bn_mp_prime_random_ex.obj bn_mp_get_int.obj bn_mp_sqrt.obj bn_mp_is_square.obj \
 bn_mp_init_set.obj bn_mp_init_set_int.obj 
 TARGET = libtommath.lib
--- a/makefile.cygwin_dll
+++ b/makefile.cygwin_dll
@ -35,7 +35,8 @@ bn_mp_reduce_2k.o bn_mp_reduce_is_2k.o bn_mp_reduce_2k_setup.o \
 bn_mp_radix_smap.o bn_mp_read_radix.o bn_mp_toradix.o bn_mp_radix_size.o \
 bn_mp_fread.o bn_mp_fwrite.o bn_mp_cnt_lsb.o bn_error.o \
 bn_mp_init_multi.o bn_mp_clear_multi.o bn_prime_sizes_tab.o bn_mp_exteuclid.o bn_mp_toradix_n.o \
-bn_mp_prime_random_ex.o bn_mp_get_int.o bn_mp_sqrt.o bn_mp_is_square.o
+bn_mp_prime_random_ex.o bn_mp_get_int.o bn_mp_sqrt.o bn_mp_is_square.o bn_mp_init_set.o \
 bn_mp_init_set_int.o
 # make a Windows DLL via Cygwin
 windll:  $(OBJECTS)
--- a/makefile.icc
+++ b/makefile.icc
@ -0,0 +1,110 @@
 #Makefile for ICC
 #
 #Tom St Denis
 CC=icc
 CFLAGS  +=  -I./
 # optimize for SPEED
 #
 # -mcpu= can be pentium, pentiumpro (covers PII through PIII) or pentium4
 # -ax?   specifies make code specifically for ? but compatible with IA-32
 # -x?    specifies compile solely for ? [not specifically IA-32 compatible]
 #
 # where ? is 
 #   K - PIII
 #   W - first P4 [Willamette]
 #   N - P4 Northwood
 #   P - P4 Prescott
 #   B - Blend of P4 and PM [mobile]
 #
 # Default to just generic max opts
 CFLAGS += -O3 -xN
 default: libtommath.a
 #default files to install
 LIBNAME=libtommath.a
 HEADERS=tommath.h
 #LIBPATH-The directory for libtommath to be installed to.
 #INCPATH-The directory to install the header files for libtommath.
 #DATAPATH-The directory to install the pdf docs.
 DESTDIR=
 LIBPATH=/usr/lib
 INCPATH=/usr/include
 DATAPATH=/usr/share/doc/libtommath/pdf
 OBJECTS=bncore.o bn_mp_init.o bn_mp_clear.o bn_mp_exch.o bn_mp_grow.o bn_mp_shrink.o \
 bn_mp_clamp.o bn_mp_zero.o  bn_mp_set.o bn_mp_set_int.o bn_mp_init_size.o bn_mp_copy.o \
 bn_mp_init_copy.o bn_mp_abs.o bn_mp_neg.o bn_mp_cmp_mag.o bn_mp_cmp.o bn_mp_cmp_d.o \
 bn_mp_rshd.o bn_mp_lshd.o bn_mp_mod_2d.o bn_mp_div_2d.o bn_mp_mul_2d.o bn_mp_div_2.o \
 bn_mp_mul_2.o bn_s_mp_add.o bn_s_mp_sub.o bn_fast_s_mp_mul_digs.o bn_s_mp_mul_digs.o \
 bn_fast_s_mp_mul_high_digs.o bn_s_mp_mul_high_digs.o bn_fast_s_mp_sqr.o bn_s_mp_sqr.o \
 bn_mp_add.o bn_mp_sub.o bn_mp_karatsuba_mul.o bn_mp_mul.o bn_mp_karatsuba_sqr.o \
 bn_mp_sqr.o bn_mp_div.o bn_mp_mod.o bn_mp_add_d.o bn_mp_sub_d.o bn_mp_mul_d.o \
 bn_mp_div_d.o bn_mp_mod_d.o bn_mp_expt_d.o bn_mp_addmod.o bn_mp_submod.o \
 bn_mp_mulmod.o bn_mp_sqrmod.o bn_mp_gcd.o bn_mp_lcm.o bn_fast_mp_invmod.o bn_mp_invmod.o \
 bn_mp_reduce.o bn_mp_montgomery_setup.o bn_fast_mp_montgomery_reduce.o bn_mp_montgomery_reduce.o \
 bn_mp_exptmod_fast.o bn_mp_exptmod.o bn_mp_2expt.o bn_mp_n_root.o bn_mp_jacobi.o bn_reverse.o \
 bn_mp_count_bits.o bn_mp_read_unsigned_bin.o bn_mp_read_signed_bin.o bn_mp_to_unsigned_bin.o \
 bn_mp_to_signed_bin.o bn_mp_unsigned_bin_size.o bn_mp_signed_bin_size.o  \
 bn_mp_xor.o bn_mp_and.o bn_mp_or.o bn_mp_rand.o bn_mp_montgomery_calc_normalization.o \
 bn_mp_prime_is_divisible.o bn_prime_tab.o bn_mp_prime_fermat.o bn_mp_prime_miller_rabin.o \
 bn_mp_prime_is_prime.o bn_mp_prime_next_prime.o bn_mp_dr_reduce.o \
 bn_mp_dr_is_modulus.o bn_mp_dr_setup.o bn_mp_reduce_setup.o \
 bn_mp_toom_mul.o bn_mp_toom_sqr.o bn_mp_div_3.o bn_s_mp_exptmod.o \
 bn_mp_reduce_2k.o bn_mp_reduce_is_2k.o bn_mp_reduce_2k_setup.o \
 bn_mp_radix_smap.o bn_mp_read_radix.o bn_mp_toradix.o bn_mp_radix_size.o \
 bn_mp_fread.o bn_mp_fwrite.o bn_mp_cnt_lsb.o bn_error.o \
 bn_mp_init_multi.o bn_mp_clear_multi.o bn_prime_sizes_tab.o bn_mp_exteuclid.o bn_mp_toradix_n.o \
 bn_mp_prime_random_ex.o bn_mp_get_int.o bn_mp_sqrt.o bn_mp_is_square.o bn_mp_init_set.o \
 bn_mp_init_set_int.o
 libtommath.a:  $(OBJECTS)
 	$(AR) $(ARFLAGS) libtommath.a $(OBJECTS)
 	ranlib libtommath.a
 #make a profiled library (takes a while!!!)
 #
 # This will build the library with profile generation
 # then run the test demo and rebuild the library.
 # 
 # So far I've seen improvements in the MP math
 profiled:
 	make -f makefile.icc CFLAGS="$(CFLAGS) -prof_gen -DTESTING" timing
 	./ltmtest
 	rm -f *.a *.o ltmtest
 	make -f makefile.icc CFLAGS="$(CFLAGS) -prof_use"
 #make a single object profiled library
 #
 # Runs gen.pl (presumably what produces mpi.c -- confirm), compiles mpi.c
 # with profile generation, links and runs the timing demo to collect profile
 # data, then recompiles mpi.c with -prof_use and archives it.
 #
 # The timing binary is built from demo/timing.c, the same source the
 # "timing" target in this makefile uses.
 profiled_single:
 	perl gen.pl
 	$(CC) $(CFLAGS) -prof_gen -DTESTING -c mpi.c -o mpi.o
 	$(CC) $(CFLAGS) -DTESTING -DTIMER demo/timing.c mpi.o -o ltmtest
 	./ltmtest
 	rm -f *.o ltmtest
 	$(CC) $(CFLAGS) -prof_use -ip -DTESTING -c mpi.c -o mpi.o
 	$(AR) $(ARFLAGS) libtommath.a mpi.o
 	ranlib libtommath.a
 install: libtommath.a
 	install -d -g root -o root $(DESTDIR)$(LIBPATH)
 	install -d -g root -o root $(DESTDIR)$(INCPATH)
 	install -g root -o root $(LIBNAME) $(DESTDIR)$(LIBPATH)
 	install -g root -o root $(HEADERS) $(DESTDIR)$(INCPATH)
 test: libtommath.a demo/demo.o
 	$(CC) demo/demo.o libtommath.a -o test
 mtest: test	
 	cd mtest ; $(CC) $(CFLAGS) mtest.c -o mtest
 timing: libtommath.a
 	$(CC) $(CFLAGS) -DTIMER demo/timing.c libtommath.a -o ltmtest
 clean:
 	rm -f *.bat *.pdf *.o *.a *.obj *.lib *.exe *.dll etclib/*.o demo/demo.o test ltmtest mpitest mtest/mtest mtest/mtest.exe \
        *.idx *.toc *.log *.aux *.dvi *.lof *.ind *.ilg *.ps *.log *.s mpi.c *.il etc/*.il *.dyn
 	cd etc ; make clean
 	cd pics ; make clean
--- a/makefile.msvc
+++ b/makefile.msvc
@ -29,7 +29,8 @@ bn_mp_reduce_2k.obj bn_mp_reduce_is_2k.obj bn_mp_reduce_2k_setup.obj \
 bn_mp_radix_smap.obj bn_mp_read_radix.obj bn_mp_toradix.obj bn_mp_radix_size.obj \
 bn_mp_fread.obj bn_mp_fwrite.obj bn_mp_cnt_lsb.obj bn_error.obj \
 bn_mp_init_multi.obj bn_mp_clear_multi.obj bn_prime_sizes_tab.obj bn_mp_exteuclid.obj bn_mp_toradix_n.obj \
-bn_mp_prime_random_ex.obj bn_mp_get_int.obj bn_mp_sqrt.obj bn_mp_is_square.obj
+bn_mp_prime_random_ex.obj bn_mp_get_int.obj bn_mp_sqrt.obj bn_mp_is_square.obj \
 bn_mp_init_set.obj bn_mp_init_set_int.obj
 library: $(OBJECTS)
 	lib /out:tommath.lib $(OBJECTS)
--- a/poster.pdf
+++ b/poster.pdf
--- a/pre_gen/mpi.c
+++ b/pre_gen/mpi.c
@ -452,7 +452,7 @@ fast_s_mp_mul_digs (mp_int * a, mp_int * b, mp_int * c, int digs)
  }
  /* setup dest */
-  olduse = c->used;
+  olduse  = c->used;
  c->used = digs;
  {
@ -779,7 +779,7 @@ mp_2expt (mp_int * a, int b)
  a->used = b / DIGIT_BIT + 1;
  /* put the single bit in its place */
-  a->dp[b / DIGIT_BIT] = 1 << (b % DIGIT_BIT);
+  a->dp[b / DIGIT_BIT] = ((mp_digit)1) << (b % DIGIT_BIT);
  return MP_OKAY;
 }
@ -1142,10 +1142,14 @@ mp_clamp (mp_int * a)
 void
 mp_clear (mp_int * a)
 {
  int i;
  /* only do anything if a hasn't been freed previously */
  if (a->dp != NULL) {
    /* first zero the digits */
-    memset (a->dp, 0, sizeof (mp_digit) * a->used);
+    for (i = 0; i < a->used; i++) {
        a->dp[i] = 0;
    }
    /* free ram */
    XFREE(a->dp);
@ -1677,7 +1681,7 @@ int mp_div (mp_int * a, mp_int * b, mp_int * c, mp_int * d)
   */
  /* get sign before writing to c */
-  x.sign = a->sign;
+  x.sign = x.used == 0 ? MP_ZPOS : a->sign;
  if (c != NULL) {
    mp_clamp (&q);
@ -3083,15 +3087,22 @@ int mp_grow (mp_int * a, int size)
 */
 #include <tommath.h>
-/* init a new bigint */
+/* init a new mp_int */
 int mp_init (mp_int * a)
 {
  int i;
  /* allocate memory required and clear it */
-  a->dp = OPT_CAST(mp_digit) XCALLOC (sizeof (mp_digit), MP_PREC);
+  a->dp = OPT_CAST(mp_digit) XMALLOC (sizeof (mp_digit) * MP_PREC);
  if (a->dp == NULL) {
    return MP_MEM;
  }
  /* set the digits to zero */
  for (i = 0; i < MP_PREC; i++) {
      a->dp[i] = 0;
  }
  /* set the used to zero, allocated digits to the default precision
   * and sign to positive */
  a->used  = 0;
@ -3753,9 +3764,6 @@ int mp_karatsuba_mul (mp_int * a, mp_int * b, mp_int * c)
    goto X0Y0;
  /* now shift the digits */
  x0.sign = x1.sign = a->sign;
  y0.sign = y1.sign = b->sign;
  x0.used = y0.used = B;
  x1.used = a->used - B;
  y1.used = b->used - B;
@ -4484,7 +4492,7 @@ int mp_mul (mp_int * a, mp_int * b, mp_int * c)
      res = s_mp_mul (a, b, c);
    }
  }
-  c->sign = neg;
+  c->sign = (c->used > 0) ? neg : MP_ZPOS;
  return res;
 }
@ -6090,7 +6098,8 @@ mp_reduce_2k_setup(mp_int *a, mp_digit *d)
 /* determines if mp_reduce_2k can be used */
 int mp_reduce_is_2k(mp_int *a)
 {
-   int ix, iy, iz, iw;
+   int ix, iy, iw;
   mp_digit iz;
   if (a->used == 0) {
      return 0;
@ -6107,7 +6116,7 @@ int mp_reduce_is_2k(mp_int *a)
             return 0;
          }
          iz <<= 1;
-          if (iz > (int)MP_MASK) {
+          if (iz > (mp_digit)MP_MASK) {
             ++iw;
             iz = 1;
          }
@ -8396,14 +8405,16 @@ s_mp_sub (mp_int * a, mp_int * b, mp_int * c)
 CPU                    /Compiler     /MUL CUTOFF/SQR CUTOFF
 -------------------------------------------------------------
- Intel P4               /GCC v3.2     /        70/       108
+ Intel P4 Northwood     /GCC v3.3.3   /        59/        81/profiled build
- AMD Athlon XP          /GCC v3.2     /       109/       127
+ Intel P4 Northwood     /GCC v3.3.3   /        59/        80/profiled_single build
-
+ Intel P4 Northwood     /ICC v8.0     /        57/        70/profiled build
 Intel P4 Northwood     /ICC v8.0     /        54/        76/profiled_single build
 AMD Athlon XP          /GCC v3.2     /       109/       127/
 */
-/* configured for a AMD XP Thoroughbred core with etc/tune.c */
+int     KARATSUBA_MUL_CUTOFF = 57,      /* Min. number of digits before Karatsuba multiplication is used. */
-int     KARATSUBA_MUL_CUTOFF = 109,      /* Min. number of digits before Karatsuba multiplication is used. */
+        KARATSUBA_SQR_CUTOFF = 70,      /* Min. number of digits before Karatsuba squaring is used. */
        KARATSUBA_SQR_CUTOFF = 127,      /* Min. number of digits before Karatsuba squaring is used. */
        TOOM_MUL_CUTOFF      = 350,      /* no optimal values of these are known yet so set em high */
        TOOM_SQR_CUTOFF      = 400; 
--- a/pretty.build
+++ b/pretty.build
@ -0,0 +1,66 @@
 #!/bin/perl -w
 #
 # Cute little build driver written in Perl
 # Total waste of development time...
 #
 # This will build all the object files and then the archive .a file
 # requires GCC, GNU make and a sense of humour.
 #
 # For every *.c file in the current directory it runs "make <file>.o"
 # (unless that .o can already be opened) while drawing a one-line progress
 # display, then runs a plain "make" at the end.
 #
 # Tom St Denis
 use strict;
 # $count     - number of *.c files found in the current directory
 # $starttime - time the script started, used for the rate/ETA estimate
 # $rate      - files processed per second (0 until a second has elapsed)
 my $count = 0;
 my $starttime = time;
 my $rate  = 0;
 print "Scanning for source files...\n";
 # first pass: only count the source files so a percentage can be shown
 foreach my $filename (glob "*.c") {
        ++$count;
 }
 print "Source files to build: $count\nBuilding...\n";
 # $i          - index of the file currently being processed (1-based)
 # $lines      - total source lines of the files that were actually built
 # $filesbuilt - number of files for which make was invoked
 my $i = 0;
 my $lines = 0;
 my $filesbuilt = 0;
 # second pass: build each object file and redraw the progress line
 foreach my $filename (glob "*.c") {
        # $i is incremented here, as part of computing the percentage
        printf("Building %3.2f%%, ", (++$i/$count)*100.0);
        # spinner character, cycles every four files
        if ($i % 4 == 0) { print "/, "; }
        if ($i % 4 == 1) { print "-, "; }
        if ($i % 4 == 2) { print "\\, "; }
        if ($i % 4 == 3) { print "|, "; }
        # estimated time left, only once a rate has been measured
        if ($rate > 0) {
            my $tleft = ($count - $i) / $rate;
            my $tsec  = $tleft%60;
            my $tmin  = ($tleft/60)%60;
            my $thour = ($tleft/3600)%60;
            printf("%2d:%02d:%02d left, ", $thour, $tmin, $tsec);
        }
        # 30 character wide progress bar; the trailing \r returns to the start
        # of the line so the next iteration overwrites it
        my $cnt = ($i/$count)*30.0;
        my $x   = 0;
        print "[";
        for (; $x < $cnt; $x++) { print "#"; }
        for (; $x < 30; $x++)   { print " "; }
        print "]\r";
        # derive the object file name: the first ".c" in the name becomes ".o"
        # (the /e flag evaluates the replacement, which is the string ".o")
        my $tmp = $filename;
        $tmp =~ s/\.c/".o"/ge;
        # if the object file can be opened it already exists, so skip it
        if (open(SRC, "<$tmp")) {
           close SRC;
        } else {
           # system() returns 0 on success, hence the negation; make's output is discarded
           !system("make $tmp > /dev/null 2>/dev/null") or die "\nERROR: Failed to make $tmp!!!\n";
           # count the lines of the source file just built for the final summary
           open( SRC, "<$filename" ) or die "Couldn't open $filename for reading: $!";
           ++$lines while (<SRC>);
           close SRC or die "Error closing $filename after reading: $!";
           ++$filesbuilt;
        }      
        # update timer: the rate is files processed (built or skipped) per elapsed
        # second, and is only recomputed once at least one second has passed
        if (time != $starttime) {
           my $delay = time - $starttime;
           $rate = $i/$delay;
        }
 }
 # finish building the library: print the summary, then run a plain "make"
 # (output discarded) and abort if it fails
 printf("\nFinished building source (%d seconds, %3.2f files per second).\n", time - $starttime, $rate);
 print "Compiled approximately $filesbuilt files and $lines lines of code.\n";
 print "Doing final make (building archive...)\n";
 !system("make > /dev/null 2>/dev/null") or die "\nERROR: Failed to perform last make command!!!\n";
 print "done.\n";
--- a/tommath.pdf
+++ b/tommath.pdf
--- a/tommath.src
+++ b/tommath.src
@ -258,7 +258,7 @@ floating point is meant to be implemented in hardware the precision of the manti
 a mantissa of much larger precision than hardware alone can efficiently support.  This approach could be useful where 
 scientific applications must minimize the total output error over long calculations.
-Another use for large integers is within arithmetic on polynomials of large characteristic (i.e. $GF(p)[x]$ for large $p$).
+Yet another use for large integers is within arithmetic on polynomials of large characteristic (i.e. $GF(p)[x]$ for large $p$).
 In fact the library discussed within this text has already been used to form a polynomial basis library\footnote{See \url{http://poly.libtomcrypt.org} for more details.}.
 \subsection{Benefits of Multiple Precision Arithmetic}
@ -316,7 +316,7 @@ the reader how the algorithms fit together as well as where to start on various
 \section{Discussion and Notation}
 \subsection{Notation}
-A multiple precision integer of $n$-digits shall be denoted as $x = (x_{n-1} ... x_1 x_0)_{ \beta }$ and represent
+A multiple precision integer of $n$-digits shall be denoted as $x = (x_{n-1}, \ldots, x_1, x_0)_{ \beta }$ and represent
 the integer $x \equiv \sum_{i=0}^{n-1} x_i\beta^i$.  The elements of the array $x$ are said to be the radix $\beta$ digits 
 of the integer.  For example, $x = (1,2,3)_{10}$ would represent the integer 
 $1\cdot 10^2 + 2\cdot10^1 + 3\cdot10^0 = 123$.  
@ -339,12 +339,11 @@ algorithms will be used to establish the relevant theory which will subsequently
 precision algorithm to solve the same problem.  
 \subsection{Precision Notation}
-For the purposes of this text a single precision variable must be able to represent integers in the range 
+The variable $\beta$ represents the radix of a single digit of a multiple precision integer and 
-$0 \le x < q \beta$ while a double precision variable must be able to represent integers in the range 
+must be of the form $q^p$ for $q, p \in \Z^+$.  A single precision variable must be able to represent integers in 
-$0 \le x < q \beta^2$.  The variable $\beta$ represents the radix of a single digit of a multiple precision integer and 
+the range $0 \le x < q \beta$ while a double precision variable must be able to represent integers in the range 
-must be of the form $q^p$ for $q, p \in \Z^+$.  The extra radix-$q$ factor allows additions and subtractions to proceed 
+$0 \le x < q \beta^2$.  The extra radix-$q$ factor allows additions and subtractions to proceed without truncation of the 
-without truncation of the carry.  Since all modern computers are binary, it is assumed that $q$ is two, for all intents 
+carry.  Since all modern computers are binary, it is assumed that $q$ is two.
 and purposes.
 \index{mp\_digit} \index{mp\_word}
 Within the source code that will be presented for each algorithm, the data type \textbf{mp\_digit} will represent 
@ -376,7 +375,7 @@ the $/$ division symbol is used the intention is to perform an integer division
 $5/2 = 2$ which will often be written as $\lfloor 5/2 \rfloor = 2$ for clarity.  When an expression is written as a 
 fraction a real value division is implied, for example ${5 \over 2} = 2.5$.  
-The norm of a multiple precision integer, for example, $\vert \vert x \vert \vert$ will be used to represent the number of digits in the representation
+The norm of a multiple precision integer, for example $\vert \vert x \vert \vert$, will be used to represent the number of digits in the representation
 of the integer.  For example, $\vert \vert 123 \vert \vert = 3$ and $\vert \vert 79452 \vert \vert = 5$.  
 \subsection{Work Effort}
@ -569,7 +568,7 @@ By building outwards from a base foundation instead of using a parallel design m
 highly modular.  Being highly modular is a desirable property of any project as it often means the resulting product
 has a small footprint and updates are easy to perform.  
-Usually when I start a project I will begin with the header file.  I define the data types I think I will need and 
+Usually when I start a project I will begin with the header files.  I define the data types I think I will need and 
 prototype the initial functions that are not dependent on other functions (within the library).  After I 
 implement these base functions I prototype more dependent functions and implement them.   The process repeats until
 I implement all of the functions I require.  For example, in the case of LibTomMath I implemented functions such as 
@ -619,14 +618,26 @@ any such data type but it does provide for making composite data types known as
 used within LibTomMath.
 \index{mp\_int}
-\begin{verbatim}
+\begin{figure}[here]
-typedef struct  {
+\begin{center}
-    int used, alloc, sign;
+\begin{small}
-    mp_digit *dp;
+%\begin{verbatim}
-} mp_int;
+\begin{tabular}{|l|}
-\end{verbatim}
+\hline
 typedef struct \{ \\
 \hspace{3mm}int used, alloc, sign;\\
 \hspace{3mm}mp\_digit *dp;\\
 \} \textbf{mp\_int}; \\
 \hline
 \end{tabular}
 %\end{verbatim}
 \end{small}
 \caption{The mp\_int Structure}
 \label{fig:mpint}
 \end{center}
 \end{figure}
-The mp\_int structure can be broken down as follows.
+The mp\_int structure (fig. \ref{fig:mpint}) can be broken down as follows.
 \begin{enumerate}
 \item The \textbf{used} parameter denotes how many digits of the array \textbf{dp} contain the digits used to represent
@ -701,9 +712,10 @@ fault by dereferencing memory not owned by the application.
 In the case of LibTomMath the only errors that are checked for are related to inappropriate inputs (division by zero for 
 instance) and memory allocation errors.  It will not check that the mp\_int passed to any function is valid nor 
 will it check pointers for validity.  Any function that can cause a runtime error will return an error code as an 
-\textbf{int} data type with one of the following values.
+\textbf{int} data type with one of the following values (fig. \ref{fig:errcodes}).
 \index{MP\_OKAY} \index{MP\_VAL} \index{MP\_MEM}
 \begin{figure}[here]
 \begin{center}
 \begin{tabular}{|l|l|}
 \hline \textbf{Value} & \textbf{Meaning} \\
@ -713,6 +725,9 @@ will it check pointers for validity.  Any function that can cause a runtime erro
 \hline
 \end{tabular}
 \end{center}
 \caption{LibTomMath Error Codes}
 \label{fig:errcodes}
 \end{figure}
 When an error is detected within a function it should free any memory it allocated, often during the initialization of
 temporary mp\_ints, and return as soon as possible.  The goal is to leave the system in the same state it was when the 
@ -748,6 +763,7 @@ to zero.  The \textbf{used} count set to zero and \textbf{sign} set to \textbf{M
 An mp\_int is said to be initialized if it is set to a valid, preferably default, state such that all of the members of the
 structure are set to valid values.  The mp\_init algorithm will perform such an action.
 \index{mp\_init}
 \begin{figure}[here]
 \begin{center}
 \begin{tabular}{l}
@ -770,17 +786,23 @@ structure are set to valid values.  The mp\_init algorithm will perform such an
 \end{figure}
 \textbf{Algorithm mp\_init.}
-The \textbf{MP\_PREC} name represents a constant\footnote{Defined in the ``tommath.h'' header file within LibTomMath.} 
+The purpose of this function is to initialize an mp\_int structure so that the rest of the library can properly
-used to dictate the minimum precision of allocated mp\_int integers.  Ideally, it is at least equal to $32$ since for most
+manipulate it.  It is assumed that the input may not have had any of its members previously initialized which is certainly
-purposes that will be more than enough.
+a valid assumption if the input resides on the stack.  
-Memory for the default number of digits is allocated first.  If the allocation fails the algorithm returns immediately
+Before any of the members such as \textbf{sign}, \textbf{used} or \textbf{alloc} are initialized the memory for
-with the \textbf{MP\_MEM} error code.  If the allocation succeeds the remaining members of the mp\_int structure
+the digits is allocated.  If this fails the function returns before setting any of the other members.  The \textbf{MP\_PREC} 
-must be initialized to reflect the default initial state.
+name represents a constant\footnote{Defined in the ``tommath.h'' header file within LibTomMath.} 
 used to dictate the minimum precision of newly initialized mp\_int integers.  Ideally, it is at least equal to the smallest
 precision number you'll be working with.
-The allocated digits are all set to zero (step three) to ensure they are in a known state.  The \textbf{sign}, \textbf{used}
+Allocating a block of digits at first instead of a single digit has the benefit of lowering the number of usually slow
-and \textbf{alloc} are subsequently initialized to represent the zero integer.  By step seven the algorithm returns a success 
+heap operations later functions will have to perform in the future.  If \textbf{MP\_PREC} is set correctly the slack 
-code and the mp\_int $a$ has been successfully initialized to a valid state representing the integer zero.  
+memory and the number of heap operations will be trivial.
 Once the allocation has been made the digits have to be set to zero as well as the \textbf{used}, \textbf{sign} and
 \textbf{alloc} members initialized.  This ensures that the mp\_int will always represent the default state of zero regardless
 of the original condition of the input.
 \textbf{Remark.}
 This function introduces the idiosyncrasy that all iterative loops, commonly initiated with the ``for'' keyword, iterate incrementally
@ -796,19 +818,21 @@ One immediate observation of this initializtion function is that it does not ret
 is assumed that the caller has already allocated memory for the mp\_int structure, typically on the application stack.  The 
 call to mp\_init() is used only to initialize the members of the structure to a known default state.  
-Before any of the other members of the structure are initialized memory from the application heap is allocated with
+Here we see (line @23,XMALLOC@) the memory allocation is performed first.  This allows us to exit cleanly and quickly
-the calloc() function (line @22,calloc@).  The size of the allocated memory is large enough to hold \textbf{MP\_PREC} 
+if there is an error.  If the allocation fails the routine will return \textbf{MP\_MEM} to the caller to indicate there
-mp\_digit variables.  The calloc() function is used instead\footnote{calloc() will allocate memory in the same
+was a memory error.  The function XMALLOC is what actually allocates the memory.  Technically XMALLOC is not a function
-manner as malloc() except that it also sets the contents to zero upon successfully allocating the memory.} of malloc() 
+but a macro defined in ``tommath.h''.  By default, XMALLOC will evaluate to malloc() which is the C library's built--in
-since digits have to be set to zero for the function to finish correctly.  The \textbf{OPT\_CAST} token is a macro 
+memory allocation routine.
 definition which will turn into a cast from void * to mp\_digit * for C++ compilers.  It is not required for C compilers.
-After the memory has been successfully allocated the remainder of the members are initialized 
+In order to assure the mp\_int is in a known state the digits must be set to zero.  On most platforms this could have been
 accomplished by using calloc() instead of malloc().  However,  to correctly initialize an integer type to a given value in a 
 portable fashion you have to actually assign the value.  The for loop (line @28,for@) performs this required
 operation.
 After the memory has been successfully initialized the remainder of the members are initialized 
 (lines @29,used@ through @31,sign@) to their respective default states.  At this point the algorithm has succeeded and
-a success code is returned to the calling function.
+a success code is returned to the calling function.  If this function returns \textbf{MP\_OKAY} it is safe to assume the 
-
+mp\_int structure has been properly initialized and is safe to use with other functions within the library.  
 If this function returns \textbf{MP\_OKAY} it is safe to assume the mp\_int structure has been properly initialized and
 is safe to use with other functions within the library.  
 \subsection{Clearing an mp\_int}
 When an mp\_int is no longer required by the application, the memory that has been allocated for its digits must be 
@ -819,7 +843,7 @@ returned to the application's memory pool with the mp\_clear algorithm.
 \begin{tabular}{l}
 \hline Algorithm \textbf{mp\_clear}. \\
 \textbf{Input}.   An mp\_int $a$ \\
-\textbf{Output}.  The memory for $a$ is freed for reuse.  \\
+\textbf{Output}.  The memory for $a$ shall be deallocated.  \\
 \hline \\
 1.  If $a$ has been previously freed then return(\textit{MP\_OKAY}). \\
 2.  for $n$ from 0 to $a.used - 1$ do \\
@ -836,32 +860,31 @@ returned to the application's memory pool with the mp\_clear algorithm.
 \end{figure}
 \textbf{Algorithm mp\_clear.}
-This algorithm releases the memory allocated for an mp\_int back into the memory pool for reuse.  It is designed
+This algorithm accomplishes two goals.  First, it clears the digits and the other mp\_int members.  This ensures that 
-such that a given mp\_int structure can be cleared multiple times between initializations without attempting to 
+if a developer accidentally re-uses a cleared structure it is less likely to cause problems.  The second goal
-free the memory twice\footnote{In ISO C for example, calling free() twice on the same memory block causes undefinied
+is to free the allocated memory.
 behaviour.}.  
-The first step determines if the mp\_int structure has been marked as free already.  If it has, the algorithm returns
+The logic behind the algorithm is extended by marking cleared mp\_int structures so that subsequent calls to this
-success immediately as no further actions are required.  Otherwise, the algorithm will proceed to put the structure 
+algorithm will not try to free the memory multiple times.  Cleared mp\_ints are detectable by having a pre-defined invalid 
-in a known empty and otherwise invalid state.  First the digits of the mp\_int are set to zero.  The memory that has been allocated for the 
+digit pointer \textbf{dp} setting.
 digits is then freed.  The \textbf{used} and \textbf{alloc} counts are both set to zero and the \textbf{sign} set to 
 \textbf{MP\_ZPOS}.  This known fixed state for cleared mp\_int structures will make debugging easier for the end 
 developer.  That is, if they spot (via their debugger) an mp\_int they are using that is in this state it will be 
 obvious that they erroneously and prematurely cleared the mp\_int structure.
-Note that once an mp\_int has been cleared the mp\_int structure is no longer in a valid state for any other algorithm
+Once an mp\_int has been cleared the mp\_int structure is no longer in a valid state for any other algorithm
 with the exception of algorithms mp\_init, mp\_init\_copy, mp\_init\_size and mp\_clear.
 EXAM,bn_mp_clear.c
-The ``if'' statement (line @21,a->dp != NULL@) prevents the heap from being corrupted if a user double-frees an 
+The algorithm only operates on the mp\_int if it hasn't been previously cleared.  The if statement (line @23,a->dp != NULL@)
-mp\_int.  This is because once the memory is freed the pointer is set to \textbf{NULL} (line @30,NULL@).  
+checks to see if the \textbf{dp} member is not \textbf{NULL}.  If the mp\_int is a valid mp\_int then \textbf{dp} cannot be
 \textbf{NULL} in which case the if statement will evaluate to true.
-Without the check, code that accidentally calls mp\_clear twice for a given mp\_int structure would try to free the memory 
+The digits of the mp\_int are cleared by the for loop (line @25,for@) which assigns a zero to every digit.  Similar to mp\_init()
-allocated for the digits twice.  This may cause some C libraries to signal a fault.  By setting the pointer to 
+the digits are assigned zero instead of using block memory operations (such as memset()) since this is more portable.  
-\textbf{NULL} it helps debug code that may inadvertently free the mp\_int before it is truly not needed, because attempts 
+
-to reference digits should fail immediately. The allocated digits are set to zero before being freed (line @24,memset@).  
+The digits are deallocated off the heap via the XFREE macro.  Similar to XMALLOC the XFREE macro actually evaluates to
-This is ideal for cryptographic situations where the integer that the mp\_int represents might need to be kept a secret.
+a standard C library function, in this case the free() function.  Since free() only deallocates the memory the pointer
 still has to be reset to \textbf{NULL} manually (line @33,NULL@).  
 Now that the digits have been cleared and deallocated the other members are set to their final values (lines @34,= 0@ and @35,ZPOS@).
 \section{Maintenance Algorithms}
@ -889,7 +912,7 @@ must be re-sized appropriately to accomodate the result.  The mp\_grow algorithm
 1.  if $a.alloc \ge b$ then return(\textit{MP\_OKAY}) \\
 2.  $u \leftarrow b\mbox{ (mod }MP\_PREC\mbox{)}$ \\
 3.  $v \leftarrow b + 2 \cdot MP\_PREC - u$ \\
-4.  Re-Allocate the array of digits $a$ to size $v$ \\
+4.  Re-allocate the array of digits $a$ to size $v$ \\
 5.  If the allocation failed then return(\textit{MP\_MEM}). \\
 6.  for n from a.alloc to $v - 1$ do  \\
 \hspace{+3mm}6.1  $a_n \leftarrow 0$ \\
@ -914,15 +937,19 @@ assumed to contain undefined values they are initially set to zero.
 EXAM,bn_mp_grow.c
-The first step is to see if we actually need to perform a re-allocation at all (line @24,a->alloc < size@).  If a reallocation
+A quick optimization is to first determine if a memory re-allocation is required at all.  The if statement (line @23,if@) checks
-must occur the digit count is padded upwards to help prevent many trivial reallocations (line @28,size@).  Next the reallocation is performed
+if the \textbf{alloc} member of the mp\_int is smaller than the requested digit count.  If the count is not larger than \textbf{alloc}
-and the return of realloc() is stored in a temporary pointer named $tmp$ (line @36,realloc@).  The return is stored in a temporary
+the function skips the re-allocation part thus saving time.
 instead of $a.dp$ to prevent the code from losing the original pointer in case the reallocation fails.  Had the return been stored 
 in $a.dp$ instead there would be no way to reclaim the heap originally used.
-If the reallocation fails the function will return \textbf{MP\_MEM} (line @39,return@), otherwise, the value of $tmp$ is assigned
+When a re-allocation is performed it is turned into an optimal request to save time in the future.  The requested digit count is
-to the pointer $a.dp$ and the function continues.  A simple for loop from line @48,a->alloc@ to line @50,}@ will zero all digits 
+padded upwards to the 2nd multiple of \textbf{MP\_PREC} larger than \textbf{alloc} (line @25, size@).  The XREALLOC function is used
-that were above the old \textbf{alloc} limit to make sure the integer is in a known state.
+to re-allocate the memory.  As per the other functions XREALLOC is actually a macro which evaluates to realloc by default.  The realloc
 function leaves the base of the allocation intact which means the first \textbf{alloc} digits of the mp\_int are the same as before
 the re-allocation.  All that is left is to clear the newly allocated digits and return.
 Note that the re-allocation result is actually stored in a temporary pointer $tmp$.  This is to allow this function to return
 an error with a valid pointer.  Earlier releases of the library stored the result of XREALLOC into the mp\_int $a$.  That would
 result in a memory leak if XREALLOC ever failed.  
 \subsection{Initializing Variable Precision mp\_ints}
 Occasionally the number of digits required will be known in advance of an initialization, based on, for example, the size 
@ -970,7 +997,7 @@ The number of digits $b$ requested is padded (line @22,MP_PREC@) by first augmen
 mp\_int is placed in a default state representing the integer zero.  Otherwise, the error code \textbf{MP\_MEM} will be 
 returned (line @27,return@).  
-The digits are allocated and set to zero at the same time with the calloc() function (line @25,calloc@).  The 
+The digits are allocated and set to zero at the same time with the calloc() function (line @25,XCALLOC@).  The 
 \textbf{used} count is set to zero, the \textbf{alloc} count set to the padded digit count and the \textbf{sign} flag set 
 to \textbf{MP\_ZPOS} to achieve a default valid mp\_int state (lines @29,used@, @30,alloc@ and @31,sign@).  If the function 
 returns successfully then it is correct to assume that the mp\_int structure is in a valid state for the remainder of the 
--- a/tommath.tex
+++ b/tommath.tex
-17364
-8643
-8867
-6228
-4737
-2259
-2899
-1497
-1238
-1010
-870
-1265
-1102
-981
-539
-484
-722
-392
-604
-551
-511
-469
-263
-247
-227
-354
-336
-312
-296
-166
-155
-248
-664
-256
-117
-17
-9
-5
-2
-1088
-460
-240
-92
-43
-15
-6
-16248
-8192
-5320
-3560
-2728
-2064
-1704
-2176
-1184
-976
-1280
-1176
-624
-912
-504
-452
-658
-608
-336
-312
-288
-264
-408
-376
-354
-198
-307
-173
-162
-256
-145
-226
-322904
-151592
-90472
-59984
-42624
-31872
-24704
-19704
-16096
-13376
-11272
-9616
-8360
-7304
-1664
-1472
-1328
-322872
-151688
-90480
-59984
-42656
-32144
-25840
-21328
-17856
-14928
-12856
-11256
-9880
-8984
-7928
-7200
-6576
-415472
-223736
-141232
-97624
-71400
-54800
-16904
-13528
-10968
-9128
-7784
-6672
-5760
-5056
-4440
-3952
-3512
-420464
-224800
-142808
-97704
-71416
-54504
-38320
-32360
-27576
-23840
-20688
-18264
-16176
-14440
-11688
-10752
-9936
-9728504
-8573648
-7488096
-6714064
-5950472
-5457400
-5038896
-4683632
-4384656
-4105976
-3871608
-3650680
-3463552
-3290016
-3135272
-2993848
-374785
+45612
-242737
+68010
-176704
+94894
-134341
+126514
-105537
+163014
-85089
+203564
-70051
+249156
-58671
+299226
-49851
+354138
-42881
+413022
-37288
+477406
-32697
+545876
-28915
+619044
-25759
+696754
-23088
+779174
-20800
+866216
-18827
+958100
-17164
+1055898
-15689
+1162294
-14397
+1267654
-13260
+1377572
-12249
+1503736
-11346
+1622310
-10537
+1746624
-9812
+1875390
-9161
+2009086
-8572
+2145990
-8038
+2289044
-2915
+2891644
-2744
+3064792
-2587
+3249780
-2444
+3455868
-2311
+3644238