diff --git a/b.bat b/b.bat
index 32dee86..606db8a 100644
--- a/b.bat
+++ b/b.bat
@@ -1,3 +1,2 @@
-nasm -f coff timer.asm
-gcc -Wall -W -O3 -fomit-frame-pointer -funroll-loops -DTIMER_X86 demo.c bn.c timer.o -o ltmdemo
-rem gcc -I./mtest/ -DU_MPI -Wall -W -O3 -fomit-frame-pointer -funroll-loops -DTIMER_X86 demo.c mtest/mpi.c timer.o -o mpidemo
+nasm -f elf timer.asm
+gcc -Wall -W -O3 -fomit-frame-pointer -funroll-loops -DTIMER_X86 demo.c bn.c timer.o -o ltmdemo
\ No newline at end of file
diff --git a/bn.c b/bn.c
index 040ff1c..8175c33 100644
--- a/bn.c
+++ b/bn.c
@@ -99,7 +99,8 @@ void dump_timings(void)
    memset(&functime, 0, sizeof(functime));
    total = 0;
    for (x = 0; x < _itims; x++) {
-       total += timings[x].tot;
+       if (strcmp(timings[x].func, "_verify")) 
+          total += timings[x].tot;
        
        /* try to find this entry */
        for (y = 0; functime[y].func != NULL; y++) {
@@ -1053,7 +1054,7 @@ static int fast_s_mp_mul_digs(mp_int *a, mp_int *b, mp_int *c, int digs)
    c->dp[digs-1]   = (mp_digit)(W[digs-1] & ((mp_word)MP_MASK));
    
    /* clear unused */
-   for (ix = c->used; ix < olduse; ix++) {
+   for (; ix < olduse; ix++) {
       c->dp[ix] = 0;
    }
   
@@ -1194,13 +1195,13 @@ static int fast_s_mp_mul_high_digs(mp_int *a, mp_int *b, mp_int *c, int digs)
    c->used = newused;
    
    /* now convert the array W downto what we need */
-   for (ix = digs+1; ix < (pa+pb+1); ix++) {
+   for (ix = digs+1; ix < newused; ix++) {
        W[ix]       += (W[ix-1] >> ((mp_word)DIGIT_BIT));
        c->dp[ix-1] = (mp_digit)(W[ix-1] & ((mp_word)MP_MASK));
    }
    c->dp[(pa+pb+1)-1] = (mp_digit)(W[(pa+pb+1)-1] & ((mp_word)MP_MASK));
    
-   for (ix = c->used; ix < oldused; ix++) {
+   for (; ix < oldused; ix++) {
       c->dp[ix] = 0;
    }
    mp_clamp(c);
@@ -1339,17 +1340,17 @@ static int fast_s_mp_sqr(mp_int *a, mp_int *b)
    b->used = newused;
    
    /* now compute digits */
-   for (ix = 1; ix < (pa+pa+1); ix++) {
+   for (ix = 1; ix < newused; ix++) {
        /* double/add next digit */
        W[ix]       += W[ix] + W2[ix];
 
        W[ix]       = W[ix] + (W[ix-1] >> ((mp_word)DIGIT_BIT));
        b->dp[ix-1] = (mp_digit)(W[ix-1] & ((mp_word)MP_MASK));
    }
-   b->dp[(pa+pa+1)-1] = (mp_digit)(W[(pa+pa+1)-1] & ((mp_word)MP_MASK));
+   b->dp[(newused)-1] = (mp_digit)(W[(newused)-1] & ((mp_word)MP_MASK));
    
    /* clear high */
-   for (ix = b->used; ix < olduse; ix++) {
+   for (; ix < olduse; ix++) {
        b->dp[ix] = 0;
    }
    
@@ -1580,9 +1581,7 @@ static int mp_karatsuba_mul(mp_int *a, mp_int *b, mp_int *c)
    }
    
    mp_clamp(&x0);
-   mp_clamp(&x1);
    mp_clamp(&y0);
-   mp_clamp(&y1);
    
    /* now calc the products x0y0 and x1y1 */
    if (mp_mul(&x0, &y0, &x0y0) != MP_OKAY) goto X1Y1;             /* x0y0 = x0*y0 */
@@ -1679,15 +1678,14 @@ static int mp_karatsuba_sqr(mp_int *a, mp_int *b)
    x1.used = a->used - B;
    
    mp_clamp(&x0);
-   mp_clamp(&x1);
    
    /* now calc the products x0*x0 and x1*x1 */
-   if (mp_sqr(&x0, &x0x0) != MP_OKAY) goto X1X1;                /* x0x0 = x0*x0 */
-   if (mp_sqr(&x1, &x1x1) != MP_OKAY) goto X1X1;                /* x1x1 = x1*x1 */
+   if (mp_sqr(&x0, &x0x0) != MP_OKAY) goto X1X1;                  /* x0x0 = x0*x0 */
+   if (mp_sqr(&x1, &x1x1) != MP_OKAY) goto X1X1;                  /* x1x1 = x1*x1 */
 
    /* now calc x1-x0 and y1-y0 */
    if (mp_sub(&x1, &x0, &t1) != MP_OKAY) goto X1X1;               /* t1 = x1 - x0 */
-   if (mp_sqr(&t1, &t1) != MP_OKAY) goto X1X1;                  /* t1 = (x1 - x0) * (y1 - y0) */
+   if (mp_sqr(&t1, &t1) != MP_OKAY) goto X1X1;                    /* t1 = (x1 - x0) * (y1 - y0) */
 
    /* add x0y0 */
    if (mp_add(&x0x0, &x1x1, &t2) != MP_OKAY) goto X1X1;           /* t2 = x0y0 + x1y1 */
@@ -2760,8 +2758,7 @@ int mp_reduce_setup(mp_int *a, mp_int *b)
    VERIFY(a);
    VERIFY(b);
    
-   mp_set(a, 1);
-   if ((res = mp_lshd(a, b->used * 2)) != MP_OKAY) {
+   if ((res = mp_2expt(a, b->used * 2 * DIGIT_BIT)) != MP_OKAY) {
       DECFUNC();
       return res;
    }
@@ -2876,7 +2873,6 @@ __T:  mp_clear(&t);
    return res;
 }   
 
-
 /* computes xR^-1 == x (mod N) via Montgomery Reduction (comba) */
 static int fast_mp_montgomery_reduce(mp_int *a, mp_int *m, mp_digit mp)
 {
@@ -2884,29 +2880,53 @@ static int fast_mp_montgomery_reduce(mp_int *a, mp_int *m, mp_digit mp)
    mp_digit ui;
    mp_word  W[512];
    
+   REGFUNC("fast_mp_montgomery_reduce");
+   VERIFY(a);
+   VERIFY(m);
+   
    /* get old used count */
    olduse = a->used;
    
    /* grow a as required */
-   if (a->alloc < m->used*2+1) {
-      if ((res = mp_grow(a, m->used*2+1)) != MP_OKAY) {
+   if (a->alloc < m->used+1) {
+      if ((res = mp_grow(a, m->used+1)) != MP_OKAY) {
+         DECFUNC();
          return res;
       }
    }
    
-   /* copy and clear */
+   /* copy the digits of a */
    for (ix = 0; ix < a->used; ix++) {
        W[ix] = a->dp[ix];
    }
+   
+   /* zero the high words */
    for (; ix < m->used * 2 + 1; ix++) {
        W[ix] = 0;
    }
-   
+     
    for (ix = 0; ix < m->used; ix++) {
-       /* ui = ai * m' mod b */
+       /* ui = ai * m' mod b 
+        *
+        * We avoid a double precision multiplication (which isn't required)
+        * by casting the value down to a mp_digit.  Note this requires that W[ix-1] have
+        * the carry cleared (see after the inner loop)
+        */
        ui = (((mp_digit)(W[ix] & MP_MASK)) * mp) & MP_MASK;
        
-       /* a = a + ui * m * b^i */
+       /* a = a + ui * m * b^i 
+        *
+        * This is computed in place and on the fly.  The multiplication 
+        * by b^i is handled by offsetting which columns the results 
+        * are added to.
+        *
+        * Note the comba method normally doesn't handle carries in the inner loop.
+        * In this case we fix the carry from the previous column since the Montgomery
+        * reduction requires digits of the result (so far) [see above] to work.  This is 
+        * handled by fixing up one carry after the inner loop.  The carry fixups are done
+        * in order so after these loops the first m->used words of W[] have the carries
+        * fixed
+        */       
        { 
           register int      iy;
           register mp_digit *tmpx;
@@ -2916,32 +2936,36 @@ static int fast_mp_montgomery_reduce(mp_int *a, mp_int *m, mp_digit mp)
           tmpx = m->dp;
           _W   = W + ix;
           
+          /* inner loop */
           for (iy = 0; iy < m->used; iy++) {
               *_W++        += ((mp_word)ui) * ((mp_word)*tmpx++);
           }
-          
-          /* now fix carry for W[ix+1] */
-          W[ix+1] += W[ix] >> ((mp_word)DIGIT_BIT);
-          W[ix]   &= ((mp_word)MP_MASK);
        }
+
+       /* now fix carry for next digit, W[ix+1] */
+       W[ix+1] += W[ix] >> ((mp_word)DIGIT_BIT);
    }
    
    /* nox fix rest of carries */
-   for (; ix <= m->used * 2 + 1; ix++) {
+   for (++ix; ix <= m->used * 2 + 1; ix++) {
        W[ix]   += (W[ix-1] >> ((mp_word)DIGIT_BIT));
-       W[ix-1] &= ((mp_word)MP_MASK);
    }
    
-   /* copy out */
-
-   /* A = A/b^n */
+   /* copy out, A = A/b^n 
+    *
+    * The result is A/b^n but instead of converting from an array of mp_word
+    * to mp_digit then calling mp_rshd we just copy them in the right
+    * order 
+    */
    for (ix = 0; ix < m->used + 1; ix++) { 
-       a->dp[ix] = W[ix+m->used];
+       a->dp[ix] = W[ix+m->used] & ((mp_word)MP_MASK);
    }
    
+   /* set the max used */
    a->used = m->used + 1;
 
-   /* zero oldused digits */  
+   /* zero oldused digits, if the input a was larger than 
+    * m->used+1 we'll have to clear the digits */  
    for (; ix < olduse; ix++) {
        a->dp[ix] = 0;
    }
@@ -2951,10 +2975,12 @@ static int fast_mp_montgomery_reduce(mp_int *a, mp_int *m, mp_digit mp)
    /* if A >= m then A = A - m */
    if (mp_cmp_mag(a, m) != MP_LT) {
       if ((res = s_mp_sub(a, m, a)) != MP_OKAY) {
+         DECFUNC();
          return res;
       }
-   }
+   }   
    
+   DECFUNC();
    return MP_OKAY;
 }
 
@@ -3036,7 +3062,7 @@ int mp_montgomery_reduce(mp_int *a, mp_int *m, mp_digit mp)
  */
 static int mp_exptmod_fast(mp_int *G, mp_int *X, mp_int *P, mp_int *Y)
 {
-   mp_int M[64], res;
+   mp_int M[256], res;
    mp_digit buf, mp;
    int err, bitbuf, bitcpy, bitcnt, mode, digidx, x, y, winsize;
    
@@ -3048,12 +3074,14 @@ static int mp_exptmod_fast(mp_int *G, mp_int *X, mp_int *P, mp_int *Y)
    
    /* find window size */
    x = mp_count_bits(X);
-        if (x <= 18)    { winsize = 2; }
-   else if (x <= 84)    { winsize = 3; }
-   else if (x <= 300)   { winsize = 4; }
-   else if (x <= 930)   { winsize = 5; }
-   else                 { winsize = 6; }
-   
+        if (x <= 7)     { winsize = 2; }
+   else if (x <= 36)    { winsize = 3; }
+   else if (x <= 140)   { winsize = 4; }
+   else if (x <= 450)   { winsize = 5; }
+   else if (x <= 1303)  { winsize = 6; }
+   else if (x <= 3529)  { winsize = 7; }
+   else                 { winsize = 8; }
+
    /* init G array */
    for (x = 0; x < (1<<winsize); x++) {
       if ((err = mp_init_size(&M[x], 1)) != MP_OKAY) {
@@ -3072,15 +3100,14 @@ static int mp_exptmod_fast(mp_int *G, mp_int *X, mp_int *P, mp_int *Y)
    
    /* setup result */
    if ((err = mp_init(&res)) != MP_OKAY) {
-      goto __M;
+      goto __RES;
    }
 
    /* now we need R mod m */
-   mp_set(&res, 1);           
-   if ((err = mp_lshd(&res, P->used)) != MP_OKAY) {
+   if ((err = mp_2expt(&res, P->used * DIGIT_BIT)) != MP_OKAY) {
       goto __RES;
    }
-   
+      
    /* res = R mod m */
    if ((err = mp_mod(&res, P, &res)) != MP_OKAY) {
       goto __RES;
@@ -3092,7 +3119,6 @@ static int mp_exptmod_fast(mp_int *G, mp_int *X, mp_int *P, mp_int *Y)
     *
     * The first half of the table is not computed though accept for M[0] and M[1]
     */
-   mp_set(&M[0], 1);
    if ((err = mp_mod(G, P, &M[1])) != MP_OKAY) {
       goto __RES;
    }
@@ -3101,7 +3127,7 @@ static int mp_exptmod_fast(mp_int *G, mp_int *X, mp_int *P, mp_int *Y)
    if ((err = mp_mulmod(&M[1], &res, P, &M[1])) != MP_OKAY) {
       goto __RES;
    }
-   
+      
    /* compute the value at M[1<<(winsize-1)] by squaring M[1] (winsize-1) times */
    if ((err = mp_copy(&M[1], &M[1<<(winsize-1)])) != MP_OKAY) {
       goto __RES;
@@ -3236,10 +3262,9 @@ __M  :
    return err;
 }
 
-
 int mp_exptmod(mp_int *G, mp_int *X, mp_int *P, mp_int *Y)
 {
-   mp_int M[64], res, mu;
+   mp_int M[256], res, mu;
    mp_digit buf;
    int err, bitbuf, bitcpy, bitcnt, mode, digidx, x, y, winsize;
    
@@ -3258,11 +3283,13 @@ int mp_exptmod(mp_int *G, mp_int *X, mp_int *P, mp_int *Y)
 
    /* find window size */
    x = mp_count_bits(X);
-        if (x <= 18)    { winsize = 2; }
-   else if (x <= 84)    { winsize = 3; }
-   else if (x <= 300)   { winsize = 4; }
-   else if (x <= 930)   { winsize = 5; }
-   else                 { winsize = 6; }
+        if (x <= 7)     { winsize = 2; }
+   else if (x <= 36)    { winsize = 3; }
+   else if (x <= 140)   { winsize = 4; }
+   else if (x <= 450)   { winsize = 5; }
+   else if (x <= 1303)  { winsize = 6; }
+   else if (x <= 3529)  { winsize = 7; }
+   else                 { winsize = 8; }
    
    /* init G array */
    for (x = 0; x < (1<<winsize); x++) {
@@ -3289,7 +3316,6 @@ int mp_exptmod(mp_int *G, mp_int *X, mp_int *P, mp_int *Y)
     *
     * The first half of the table is not computed though accept for M[0] and M[1]
     */
-   mp_set(&M[0], 1);
    if ((err = mp_mod(G, P, &M[1])) != MP_OKAY) {
       goto __MU;
    }
@@ -3430,6 +3456,37 @@ __M  :
    return err;
 }
 
+/* computes a = 2^b
+ *
+ * a is zeroed, grown to hold b/DIGIT_BIT + 1 digits and then has bit
+ * (b % DIGIT_BIT) of its top used digit set.
+ *
+ * Returns MP_VAL if b is negative (the digit index and shift count below
+ * would not be valid), the mp_grow error code if the grow fails and
+ * MP_OKAY otherwise.
+ */
+int mp_2expt(mp_int *a, int b)
+{
+   int res;
+
+   if (b < 0) {
+      return MP_VAL;
+   }
+
+   mp_zero(a);
+   if ((res = mp_grow(a, b/DIGIT_BIT + 1)) != MP_OKAY) {
+      return res;
+   }
+   a->used = b/DIGIT_BIT + 1;
+
+   /* shift a mp_digit and not the int constant 1, so the shift is done
+    * in the width of a digit instead of the width of an int */
+   a->dp[b/DIGIT_BIT] = ((mp_digit)1) << (b % DIGIT_BIT);
+
+   return MP_OKAY;
+}
+
+
 /* find the n'th root of an integer 
  *
  * Result found such that (c)^b <= a and (c+1)^b > a 
diff --git a/bn.h b/bn.h
index 6e7bc85..5f39cbd 100644
--- a/bn.h
+++ b/bn.h
@@ -158,6 +158,9 @@ int mp_mul_2(mp_int *a, mp_int *b);
 /* c = a mod 2^d */
 int mp_mod_2d(mp_int *a, int b, mp_int *c);
 
+/* computes a = 2^b */
+int mp_2expt(mp_int *a, int b);
+
 /* ---> Basic arithmetic <--- */
 
 /* b = -a */
diff --git a/bn.pdf b/bn.pdf
index f9c86f2..b8152e1 100644
Binary files a/bn.pdf and b/bn.pdf differ
diff --git a/bn.tex b/bn.tex
index d2aab27..5c8b73e 100644
--- a/bn.tex
+++ b/bn.tex
@@ -1,620 +1,635 @@
-\documentclass{article}
-\begin{document}
-
-\title{LibTomMath v0.10 \\ A Free Multiple Precision Integer Library}
-\author{Tom St Denis \\ tomstdenis@iahu.ca}
-\maketitle
-\newpage
-
-\section{Introduction}
-``LibTomMath'' is a free and open source library that provides multiple-precision integer functions required to form a basis
-of a public key cryptosystem.  LibTomMath is written entire in portable ISO C source code and designed to have an application
-interface much like that of MPI from Michael Fromberger.  
-
-LibTomMath was written from scratch by Tom St Denis but designed to be  drop in replacement for the MPI package.  The 
-algorithms within the library are derived from descriptions as provided in the Handbook of Applied Cryptography and Knuth's
-``The Art of Computer Programming''.  The library has been extensively optimized and should provide quite comparable 
-timings as compared to many free and commercial libraries.
-
-LibTomMath was designed with the following goals in mind:
-\begin{enumerate}
-\item Be a drop in replacement for MPI.
-\item Be much faster than MPI.
-\item Be written entirely in portable C.
-\end{enumerate}
-
-All three goals have been achieved.  Particularly the speed increase goal.  For example, a 512-bit modular exponentiation 
-is eight times faster\footnote{On an Athlon XP with GCC 3.2} with LibTomMath compared to MPI.
-
-Being compatible with MPI means that applications that already use it can be ported fairly quickly.  Currently there are 
-a few differences but there are many similarities.  In fact the average MPI based application can be ported in under 15
-minutes.  
-
-Thanks goes to Michael Fromberger for answering a couple questions and Colin Percival for having the patience and courtesy to
-help debug and suggest optimizations.  They were both of great help!
-
-\section{Building Against LibTomMath}
-
-Building against LibTomMath is very simple because there is only one source file.  Simply add ``bn.c'' to your project and 
-copy both ``bn.c'' and ``bn.h'' into your project directory.  There is no configuration nor building required before hand.
-
-If you are porting an MPI application to LibTomMath the first step will be to remove all references to MPI and replace them 
-with references to LibTomMath.  For example, substitute 
-
-\begin{verbatim}
-#include "mpi.h"
-\end{verbatim}
-
-with 
-
-\begin{verbatim}
-#include "bn.h"
-\end{verbatim}
-
-Remove ``mpi.c'' from your project and replace it with ``bn.c''.
-
-\section{Programming with LibTomMath}
-
-\subsection{The mp\_int Structure}
-All multiple precision integers are stored in a structure called \textbf{mp\_int}.  A multiple precision integer is
-essentially an array of \textbf{mp\_digit}.  mp\_digit is defined at the top of bn.h.  Its type can be changed to suit
-a particular platform.  
-
-For example, when \textbf{MP\_8BIT} is defined\footnote{When building bn.c.} a mp\_digit is a unsigned char and holds 
-seven bits.  Similarly when \textbf{MP\_16BIT} is defined a mp\_digit is a unsigned short and holds 15 bits.  
-By default a mp\_digit is a unsigned long and holds 28 bits.  
-
-The choice of digit is particular to the platform at hand and what available multipliers are provided.  For 
-MP\_8BIT either a $8 \times 8 \Rightarrow 16$ or $16 \times 16 \Rightarrow 16$ multiplier is optimal.  When 
-MP\_16BIT is defined either a $16 \times 16 \Rightarrow 32$ or $32 \times 32 \Rightarrow 32$ multiplier is optimal.  By
-default a $32 \times 32 \Rightarrow 64$ or $64 \times 64 \Rightarrow 64$ multiplier is optimal.  
-
-This gives the library some flexibility.  For example, a i8051 has a $8 \times 8 \Rightarrow 16$ multiplier.  The 
-16-bit x86 instruction set has a $16 \times 16 \Rightarrow 32$ multiplier.  In practice this library is not particularly
-designed for small devices like an i8051 due to the size.  It is possible to strip out functions which are not required 
-to drop the code size.  More realistically the library is well suited to 32 and 64-bit processors that have decent
-integer multipliers.  The AMD Athlon XP and Intel Pentium 4 processors are examples of well suited processors.
-
-Throughout the discussions there will be references to a \textbf{used} and \textbf{alloc} members of an integer.  The
-used member refers to how many digits are actually used in the representation of the integer.  The alloc member refers
-to how many digits have been allocated off the heap.  There is also the $\beta$ quantity which is equal to $2^W$ where 
-$W$ is the number of bits in a digit (default is 28).  
-
-\subsection{Calling Functions}
-Most functions expect pointers to mp\_int's as parameters.   To save on memory usage it is possible to have source
-variables as destinations.  For example:
-\begin{verbatim}
-   mp_add(&x, &y, &x);           /* x = x + y */
-   mp_mul(&x, &z, &x);           /* x = x * z */
-   mp_div_2(&x, &x);             /* x = x / 2 */
-\end{verbatim}
-
-\section{Quick Overview}
-
-\subsection{Basic Functionality}
-Essentially all LibTomMath functions return one of three values to indicate if the function worked as desired.  A 
-function will return \textbf{MP\_OKAY} if the function was successful.  A function will return \textbf{MP\_MEM} if
-it ran out of memory and \textbf{MP\_VAL} if the input was invalid.  
-
-Before an mp\_int can be used it must be initialized with 
-
-\begin{verbatim}
-int mp_init(mp_int *a);
-\end{verbatim}
-
-For example, consider the following.
-
-\begin{verbatim}
-#include "bn.h"
-int main(void)
-{
-   mp_int num;
-   if (mp_init(&num) != MP_OKAY) {
-      printf("Error initializing a mp_int.\n");
-   }
-   return 0;
-}   
-\end{verbatim}
-
-A mp\_int can be freed from memory with
-
-\begin{verbatim}
-void mp_clear(mp_int *a);
-\end{verbatim}
-
-This will zero the memory and free the allocated data.  There are a set of trivial functions to manipulate the 
-value of an mp\_int.  
-
-\begin{verbatim}
-/* set to zero */
-void mp_zero(mp_int *a);
-
-/* set to a digit */
-void mp_set(mp_int *a, mp_digit b);
-
-/* set a 32-bit const */
-int mp_set_int(mp_int *a, unsigned long b);
-
-/* init to a given number of digits */
-int mp_init_size(mp_int *a, int size);
-
-/* copy, b = a */
-int mp_copy(mp_int *a, mp_int *b);
-
-/* inits and copies, a = b */
-int mp_init_copy(mp_int *a, mp_int *b);
-\end{verbatim}
-
-The \textbf{mp\_zero} function will clear the contents of a mp\_int and set it to positive.  The \textbf{mp\_set} function 
-will zero the integer and set the first digit to a value specified.  The \textbf{mp\_set\_int} function will zero the 
-integer and set the first 32-bits to a given value.  It is important to note that using mp\_set can have unintended 
-side effects when either the  MP\_8BIT or MP\_16BIT defines are enabled.  By default the library will accept the 
-ranges of values MPI will (and more).
-
-The \textbf{mp\_init\_size} function will initialize the integer and set the allocated size to a given value.  The 
-allocated digits are zero'ed by default but not marked as used.  The \textbf{mp\_copy} function will copy the digits
-(and sign) of the first parameter into the integer specified by the second parameter.  The \textbf{mp\_init\_copy} will
-initialize the first integer specified and copy the second one into it.  Note that the order is reversed from that of
-mp\_copy.  This odd ``bug'' was kept to maintain compatibility with MPI.
-
-\subsection{Digit Manipulations}
-
-There are a class of functions that provide simple digit manipulations such as shifting and modulo reduction of powers
-of two.  
-
-\begin{verbatim}
-/* right shift by "b" digits */
-void mp_rshd(mp_int *a, int b);
-
-/* left shift by "b" digits */
-int mp_lshd(mp_int *a, int b);
-
-/* c = a / 2^b */
-int mp_div_2d(mp_int *a, int b, mp_int *c);
-
-/* b = a/2 */
-int mp_div_2(mp_int *a, mp_int *b);
-
-/* c = a * 2^b */
-int mp_mul_2d(mp_int *a, int b, mp_int *c);
-
-/* b = a*2 */
-int mp_mul_2(mp_int *a, mp_int *b);
-
-/* c = a mod 2^d */
-int mp_mod_2d(mp_int *a, int b, mp_int *c);
-\end{verbatim}
-
-\subsection{Basic Arithmetic}
-
-Next are the class of functions which provide basic arithmetic.
-
-\begin{verbatim}
-/* b = -a */
-int mp_neg(mp_int *a, mp_int *b);
-
-/* b = |a| */
-int mp_abs(mp_int *a, mp_int *b);
-
-/* compare a to b */
-int mp_cmp(mp_int *a, mp_int *b);
-
-/* compare |a| to |b| */
-int mp_cmp_mag(mp_int *a, mp_int *b);
-
-/* c = a + b */
-int mp_add(mp_int *a, mp_int *b, mp_int *c);
-
-/* c = a - b */
-int mp_sub(mp_int *a, mp_int *b, mp_int *c);
-
-/* c = a * b */
-int mp_mul(mp_int *a, mp_int *b, mp_int *c);
-
-/* b = a^2 */
-int mp_sqr(mp_int *a, mp_int *b);
-
-/* a/b => cb + d == a */
-int mp_div(mp_int *a, mp_int *b, mp_int *c, mp_int *d);
-
-/* c = a mod b, 0 <= c < b  */
-int mp_mod(mp_int *a, mp_int *b, mp_int *c);
-\end{verbatim}
-
-\subsection{Single Digit Functions}
-
-\begin{verbatim}
-/* compare against a single digit */
-int mp_cmp_d(mp_int *a, mp_digit b);
-
-/* c = a + b */
-int mp_add_d(mp_int *a, mp_digit b, mp_int *c);
-
-/* c = a - b */
-int mp_sub_d(mp_int *a, mp_digit b, mp_int *c);
-
-/* c = a * b */
-int mp_mul_d(mp_int *a, mp_digit b, mp_int *c);
-
-/* a/b => cb + d == a */
-int mp_div_d(mp_int *a, mp_digit b, mp_int *c, mp_digit *d);
-
-/* c = a mod b, 0 <= c < b  */
-int mp_mod_d(mp_int *a, mp_digit b, mp_digit *c);
-\end{verbatim}
-
-Note that care should be taken for the value of the digit passed.  By default, any 28-bit integer is a valid digit that can
-be passed into the function.  However, if MP\_8BIT or MP\_16BIT is defined only 7 or 15-bit (respectively) integers 
-can be passed into it.
-
-\subsection{Modular Arithmetic}
-
-There are some trivial modular arithmetic functions.
-
-\begin{verbatim}
-/* d = a + b (mod c) */
-int mp_addmod(mp_int *a, mp_int *b, mp_int *c, mp_int *d);
-
-/* d = a - b (mod c) */
-int mp_submod(mp_int *a, mp_int *b, mp_int *c, mp_int *d);
-
-/* d = a * b (mod c) */
-int mp_mulmod(mp_int *a, mp_int *b, mp_int *c, mp_int *d);
-
-/* c = a * a (mod b) */
-int mp_sqrmod(mp_int *a, mp_int *b, mp_int *c);
-
-/* c = 1/a (mod b) */
-int mp_invmod(mp_int *a, mp_int *b, mp_int *c);
-
-/* c = (a, b) */
-int mp_gcd(mp_int *a, mp_int *b, mp_int *c);
-
-/* c = [a, b] or (a*b)/(a, b) */
-int mp_lcm(mp_int *a, mp_int *b, mp_int *c);
-
-/* find the b'th root of a  */
-int mp_n_root(mp_int *a, mp_digit b, mp_int *c);
-
-/* computes the jacobi c = (a | n) (or Legendre if b is prime)  */
-int mp_jacobi(mp_int *a, mp_int *n, int *c);
-
-/* used to setup the Barrett reduction for a given modulus b */
-int mp_reduce_setup(mp_int *a, mp_int *b);
-
-/* Barrett Reduction, computes a (mod b) with a precomputed value c  
- *
- * Assumes that 0 < a <= b^2, note if 0 > a > -(b^2) then you can merely
- * compute the reduction as -1 * mp_reduce(mp_abs(a)) [pseudo code].
- */
-int mp_reduce(mp_int *a, mp_int *b, mp_int *c);
-
-/* setups the montgomery reduction */
-int mp_montgomery_setup(mp_int *a, mp_digit *mp);
-
-/* computes xR^-1 == x (mod N) via Montgomery Reduction */
-int mp_montgomery_reduce(mp_int *a, mp_int *m, mp_digit mp);
-
-/* d = a^b (mod c) */
-int mp_exptmod(mp_int *a, mp_int *b, mp_int *c, mp_int *d);
-\end{verbatim}
-
-\subsection{Radix Conversions}
-To read or store integers in other formats there are the following functions.
-
-\begin{verbatim}
-int mp_unsigned_bin_size(mp_int *a);
-int mp_read_unsigned_bin(mp_int *a, unsigned char *b, int c);
-int mp_to_unsigned_bin(mp_int *a, unsigned char *b);
-
-int mp_signed_bin_size(mp_int *a);
-int mp_read_signed_bin(mp_int *a, unsigned char *b, int c);
-int mp_to_signed_bin(mp_int *a, unsigned char *b);
-
-int mp_read_radix(mp_int *a, unsigned char *str, int radix);
-int mp_toradix(mp_int *a, unsigned char *str, int radix);
-int mp_radix_size(mp_int *a, int radix);
-\end{verbatim}
-
-The integers are stored in big endian format as most libraries (and MPI) expect.  The \textbf{mp\_read\_radix} and 
-\textbf{mp\_toradix} functions read and write (respectively) null terminated ASCII strings in a given radix.  Valid values
-for the radix are between 2 and 64 (inclusively).  
-
-\section{Function Analysis}
-
-Throughout the function analysis the variable $N$ will denote the average size of an input to a function as measured 
-by the number of digits it has.  The variable $W$ will denote the number of bits per word and $c$ will denote a small
-constant amount of work.  The big-oh notation will be abused slightly to consider numbers that do not grow to infinity.
-That is we shall consider $O(N/2) \ne O(N)$ which is an abuse of the notation.
-
-\subsection{Digit Manipulation Functions}
-The class of digit manipulation functions such as \textbf{mp\_rshd}, \textbf{mp\_lshd} and \textbf{mp\_mul\_2} are all
-very simple functions to analyze.  
-
-\subsubsection{mp\_rshd(mp\_int *a, int b)}
-Shifts $a$ by given number of digits to the right and is equivalent to dividing by $\beta^b$.  The work is performed
-in-place which means the input and output are the same.  If the shift count $b$ is less than or equal to zero 
-the function returns without doing any work.  If the the shift count is larger than the number of digits in $a$ 
-then $a$ is simply zeroed without shifting digits.
-
-This function requires no additional memory and $O(N)$ time.
-
-\subsubsection{mp\_lshd(mp\_int *a, int b)}
-Shifts $a$ by a given number of digits to the left and is equivalent to multiplying by $\beta^b$.  The work
-is performed in-place which means the input and output are the same.  If the shift count $b$ is less than or equal 
-to zero the function returns success without doing any work.
-
-This function requires $O(b)$ additional digits of memory and $O(N)$ time.
-
-\subsubsection{mp\_div\_2d(mp\_int *a, int b, mp\_int *c, mp\_int *d)}
-Shifts $a$ by a given number of \textbf{bits} to the right and is equivalent to dividing by $2^b$.  The shifted number is stored
-in the $c$ parameter.  The remainder of $a/2^b$ is optionally stored in $d$ (if it is not passed as NULL).  
-If the shift count $b$ is less than or equal to zero the function places $a$ in $c$ and returns success.  
-
-This function requires $O(2 \cdot N)$ additional digits of memory and $O(2 \cdot N)$ time.
-
-\subsubsection{mp\_mul\_2d(mp\_int *a, int b, mp\_int *c)}
-Shifts $a$ by a given number of bits to the left and is equivalent to multiplying by $2^b$.  The shifted number
-is placed in the $c$ parameter.  If the shift count $b$ is less than or equal to zero the function places $a$
-in $c$ and returns success.  
-
-This function requires $O(N)$ additional digits of memory and $O(2 \cdot N)$ time.
-
-\subsubsection{mp\_mod\_2d(mp\_int *a, int b, mp\_int *c)}
-Performs the action of reducing $a$ modulo $2^b$ and stores the result in $c$.  If the shift count $b$ is less than 
-or equal to zero the function places $a$ in $c$ and returns success.  
-
-This function requires $O(N)$ additional digits of memory and $O(2 \cdot N)$ time.
-
-\subsection{Basic Arithmetic}
-
-\subsubsection{mp\_cmp(mp\_int *a, mp\_int *b)}
-Performs a \textbf{signed} comparison between $a$ and $b$ returning \textbf{MP\_GT} is $a$ is larger than $b$.
-
-This function requires no additional memory and $O(N)$ time.
-
-\subsubsection{mp\_cmp\_mag(mp\_int *a, mp\_int *b)}
-Performs a \textbf{unsigned} comparison between $a$ and $b$ returning \textbf{MP\_GT} is $a$ is larger than $b$.  Note 
-that this comparison is unsigned which means it will report, for example, $-5 > 3$.  By comparison mp\_cmp will 
-report $-5 < 3$.
-
-This function requires no additional memory and $O(N)$ time.
-
-\subsubsection{mp\_add(mp\_int *a, mp\_int *b, mp\_int *c)}
-Computes $c = a + b$ using signed arithmetic.  Handles the sign of the numbers which means it will subtract as 
-required, e.g. $a + -b$ turns into $a - b$.
-
-This function requires no additional memory and $O(N)$ time.
-
-\subsubsection{mp\_sub(mp\_int *a, mp\_int *b, mp\_int *c)}
-Computes $c = a - b$ using signed arithmetic.   Handles the sign of the numbers which means it will add as 
-required, e.g. $a - -b$ turns into $a + b$.
-
-This function requires no additional memory and $O(N)$ time.
-
-\subsubsection{mp\_mul(mp\_int *a, mp\_int *b, mp\_int *c)}
-Computes $c = a \cdot b$ using signed arithmetic.  Handles the sign of the numbers correctly which means it will 
-correct the sign of the product as required, e.g. $a \cdot -b$ turns into $-ab$.
-
-For relatively small inputs, that is less than 80 digits a standard baseline or comba-baseline multiplier is used.  It
-requires no additional memory and $O(N^2)$ time.  The comba-baseline multiplier is only used if it can safely be used
-without losing carry digits.  The comba method is faster than the baseline method but cannot always be used which is why
-both are provided.  The code will automatically determine when it can be used.  If the digit count is higher
-than 80 for the inputs than a Karatsuba multiplier is used which requires approximately $O(6 \cdot N)$ memory and 
-$O(N^{lg(3)})$ time.  
-
-\subsubsection{mp\_sqr(mp\_int *a, mp\_int *b)}
-Computes $b = a^2$. 
-For relatively small inputs, that is less than 80 digits a modified squaring or comba-squaring algorithm is used.  It
-requires no additional memory and $O((N^2 + N)/2)$ time.  The comba-squaring method is used only if it can be safely used
-without losing carry digits.  After 80 digits a Karatsuba squaring algorithm is used whcih requires approximately 
-$O(4 \cdot N)$ memory and $O(N^{lg(3)})$ time.
-
-\subsubsection{mp\_div(mp\_int *a, mp\_int *b, mp\_int *c, mp\_int *d)}
-Computes $c = \lfloor a/b \rfloor$ and $d \equiv a \mbox{ (mod }b\mbox{)}$.  The division is signed which means the sign
-of the output is not always positive.  The sign of the remainder equals the sign of $a$ while the sign of the 
-quotient equals the product of the ratios $(a/\vert a \vert) \cdot (b/\vert b \vert)$.  Both $c$ and $d$ can be 
-optionally passed as NULL if the value is not desired.  For example, if you want only the quotient of $x/y$ then 
-mp\_div(\&x, \&y, \&z, NULL) is acceptable.
-
-This function requires $O(4 \cdot N)$ memory and $O(3 \cdot N^2)$ time.
-
-\subsubsection{mp\_mod(mp\_int *a, mp\_int *b, mp\_int *c)}
-Computes $c \equiv a \mbox{ (mod }b\mbox{)}$ but with the added condition that $0 \le c < b$.  That is a normal 
-division is performed and if the remainder is negative $b$ is added to it.  Since adding $b$ modulo $b$ is equivalent
-to adding zero ($0 \equiv b \mbox{ (mod }b\mbox{)}$) the result is accurate.  The results are undefined 
-when $b \le 0$, in theory the routine will still give a properly congruent answer but it will not always be positive. 
-
-This function requires $O(4 \cdot N)$ memory and $O(3 \cdot N^2)$ time.
-
-\subsection{Number Theoretic Functions}
-
-\subsubsection{mp\_addmod, mp\_submod, mp\_mulmod, mp\_sqrmod}
-These functions take the time of their host function plus the time it takes to perform a division.  For example, 
-mp\_addmod takes $O(N + 3 \cdot N^2)$ time.  Note that if you are performing many modular operations in a row with
-the same modulus you should consider Barrett reductions.  
-
-Also note that these functions use mp\_mod which means the result are guaranteed to be positive.
-
-\subsubsection{mp\_invmod(mp\_int *a, mp\_int *b, mp\_int *c)}
-This function will find $c = 1/a \mbox{ (mod }b\mbox{)}$ for any value of $a$ such that $(a, b) = 1$ and $b > 0$.  When
-$b$ is odd a ``fast'' variant is used which finds the inverse twice as fast.  
-
-\subsubsection{mp\_gcd(mp\_int *a, mp\_int *b, mp\_int *c)}
-Finds the greatest common divisor of both $a$ and $b$ and places the result in $c$.  Will work with either positive
-or negative inputs.  
-
-Functions requires no additional memory and approximately $O(N \cdot log(N))$ time.
-
-\subsubsection{mp\_lcm(mp\_int *a, mp\_int *b, mp\_int *c)}
-Finds the least common multiple of both $a$ and $b$ and places the result in $c$.  Will work with either positive
-or negative inputs.  This is calculated by dividing the product of $a$ and $b$ by the greatest common divisor of 
-both.  
-
-Functions requires no additional memory and approximately $O(4 \cdot N^2)$ time.
-
-\subsubsection{mp\_n\_root(mp\_int *a, mp\_digit b, mp\_int c)}
-Finds the $b$'th root of $a$ and stores it in $b$.  The roots are found such that $\vert c \vert^b \le \vert a \vert$.  
-Uses the Newton approximation approach which means it converges in $O(log \beta^N)$ time to a final result.  Each iteration
-requires $b$ multiplications and one division for a total work of $O(6N^2 \cdot log \beta^N) = O(6N^3 \cdot log \beta)$.
-
-If the input $a$ is negative and $b$ is even the function returns an error.  Otherwise the function will return a root
-that has a sign that agrees with the sign of $a$.
-
-\subsubsection{mp\_jacobi(mp\_int *a, mp\_int *n, int *c)}
-Computes $c = \left ( {a \over n} \right )$ or the Jacobi function of $(a, n)$ and stores the result in an integer addressed
-by $c$.  Since the result of the Jacobi function $\left ( {a \over n} \right ) \in \lbrace -1, 0, 1 \rbrace$ it seemed
-natural to store the result in a simple C style \textbf{int}.  If $n$ is prime then the Jacobi function produces
-the same results as the Legendre function\footnote{Source: Handbook of Applied Cryptography, pp. 73}.  This means if
-$n$ is prime then $\left ( {a \over n} \right )$ is equal to $1$ if $a$ is a quadratic residue modulo $n$ or $-1$ if 
-it is not.
-
-\subsubsection{mp\_exptmod(mp\_int *a, mp\_int *b, mp\_int *c, mp\_int *d)}
-Computes $d = a^b \mbox{ (mod }c\mbox{)}$ using a sliding window $k$-ary exponentiation algorithm.  For an $\alpha$-bit
-exponent it performs $\alpha$ squarings and at most $\lfloor \alpha/k \rfloor + k$ multiplications.  The value of $k$ is
-chosen to minimize the number of multiplications required for a given value of $\alpha$.  Barrett or Montgomery 
-reductions are used to reduce the squared or multiplied temporary results modulo $c$.
-
-\subsection{Fast Modular Reductions}
-
-\subsubsection{mp\_reduce(mp\_int *a, mp\_int *b, mp\_int *c)}
-Computes a Barrett reduction in-place of $a$ modulo $b$ with respect to $c$.  In essence it computes 
-$a \equiv a \mbox{ (mod }b\mbox{)}$ provided $0 \le a \le b^2$.  The value of $c$ is precomputed with the 
-function mp\_reduce\_setup().
-
-The Barrett reduction function has been optimized to use partial multipliers which means compared to MPI it performs
-have the number of single precision multipliers (\textit{provided they have the same size digits}).  The partial
-multipliers (\textit{one of which is shared with mp\_mul}) both have baseline and comba variants.  Barrett reduction 
-can reduce a number modulo a $n-$digit modulus with approximately $2n^2$ single precision multiplications.  
-
-\subsubsection{mp\_montgomery\_reduce(mp\_int *a, mp\_int *m, mp\_digit mp)}
-Computes a Montgomery reduction in-place of $a$ modulo $b$ with respect to $mp$.  If $b$ is some $n-$digit modulus then
-$R = \beta^{n+1}$.  The result of this function is $aR^{-1} \mbox{ (mod }b\mbox{)}$ provided that $0 \le a \le b^2$.
-The value of $mp$ is precomputed with the function mp\_montgomery\_setup().
-
-The Montgomery reduction comes in two variants.  A standard baseline and a fast comba method.  The baseline routine
-is in fact slower than the Barrett reductions, however, the comba routine is much faster.  Montomgery reduction can 
-reduce a number modulo a $n-$digit modulus with approximately $n^2 + n$ single precision multiplications.  
-
-Note that the final result of a Montgomery reduction is not just the value reduced modulo $b$.  You have to multiply
-by $R$ modulo $b$ to get the real result.  At first that may not seem like such a worthwhile routine, however, the
-exptmod function can be made to take advantage of this such that only one normalization at the end is required.  
-
-\section{Timing Analysis}
-\subsection{Observed Timings}
-A simple test program ``demo.c'' was developed which builds with either MPI or LibTomMath (without modification).  The
-test was conducted on an AMD Athlon XP processor with 266Mhz DDR memory and the GCC 3.2 compiler\footnote{With build
-options ``-O3 -fomit-frame-pointer -funroll-loops''}.    The multiplications and squarings were repeated 100,000 times 
-each while the modular exponentiation (exptmod) were performed 50 times each.  The ``inversions'' refers to multiplicative
-inversions modulo an odd number of a given size.  The RDTSC (Read Time Stamp Counter) instruction was used to measure the 
-time the entire iterations took and was divided by the number of iterations to get an average.  The following results 
-were observed.
-
-\begin{small}
-\begin{center}
-\begin{tabular}{c|c|c|c}
-\hline \textbf{Operation} & \textbf{Size (bits)} & \textbf{Time with MPI (cycles)} & \textbf{Time with LibTomMath (cycles)} \\
-\hline
-Inversion & 128 & 264,083  & 59,782   \\
-Inversion & 256 & 549,370  & 146,915   \\
-Inversion & 512 & 1,675,975  & 367,172   \\
-Inversion & 1024 & 5,237,957  & 1,054,158   \\
-Inversion & 2048 & 17,871,944  & 3,459,683   \\
-Inversion & 4096 & 66,610,468  & 11,834,556   \\
-\hline
-Multiply & 128 & 1,426   & 451     \\
-Multiply & 256 & 2,551   & 958     \\
-Multiply & 512 & 7,913   & 2,476     \\
-Multiply & 1024 & 28,496   & 7,927   \\
-Multiply & 2048 & 109,897   & 28,224     \\
-Multiply & 4096 & 469,970   & 101,171     \\
-\hline 
-Square & 128 & 1,319   & 511     \\
-Square & 256 & 1,776   & 947     \\
-Square & 512 & 5,399  & 2,153    \\
-Square & 1024 & 18,991  & 5,733     \\
-Square & 2048 & 72,126  & 17,621    \\
-Square & 4096 & 306,269  & 67,576   \\
-\hline 
-Exptmod & 512 & 32,021,586  & 3,118,435 \\
-Exptmod & 768 & 97,595,492  & 8,493,633 \\
-Exptmod & 1024 & 223,302,532  & 17,715,899     \\
-Exptmod & 2048 & 1,682,223,369   & 114,936,361      \\
-Exptmod & 2560 & 3,268,615,571   & 229,402,426       \\
-Exptmod & 3072 & 5,597,240,141   & 367,403,840      \\
-Exptmod & 4096 & 13,347,270,891   & 779,058,433      
-
-\end{tabular}
-\end{center}
-\end{small}
-
-Note that the figures do fluctuate but their magnitudes are relatively intact.  The purpose of the chart is not to
-get an exact timing but to compare the two libraries.  For example, in all of the tests the exact time for a 512-bit
-squaring operation was not the same.  The observed times were all approximately 2,500 cycles, more importantly they
-were always faster than the timings observed with MPI by about the same magnitude.  
-
-\subsection{Digit Size}
-The first major constribution to the time savings is the fact that 28 bits are stored per digit instead of the MPI 
-defualt of 16.  This means in many of the algorithms the savings can be considerable.  Consider a baseline multiplier 
-with a 1024-bit input.  With MPI the input would be 64 16-bit digits whereas in LibTomMath it would be 37 28-bit digits.
-A savings of $64^2 - 37^2 = 2727$ single precision multiplications.  
-
-\subsection{Multiplication Algorithms}
-For most inputs a typical baseline $O(n^2)$ multiplier is used which is similar to that of MPI.  There are two variants 
-of the baseline multiplier.  The normal and the fast variants.  The normal baseline multiplier is the exact same as the
-algorithm from MPI.  The fast baseline multiplier is optimized for cases where the number of input digits $N$ is less
-than or equal to $2^{w}/\beta^2$.  Where $w$ is the number of bits in a \textbf{mp\_word}.  By default a mp\_word is
-64-bits which means $N \le 256$ is allowed which represents numbers upto $7168$ bits.
-
-The fast baseline multiplier is optimized by removing the carry operations from the inner loop.  This is often referred
-to as the ``comba'' method since it computes the products a columns first then figures out the carries.  This has the
-effect of making a very simple and paralizable inner loop.
-
-For large inputs, typically 80 digits\footnote{By default that is 2240-bits or more.} or more the Karatsuba method is 
-used.  This method has significant overhead but an asymptotic running time of $O(n^{1.584})$ which means for fairly large
-inputs this method is faster.  The Karatsuba implementation is recursive which means for extremely large inputs they
-will benefit from the algorithm.
-
-MPI only implements the slower baseline multiplier where carries are dealt with in the inner loop.  As a result even at
-smaller numbers (below the Karatsuba cutoff) the LibTomMath multipliers are faster.
-
-\subsection{Squaring Algorithms}
-
-Similar to the multiplication algorithms there are two baseline squaring algorithms.  Both have an asymptotic running
-time of $O((t^2 + t)/2)$.  The normal baseline squaring is the same from MPI and the fast is a ``comba'' squaring
-algorithm.  The comba method is used if the number of digits $N$ is less than $2^{w-1}/\beta^2$ which by default 
-covers numbers upto $3584$ bits.  
-
-There is also a Karatsuba squaring method which achieves a running time of $O(n^{1.584})$ after considerably large
-inputs.
-
-MPI only implements the slower baseline squaring algorithm.  As a result LibTomMath is considerably faster at squaring
-than MPI is.
-
-\subsection{Exponentiation Algorithms}
-
-LibTomMath implements a sliding window $k$-ary left to right exponentiation algorithm.  For a given exponent size $L$ an
-appropriate window size $k$ is chosen.  There are always at most $L$ modular squarings and $\lfloor L/k \rfloor$ modular
-multiplications.   The $k$-ary method works by precomputing values $g(x) = b^x$ for $0 \le x < 2^k$ and a given base 
-$b$.  Then the multiplications are grouped in windows of $k$ bits.  The sliding window technique has the benefit 
-that it can skip multiplications if there are zero bits following or preceding a window.  Consider the exponent 
-$e = 11110001_2$ if $k = 2$ then there will be a two squarings, a multiplication of $g(3)$, two squarings, a multiplication
-of $g(3)$, four squarings and and a multiplication by $g(1)$.  In total there are 8 squarings and 3 multiplications.  
-
-MPI uses a binary square-multiply method.  For the same exponent $e$ it would have had 8 squarings and 5 multiplications.  
-There is a precomputation phase for the method LibTomMath uses but it generally cuts down considerably on the number
-of multiplications.  Consider a 512-bit exponent.  The worst case for the LibTomMath method results in 512 squarings and 
-124 multiplications.  The MPI method would have 512 squarings and 512 multiplications.  Randomly every $2k$ bits another 
-multiplication is saved via the sliding-window technique on top of the savings the $k$-ary method provides.
-
-Both LibTomMath and MPI use Barrett reduction instead of division to reduce the numbers modulo the modulus given.  
-However, LibTomMath can take advantage of the fact that the multiplications required within the Barrett reduction
-do not have to give full precision.  As a result the reduction step is much faster and just as accurate.  The LibTomMath code
-will automatically determine at run-time (e.g. when its called) whether the faster multiplier can be used.  The
-faster multipliers have also been optimized into the two variants (baseline and comba baseline).
-
-As a result of all these changes exponentiation in LibTomMath is much faster than compared to MPI.  
-
-
-
-\end{document}
\ No newline at end of file
+\documentclass{article}
+\begin{document}
+
+\title{LibTomMath v0.11 \\ A Free Multiple Precision Integer Library}
+\author{Tom St Denis \\ tomstdenis@iahu.ca}
+\maketitle
+\newpage
+
+\section{Introduction}
+``LibTomMath'' is a free and open source library that provides multiple-precision integer functions required to form a basis
+of a public key cryptosystem.  LibTomMath is written entirely in portable ISO C source code and designed to have an application
+interface much like that of MPI from Michael Fromberger.  
+
+LibTomMath was written from scratch by Tom St Denis but designed to be a drop-in replacement for the MPI package.  The 
+algorithms within the library are derived from descriptions as provided in the Handbook of Applied Cryptography and Knuth's
+``The Art of Computer Programming''.  The library has been extensively optimized and should provide quite comparable 
+timings as compared to many free and commercial libraries.
+
+LibTomMath was designed with the following goals in mind:
+\begin{enumerate}
+\item Be a drop in replacement for MPI.
+\item Be much faster than MPI.
+\item Be written entirely in portable C.
+\end{enumerate}
+
+All three goals have been achieved.  Particularly the speed increase goal.  For example, a 512-bit modular exponentiation 
+is eight times faster\footnote{On an Athlon XP with GCC 3.2} with LibTomMath compared to MPI.
+
+Being compatible with MPI means that applications that already use it can be ported fairly quickly.  Currently there are 
+a few differences but there are many similarities.  In fact the average MPI based application can be ported in under 15
+minutes.  
+
+Thanks go to Michael Fromberger for answering a couple of questions and Colin Percival for having the patience and courtesy to
+help debug and suggest optimizations.  They were both of great help!
+
+\section{Building Against LibTomMath}
+
+Building against LibTomMath is very simple because there is only one source file.  Simply add ``bn.c'' to your project and 
+copy both ``bn.c'' and ``bn.h'' into your project directory.  There is no configuration nor building required beforehand.
+
+If you are porting an MPI application to LibTomMath the first step will be to remove all references to MPI and replace them 
+with references to LibTomMath.  For example, substitute 
+
+\begin{verbatim}
+#include "mpi.h"
+\end{verbatim}
+
+with 
+
+\begin{verbatim}
+#include "bn.h"
+\end{verbatim}
+
+Remove ``mpi.c'' from your project and replace it with ``bn.c''.
+
+\section{Programming with LibTomMath}
+
+\subsection{The mp\_int Structure}
+All multiple precision integers are stored in a structure called \textbf{mp\_int}.  A multiple precision integer is
+essentially an array of \textbf{mp\_digit}.  mp\_digit is defined at the top of bn.h.  Its type can be changed to suit
+a particular platform.  
+
+For example, when \textbf{MP\_8BIT} is defined\footnote{When building bn.c.} a mp\_digit is an unsigned char and holds 
+seven bits.  Similarly when \textbf{MP\_16BIT} is defined a mp\_digit is an unsigned short and holds 15 bits.  
+By default a mp\_digit is an unsigned long and holds 28 bits.  
+
+The choice of digit is particular to the platform at hand and what available multipliers are provided.  For 
+MP\_8BIT either an $8 \times 8 \Rightarrow 16$ or $16 \times 16 \Rightarrow 16$ multiplier is optimal.  When 
+MP\_16BIT is defined either a $16 \times 16 \Rightarrow 32$ or $32 \times 32 \Rightarrow 32$ multiplier is optimal.  By
+default a $32 \times 32 \Rightarrow 64$ or $64 \times 64 \Rightarrow 64$ multiplier is optimal.  
+
+This gives the library some flexibility.  For example, an i8051 has an $8 \times 8 \Rightarrow 16$ multiplier.  The 
+16-bit x86 instruction set has a $16 \times 16 \Rightarrow 32$ multiplier.  In practice this library is not particularly
+designed for small devices like an i8051 due to the size.  It is possible to strip out functions which are not required 
+to drop the code size.  More realistically the library is well suited to 32 and 64-bit processors that have decent
+integer multipliers.  The AMD Athlon XP and Intel Pentium 4 processors are examples of well suited processors.
+
+Throughout the discussions there will be references to the \textbf{used} and \textbf{alloc} members of an integer.  The
+used member refers to how many digits are actually used in the representation of the integer.  The alloc member refers
+to how many digits have been allocated off the heap.  There is also the $\beta$ quantity which is equal to $2^W$ where 
+$W$ is the number of bits in a digit (default is 28).  
+
+\subsection{Calling Functions}
+Most functions expect pointers to mp\_int's as parameters.   To save on memory usage it is possible to have source
+variables as destinations.  For example:
+\begin{verbatim}
+   mp_add(&x, &y, &x);           /* x = x + y */
+   mp_mul(&x, &z, &x);           /* x = x * z */
+   mp_div_2(&x, &x);             /* x = x / 2 */
+\end{verbatim}
+
+\section{Quick Overview}
+
+\subsection{Basic Functionality}
+Essentially all LibTomMath functions return one of three values to indicate if the function worked as desired.  A 
+function will return \textbf{MP\_OKAY} if the function was successful.  A function will return \textbf{MP\_MEM} if
+it ran out of memory and \textbf{MP\_VAL} if the input was invalid.  
+
+Before an mp\_int can be used it must be initialized with 
+
+\begin{verbatim}
+int mp_init(mp_int *a);
+\end{verbatim}
+
+For example, consider the following.
+
+\begin{verbatim}
+#include "bn.h"
+int main(void)
+{
+   mp_int num;
+   if (mp_init(&num) != MP_OKAY) {
+      printf("Error initializing a mp_int.\n");
+   }
+   return 0;
+}   
+\end{verbatim}
+
+A mp\_int can be freed from memory with
+
+\begin{verbatim}
+void mp_clear(mp_int *a);
+\end{verbatim}
+
+This will zero the memory and free the allocated data.  There is a set of trivial functions to manipulate the 
+value of an mp\_int.  
+
+\begin{verbatim}
+/* set to zero */
+void mp_zero(mp_int *a);
+
+/* set to a digit */
+void mp_set(mp_int *a, mp_digit b);
+
+/* set a 32-bit const */
+int mp_set_int(mp_int *a, unsigned long b);
+
+/* init to a given number of digits */
+int mp_init_size(mp_int *a, int size);
+
+/* copy, b = a */
+int mp_copy(mp_int *a, mp_int *b);
+
+/* inits and copies, a = b */
+int mp_init_copy(mp_int *a, mp_int *b);
+\end{verbatim}
+
+The \textbf{mp\_zero} function will clear the contents of a mp\_int and set it to positive.  The \textbf{mp\_set} function 
+will zero the integer and set the first digit to a value specified.  The \textbf{mp\_set\_int} function will zero the 
+integer and set the first 32-bits to a given value.  It is important to note that using mp\_set can have unintended 
+side effects when either the  MP\_8BIT or MP\_16BIT defines are enabled.  By default the library will accept the 
+ranges of values MPI will (and more).
+
+The \textbf{mp\_init\_size} function will initialize the integer and set the allocated size to a given value.  The 
+allocated digits are zero'ed by default but not marked as used.  The \textbf{mp\_copy} function will copy the digits
+(and sign) of the first parameter into the integer specified by the second parameter.  The \textbf{mp\_init\_copy} will
+initialize the first integer specified and copy the second one into it.  Note that the order is reversed from that of
+mp\_copy.  This odd ``bug'' was kept to maintain compatibility with MPI.
+
+\subsection{Digit Manipulations}
+
+There is a class of functions that provide simple digit manipulations such as shifting and modulo reduction of powers
+of two.  
+
+\begin{verbatim}
+/* right shift by "b" digits */
+void mp_rshd(mp_int *a, int b);
+
+/* left shift by "b" digits */
+int mp_lshd(mp_int *a, int b);
+
+/* c = a / 2^b */
+int mp_div_2d(mp_int *a, int b, mp_int *c);
+
+/* b = a/2 */
+int mp_div_2(mp_int *a, mp_int *b);
+
+/* c = a * 2^b */
+int mp_mul_2d(mp_int *a, int b, mp_int *c);
+
+/* b = a*2 */
+int mp_mul_2(mp_int *a, mp_int *b);
+
+/* c = a mod 2^b */
+int mp_mod_2d(mp_int *a, int b, mp_int *c);
+\end{verbatim}
+
+\subsection{Basic Arithmetic}
+
+Next are the class of functions which provide basic arithmetic.
+
+\begin{verbatim}
+/* b = -a */
+int mp_neg(mp_int *a, mp_int *b);
+
+/* b = |a| */
+int mp_abs(mp_int *a, mp_int *b);
+
+/* compare a to b */
+int mp_cmp(mp_int *a, mp_int *b);
+
+/* compare |a| to |b| */
+int mp_cmp_mag(mp_int *a, mp_int *b);
+
+/* c = a + b */
+int mp_add(mp_int *a, mp_int *b, mp_int *c);
+
+/* c = a - b */
+int mp_sub(mp_int *a, mp_int *b, mp_int *c);
+
+/* c = a * b */
+int mp_mul(mp_int *a, mp_int *b, mp_int *c);
+
+/* b = a^2 */
+int mp_sqr(mp_int *a, mp_int *b);
+
+/* a/b => cb + d == a */
+int mp_div(mp_int *a, mp_int *b, mp_int *c, mp_int *d);
+
+/* c = a mod b, 0 <= c < b  */
+int mp_mod(mp_int *a, mp_int *b, mp_int *c);
+\end{verbatim}
+
+\subsection{Single Digit Functions}
+
+\begin{verbatim}
+/* compare against a single digit */
+int mp_cmp_d(mp_int *a, mp_digit b);
+
+/* c = a + b */
+int mp_add_d(mp_int *a, mp_digit b, mp_int *c);
+
+/* c = a - b */
+int mp_sub_d(mp_int *a, mp_digit b, mp_int *c);
+
+/* c = a * b */
+int mp_mul_d(mp_int *a, mp_digit b, mp_int *c);
+
+/* a/b => cb + d == a */
+int mp_div_d(mp_int *a, mp_digit b, mp_int *c, mp_digit *d);
+
+/* c = a mod b, 0 <= c < b  */
+int mp_mod_d(mp_int *a, mp_digit b, mp_digit *c);
+\end{verbatim}
+
+Note that care should be taken for the value of the digit passed.  By default, any 28-bit integer is a valid digit that can
+be passed into the function.  However, if MP\_8BIT or MP\_16BIT is defined only 7 or 15-bit (respectively) integers 
+can be passed into it.
+
+\subsection{Modular Arithmetic}
+
+There are some trivial modular arithmetic functions.
+
+\begin{verbatim}
+/* d = a + b (mod c) */
+int mp_addmod(mp_int *a, mp_int *b, mp_int *c, mp_int *d);
+
+/* d = a - b (mod c) */
+int mp_submod(mp_int *a, mp_int *b, mp_int *c, mp_int *d);
+
+/* d = a * b (mod c) */
+int mp_mulmod(mp_int *a, mp_int *b, mp_int *c, mp_int *d);
+
+/* c = a * a (mod b) */
+int mp_sqrmod(mp_int *a, mp_int *b, mp_int *c);
+
+/* c = 1/a (mod b) */
+int mp_invmod(mp_int *a, mp_int *b, mp_int *c);
+
+/* c = (a, b) */
+int mp_gcd(mp_int *a, mp_int *b, mp_int *c);
+
+/* c = [a, b] or (a*b)/(a, b) */
+int mp_lcm(mp_int *a, mp_int *b, mp_int *c);
+
+/* find the b'th root of a  */
+int mp_n_root(mp_int *a, mp_digit b, mp_int *c);
+
+/* computes the jacobi c = (a | n) (or Legendre if n is prime)  */
+int mp_jacobi(mp_int *a, mp_int *n, int *c);
+
+/* used to setup the Barrett reduction for a given modulus b */
+int mp_reduce_setup(mp_int *a, mp_int *b);
+
+/* Barrett Reduction, computes a (mod b) with a precomputed value c  
+ *
+ * Assumes that 0 < a <= b^2, note if 0 > a > -(b^2) then you can merely
+ * compute the reduction as -1 * mp_reduce(mp_abs(a)) [pseudo code].
+ */
+int mp_reduce(mp_int *a, mp_int *b, mp_int *c);
+
+/* setups the montgomery reduction */
+int mp_montgomery_setup(mp_int *a, mp_digit *mp);
+
+/* computes xR^-1 == x (mod N) via Montgomery Reduction */
+int mp_montgomery_reduce(mp_int *a, mp_int *m, mp_digit mp);
+
+/* d = a^b (mod c) */
+int mp_exptmod(mp_int *a, mp_int *b, mp_int *c, mp_int *d);
+\end{verbatim}
+
+\subsection{Radix Conversions}
+To read or store integers in other formats there are the following functions.
+
+\begin{verbatim}
+int mp_unsigned_bin_size(mp_int *a);
+int mp_read_unsigned_bin(mp_int *a, unsigned char *b, int c);
+int mp_to_unsigned_bin(mp_int *a, unsigned char *b);
+
+int mp_signed_bin_size(mp_int *a);
+int mp_read_signed_bin(mp_int *a, unsigned char *b, int c);
+int mp_to_signed_bin(mp_int *a, unsigned char *b);
+
+int mp_read_radix(mp_int *a, unsigned char *str, int radix);
+int mp_toradix(mp_int *a, unsigned char *str, int radix);
+int mp_radix_size(mp_int *a, int radix);
+\end{verbatim}
+
+The integers are stored in big endian format as most libraries (and MPI) expect.  The \textbf{mp\_read\_radix} and 
+\textbf{mp\_toradix} functions read and write (respectively) null terminated ASCII strings in a given radix.  Valid values
+for the radix are between 2 and 64 (inclusively).  
+
+\section{Function Analysis}
+
+Throughout the function analysis the variable $N$ will denote the average size of an input to a function as measured 
+by the number of digits it has.  The variable $W$ will denote the number of bits per word and $c$ will denote a small
+constant amount of work.  The big-oh notation will be abused slightly to consider numbers that do not grow to infinity.
+That is we shall consider $O(N/2) \ne O(N)$ which is an abuse of the notation.
+
+\subsection{Digit Manipulation Functions}
+The class of digit manipulation functions such as \textbf{mp\_rshd}, \textbf{mp\_lshd} and \textbf{mp\_mul\_2} are all
+very simple functions to analyze.  
+
+\subsubsection{mp\_rshd(mp\_int *a, int b)}
+Shifts $a$ by given number of digits to the right and is equivalent to dividing by $\beta^b$.  The work is performed
+in-place which means the input and output are the same.  If the shift count $b$ is less than or equal to zero 
+the function returns without doing any work.  If the shift count is larger than the number of digits in $a$ 
+then $a$ is simply zeroed without shifting digits.
+
+This function requires no additional memory and $O(N)$ time.
+
+\subsubsection{mp\_lshd(mp\_int *a, int b)}
+Shifts $a$ by a given number of digits to the left and is equivalent to multiplying by $\beta^b$.  The work
+is performed in-place which means the input and output are the same.  If the shift count $b$ is less than or equal 
+to zero the function returns success without doing any work.
+
+This function requires $O(b)$ additional digits of memory and $O(N)$ time.
+
+\subsubsection{mp\_div\_2d(mp\_int *a, int b, mp\_int *c, mp\_int *d)}
+Shifts $a$ by a given number of \textbf{bits} to the right and is equivalent to dividing by $2^b$.  The shifted number is stored
+in the $c$ parameter.  The remainder of $a/2^b$ is optionally stored in $d$ (if it is not passed as NULL).  
+If the shift count $b$ is less than or equal to zero the function places $a$ in $c$ and returns success.  
+
+This function requires $O(2 \cdot N)$ additional digits of memory and $O(2 \cdot N)$ time.
+
+\subsubsection{mp\_mul\_2d(mp\_int *a, int b, mp\_int *c)}
+Shifts $a$ by a given number of bits to the left and is equivalent to multiplying by $2^b$.  The shifted number
+is placed in the $c$ parameter.  If the shift count $b$ is less than or equal to zero the function places $a$
+in $c$ and returns success.  
+
+This function requires $O(N)$ additional digits of memory and $O(2 \cdot N)$ time.
+
+\subsubsection{mp\_mod\_2d(mp\_int *a, int b, mp\_int *c)}
+Performs the action of reducing $a$ modulo $2^b$ and stores the result in $c$.  If the shift count $b$ is less than 
+or equal to zero the function places $a$ in $c$ and returns success.  
+
+This function requires $O(N)$ additional digits of memory and $O(2 \cdot N)$ time.
+
+\subsection{Basic Arithmetic}
+
+\subsubsection{mp\_cmp(mp\_int *a, mp\_int *b)}
+Performs a \textbf{signed} comparison between $a$ and $b$ returning \textbf{MP\_GT} if $a$ is larger than $b$.
+
+This function requires no additional memory and $O(N)$ time.
+
+\subsubsection{mp\_cmp\_mag(mp\_int *a, mp\_int *b)}
+Performs an \textbf{unsigned} comparison between $a$ and $b$ returning \textbf{MP\_GT} if $a$ is larger than $b$.  Note 
+that this comparison is unsigned which means it will report, for example, $-5 > 3$.  By comparison mp\_cmp will 
+report $-5 < 3$.
+
+This function requires no additional memory and $O(N)$ time.
+
+\subsubsection{mp\_add(mp\_int *a, mp\_int *b, mp\_int *c)}
+Computes $c = a + b$ using signed arithmetic.  Handles the sign of the numbers which means it will subtract as 
+required, e.g. $a + -b$ turns into $a - b$.
+
+This function requires no additional memory and $O(N)$ time.
+
+\subsubsection{mp\_sub(mp\_int *a, mp\_int *b, mp\_int *c)}
+Computes $c = a - b$ using signed arithmetic.   Handles the sign of the numbers which means it will add as 
+required, e.g. $a - -b$ turns into $a + b$.
+
+This function requires no additional memory and $O(N)$ time.
+
+\subsubsection{mp\_mul(mp\_int *a, mp\_int *b, mp\_int *c)}
+Computes $c = a \cdot b$ using signed arithmetic.  Handles the sign of the numbers correctly which means it will 
+correct the sign of the product as required, e.g. $a \cdot -b$ turns into $-ab$.
+
+For relatively small inputs, that is less than 80 digits a standard baseline or comba-baseline multiplier is used.  It
+requires no additional memory and $O(N^2)$ time.  The comba-baseline multiplier is only used if it can safely be used
+without losing carry digits.  The comba method is faster than the baseline method but cannot always be used which is why
+both are provided.  The code will automatically determine when it can be used.  If the digit count is higher
+than 80 for the inputs then a Karatsuba multiplier is used which requires approximately $O(6 \cdot N)$ memory and 
+$O(N^{lg(3)})$ time.  
+
+\subsubsection{mp\_sqr(mp\_int *a, mp\_int *b)}
+Computes $b = a^2$. 
+For relatively small inputs, that is less than 80 digits a modified squaring or comba-squaring algorithm is used.  It
+requires no additional memory and $O((N^2 + N)/2)$ time.  The comba-squaring method is used only if it can be safely used
+without losing carry digits.  After 80 digits a Karatsuba squaring algorithm is used which requires approximately 
+$O(4 \cdot N)$ memory and $O(N^{lg(3)})$ time.
+
+\subsubsection{mp\_div(mp\_int *a, mp\_int *b, mp\_int *c, mp\_int *d)}
+Computes $c = \lfloor a/b \rfloor$ and $d \equiv a \mbox{ (mod }b\mbox{)}$.  The division is signed which means the sign
+of the output is not always positive.  The sign of the remainder equals the sign of $a$ while the sign of the 
+quotient equals the product of the ratios $(a/\vert a \vert) \cdot (b/\vert b \vert)$.  Both $c$ and $d$ can be 
+optionally passed as NULL if the value is not desired.  For example, if you want only the quotient of $x/y$ then 
+mp\_div(\&x, \&y, \&z, NULL) is acceptable.
+
+This function requires $O(4 \cdot N)$ memory and $O(3 \cdot N^2)$ time.
+
+\subsubsection{mp\_mod(mp\_int *a, mp\_int *b, mp\_int *c)}
+Computes $c \equiv a \mbox{ (mod }b\mbox{)}$ but with the added condition that $0 \le c < b$.  That is a normal 
+division is performed and if the remainder is negative $b$ is added to it.  Since adding $b$ modulo $b$ is equivalent
+to adding zero ($0 \equiv b \mbox{ (mod }b\mbox{)}$) the result is accurate.  The results are undefined 
+when $b \le 0$, in theory the routine will still give a properly congruent answer but it will not always be positive. 
+
+This function requires $O(4 \cdot N)$ memory and $O(3 \cdot N^2)$ time.
+
+\subsection{Number Theoretic Functions}
+
+\subsubsection{mp\_addmod, mp\_submod, mp\_mulmod, mp\_sqrmod}
+These functions take the time of their host function plus the time it takes to perform a division.  For example, 
+mp\_addmod takes $O(N + 3 \cdot N^2)$ time.  Note that if you are performing many modular operations in a row with
+the same modulus you should consider Barrett reductions.  
+
+Also note that these functions use mp\_mod which means the results are guaranteed to be positive.
+
+\subsubsection{mp\_invmod(mp\_int *a, mp\_int *b, mp\_int *c)}
+This function will find $c = 1/a \mbox{ (mod }b\mbox{)}$ for any value of $a$ such that $(a, b) = 1$ and $b > 0$.  When
+$b$ is odd a ``fast'' variant is used which finds the inverse twice as fast.  
+
+\subsubsection{mp\_gcd(mp\_int *a, mp\_int *b, mp\_int *c)}
+Finds the greatest common divisor of both $a$ and $b$ and places the result in $c$.  Will work with either positive
+or negative inputs.  
+
+This function requires no additional memory and approximately $O(N \cdot log(N))$ time.
+
+\subsubsection{mp\_lcm(mp\_int *a, mp\_int *b, mp\_int *c)}
+Finds the least common multiple of both $a$ and $b$ and places the result in $c$.  Will work with either positive
+or negative inputs.  This is calculated by dividing the product of $a$ and $b$ by the greatest common divisor of 
+both.  
+
+This function requires no additional memory and approximately $O(4 \cdot N^2)$ time.
+
+\subsubsection{mp\_n\_root(mp\_int *a, mp\_digit b, mp\_int *c)}
+Finds the $b$'th root of $a$ and stores it in $c$.  The roots are found such that $\vert c \vert^b \le \vert a \vert$.  
+Uses the Newton approximation approach which means it converges in $O(log \beta^N)$ time to a final result.  Each iteration
+requires $b$ multiplications and one division for a total work of $O(6N^2 \cdot log \beta^N) = O(6N^3 \cdot log \beta)$.
+
+If the input $a$ is negative and $b$ is even the function returns an error.  Otherwise the function will return a root
+that has a sign that agrees with the sign of $a$.
+
+\subsubsection{mp\_jacobi(mp\_int *a, mp\_int *n, int *c)}
+Computes $c = \left ( {a \over n} \right )$ or the Jacobi function of $(a, n)$ and stores the result in an integer addressed
+by $c$.  Since the result of the Jacobi function $\left ( {a \over n} \right ) \in \lbrace -1, 0, 1 \rbrace$ it seemed
+natural to store the result in a simple C style \textbf{int}.  If $n$ is prime then the Jacobi function produces
+the same results as the Legendre function\footnote{Source: Handbook of Applied Cryptography, pp. 73}.  This means if
+$n$ is prime then $\left ( {a \over n} \right )$ is equal to $1$ if $a$ is a quadratic residue modulo $n$ or $-1$ if 
+it is not.
+
+\subsubsection{mp\_exptmod(mp\_int *a, mp\_int *b, mp\_int *c, mp\_int *d)}
+Computes $d = a^b \mbox{ (mod }c\mbox{)}$ using a sliding window $k$-ary exponentiation algorithm.  For an $\alpha$-bit
+exponent it performs $\alpha$ squarings and at most $\lfloor \alpha/k \rfloor + 2^{k-1}$ multiplications.  The value of $k$ is
+chosen to minimize the number of multiplications required for a given value of $\alpha$.  Barrett or Montgomery 
+reductions are used to reduce the squared or multiplied temporary results modulo $c$.
+
+\subsection{Fast Modular Reductions}
+
+\subsubsection{mp\_reduce(mp\_int *a, mp\_int *b, mp\_int *c)}
+Computes a Barrett reduction in-place of $a$ modulo $b$ with respect to $c$.  In essence it computes 
+$a \equiv a \mbox{ (mod }b\mbox{)}$ provided $0 \le a \le b^2$.  The value of $c$ is precomputed with the 
+function mp\_reduce\_setup().  The modulus $b$ must be larger than zero.
+
+The Barrett reduction function has been optimized to use partial multipliers which means compared to MPI it performs
+half the number of single precision multiplications (\textit{provided they have the same size digits}).  The partial
+multipliers (\textit{one of which is shared with mp\_mul}) both have baseline and comba variants.  Barrett reduction 
+can reduce a number modulo a $n-$digit modulus with approximately $2n^2$ single precision multiplications.  
+
+\subsubsection{mp\_montgomery\_reduce(mp\_int *a, mp\_int *m, mp\_digit mp)}
+Computes a Montgomery reduction in-place of $a$ modulo $b$ (passed as the parameter $m$) with respect to $mp$.  If $b$ is some $n-$digit modulus then
+$R = \beta^{n+1}$.  The result of this function is $aR^{-1} \mbox{ (mod }b\mbox{)}$ provided that $0 \le a \le b^2$.
+The value of $mp$ is precomputed with the function mp\_montgomery\_setup().  The modulus $b$ must be odd and larger
+than zero.  
+
+The Montgomery reduction comes in two variants.  A standard baseline and a fast comba method.  The baseline routine
+is in fact slower than the Barrett reductions, however, the comba routine is much faster.  Montgomery reduction can 
+reduce a number modulo a $n-$digit modulus with approximately $n^2 + n$ single precision multiplications.  Compared
+to Barrett reductions the Montgomery reduction requires half as many multiplications as $n \rightarrow \infty$.  
+
+Note that the final result of a Montgomery reduction is not just the value reduced modulo $b$.  You have to multiply
+by $R$ modulo $b$ to get the real result.  At first that may not seem like such a worthwhile routine, however, the
+exptmod function can be made to take advantage of this such that only one normalization at the end is required.
+
+This stems from the fact that if $a \rightarrow aR^{-1}$ through Montgomery reduction and if $a = vR$ and $b = uR$ then
+$a^2 \rightarrow v^2R^2R^{-1} \equiv v^2R$ and $ab \rightarrow uvRRR^{-1} \equiv uvR$.  The next useful observation is 
+that through the reduction $a \rightarrow vRR^{-1} \equiv v$ which means given a final result it can be normalized with
+a single reduction.  Now a series of complicated modular operations can be optimized if all the variables are initially
+multiplied by $R$ then the final result normalized by performing an extra reduction.
+
+If many variables are to be normalized the simplest method to set up the variables is to first compute $\hat x \equiv R^2 \mbox{ mod }m$.
+Now all the variables in the system can be multiplied by $\hat x$ and reduced with Montgomery reduction.  This means that
+two long divisions would be required to set up $\hat x$ and a multiplication followed by reduction for each variable.  
+
+A very useful observation is that multiplying by $R = \beta^n$ amounts to performing a left shift by $n$ positions which
+requires no single precision multiplications.  
+
+\section{Timing Analysis}
+\subsection{Observed Timings}
+A simple test program ``demo.c'' was developed which builds with either MPI or LibTomMath (without modification).  The
+test was conducted on an AMD Athlon XP processor with 266MHz DDR memory and the GCC 3.2 compiler\footnote{With build
+options ``-O3 -fomit-frame-pointer -funroll-loops''}.    The multiplications and squarings were repeated 100,000 times 
+each while the modular exponentiations (exptmod) were performed 50 times each.  The ``inversions'' refers to multiplicative
+inversions modulo an odd number of a given size.  The RDTSC (Read Time Stamp Counter) instruction was used to measure the 
+time the entire iterations took and was divided by the number of iterations to get an average.  The following results 
+were observed.
+
+\begin{small}
+\begin{center}
+\begin{tabular}{c|c|c|c}
+\hline \textbf{Operation} & \textbf{Size (bits)} & \textbf{Time with MPI (cycles)} & \textbf{Time with LibTomMath (cycles)} \\
+\hline
+Inversion & 128 & 264,083  & 59,782   \\
+Inversion & 256 & 549,370  & 146,915   \\
+Inversion & 512 & 1,675,975  & 367,172   \\
+Inversion & 1024 & 5,237,957  & 1,054,158   \\
+Inversion & 2048 & 17,871,944  & 3,459,683   \\
+Inversion & 4096 & 66,610,468  & 11,834,556   \\
+\hline
+Multiply & 128 & 1,426   & 451     \\
+Multiply & 256 & 2,551   & 958     \\
+Multiply & 512 & 7,913   & 2,476     \\
+Multiply & 1024 & 28,496   & 7,927   \\
+Multiply & 2048 & 109,897   & 28,224     \\
+Multiply & 4096 & 469,970   & 101,171     \\
+\hline 
+Square & 128 & 1,319   & 511     \\
+Square & 256 & 1,776   & 947     \\
+Square & 512 & 5,399  & 2,153    \\
+Square & 1024 & 18,991  & 5,733     \\
+Square & 2048 & 72,126  & 17,621    \\
+Square & 4096 & 306,269  & 67,576   \\
+\hline 
+Exptmod & 512 & 32,021,586  & 3,118,435 \\
+Exptmod & 768 & 97,595,492  & 8,493,633 \\
+Exptmod & 1024 & 223,302,532  & 17,715,899     \\
+Exptmod & 2048 & 1,682,223,369   & 114,936,361      \\
+Exptmod & 2560 & 3,268,615,571   & 229,402,426       \\
+Exptmod & 3072 & 5,597,240,141   & 367,403,840      \\
+Exptmod & 4096 & 13,347,270,891   & 779,058,433      
+
+\end{tabular}
+\end{center}
+\end{small}
+
+Note that the figures do fluctuate but their magnitudes are relatively intact.  The purpose of the chart is not to
+get an exact timing but to compare the two libraries.  For example, in all of the tests the exact time for a 512-bit
+squaring operation was not the same.  The observed times were all approximately 2,500 cycles, more importantly they
+were always faster than the timings observed with MPI by about the same magnitude.  
+
+\subsection{Digit Size}
+The first major contribution to the time savings is the fact that 28 bits are stored per digit instead of the MPI 
+default of 16.  This means in many of the algorithms the savings can be considerable.  Consider a baseline multiplier 
+with a 1024-bit input.  With MPI the input would be 64 16-bit digits whereas in LibTomMath it would be 37 28-bit digits.
+A savings of $64^2 - 37^2 = 2727$ single precision multiplications.  
+
+\subsection{Multiplication Algorithms}
+For most inputs a typical baseline $O(n^2)$ multiplier is used which is similar to that of MPI.  There are two variants 
+of the baseline multiplier.  The normal and the fast variants.  The normal baseline multiplier is the exact same as the
+algorithm from MPI.  The fast baseline multiplier is optimized for cases where the number of input digits $N$ is less
+than or equal to $2^{w}/\beta^2$.  Where $w$ is the number of bits in a \textbf{mp\_word}.  By default a mp\_word is
+64-bits which means $N \le 256$ is allowed which represents numbers up to $7168$ bits.
+
+The fast baseline multiplier is optimized by removing the carry operations from the inner loop.  This is often referred
+to as the ``comba'' method since it computes the products by columns first then figures out the carries.  This has the
+effect of making a very simple and parallelizable inner loop.
+
+For large inputs, typically 80 digits\footnote{By default that is 2240-bits or more.} or more the Karatsuba method is 
+used.  This method has significant overhead but an asymptotic running time of $O(n^{1.584})$ which means for fairly large
+inputs this method is faster.  The Karatsuba implementation is recursive, which means that extremely large inputs
+will benefit from the algorithm.
+
+MPI only implements the slower baseline multiplier where carries are dealt with in the inner loop.  As a result even at
+smaller numbers (below the Karatsuba cutoff) the LibTomMath multipliers are faster.
+
+\subsection{Squaring Algorithms}
+
+Similar to the multiplication algorithms there are two baseline squaring algorithms.  Both have an asymptotic running
+time of $O((t^2 + t)/2)$.  The normal baseline squaring is the same as in MPI and the fast one is a ``comba'' squaring
+algorithm.  The comba method is used if the number of digits $N$ is less than $2^{w-1}/\beta^2$ which by default 
+covers numbers up to $3584$ bits.  
+
+There is also a Karatsuba squaring method which achieves a running time of $O(n^{1.584})$ after considerably large
+inputs.
+
+MPI only implements the slower baseline squaring algorithm.  As a result LibTomMath is considerably faster at squaring
+than MPI is.
+
+\subsection{Exponentiation Algorithms}
+
+LibTomMath implements a sliding window $k$-ary left to right exponentiation algorithm.  For a given exponent size $L$ an
+appropriate window size $k$ is chosen.  There are always at most $L$ modular squarings and $\lfloor L/k \rfloor$ modular
+multiplications.   The $k$-ary method works by precomputing values $g(x) = b^x$ for $0 \le x < 2^k$ and a given base 
+$b$.  Then the multiplications are grouped in windows of $k$ bits.  The sliding window technique has the benefit 
+that it can skip multiplications if there are zero bits following or preceding a window.  Consider the exponent 
+$e = 11110001_2$.  If $k = 2$ then there will be two squarings, a multiplication by $g(3)$, two squarings, a multiplication
+by $g(3)$, four squarings and a multiplication by $g(1)$.  In total there are 8 squarings and 3 multiplications.  
+
+MPI uses a binary square-multiply method.  For the same exponent $e$ it would have had 8 squarings and 5 multiplications.  
+There is a precomputation phase for the method LibTomMath uses but it generally cuts down considerably on the number
+of multiplications.  Consider a 512-bit exponent.  The worst case for the LibTomMath method results in 512 squarings and 
+124 multiplications.  The MPI method would have 512 squarings and 512 multiplications.  Randomly every $2k$ bits another 
+multiplication is saved via the sliding-window technique on top of the savings the $k$-ary method provides.
+
+Both LibTomMath and MPI use Barrett reduction instead of division to reduce the numbers modulo the modulus given.  
+However, LibTomMath can take advantage of the fact that the multiplications required within the Barrett reduction
+do not have to give full precision.  As a result the reduction step is much faster and just as accurate.  The LibTomMath code
+will automatically determine at run-time (i.e. when it is called) whether the faster multiplier can be used.  The
+faster multipliers have also been optimized into the two variants (baseline and comba baseline).
+
+As a result of all these changes exponentiation in LibTomMath is much faster than in MPI.  
+
+
+
+\end{document}
diff --git a/changes.txt b/changes.txt
index d302ee6..e78a2a3 100644
--- a/changes.txt
+++ b/changes.txt
@@ -1,70 +1,76 @@
-Jan 9th, 2003
-v0.10  -- Pekka Riikonen suggested fixes to the radix conversion code.  
-       -- Added baseline montgomery and comba montgomery reductions, sped up exptmods
-          [to a point, see bn.h for MONTGOMERY_EXPT_CUTOFF]
-       
-Jan 6th, 2003
-v0.09  -- Updated the manual to reflect recent changes.  :-)
-       -- Added Jacobi function (mp_jacobi) to supplement the number theory side of the lib
-       -- Added a Mersenne prime finder demo in ./etc/mersenne.c
-
-Jan 2nd, 2003
-v0.08  -- Sped up the multipliers by moving the inner loop variables into a smaller scope
-       -- Corrected a bunch of small "warnings"
-       -- Added more comments
-       -- Made "mtest" be able to use /dev/random, /dev/urandom or stdin for RNG data
-       -- Corrected some bugs where error messages were potentially ignored
-       -- add etc/pprime.c program which makes numbers which are provably prime.
-       
-Jan 1st, 2003
-v0.07  -- Removed alot of heap operations from core functions to speed them up
-       -- Added a root finding function [and mp_sqrt macro like from MPI]
-       -- Added more to manual 
-
-Dec 31st, 2002
-v0.06  -- Sped up the s_mp_add, s_mp_sub which inturn sped up mp_invmod, mp_exptmod, etc...
-       -- Cleaned up the header a bit more
-       
-Dec 30th, 2002
-v0.05  -- Builds with MSVC out of the box
-       -- Fixed a bug in mp_invmod w.r.t. even moduli
-       -- Made mp_toradix and mp_read_radix use char instead of unsigned char arrays
-       -- Fixed up exptmod to use fewer multiplications
-       -- Fixed up mp_init_size to use only one heap operation
-          -- Note there is a slight "off-by-one" bug in the library somewhere
-             without the padding (see the source for comment) the library 
-             crashes in libtomcrypt.  Anyways a reasonable workaround is to pad the
-             numbers which will always correct it since as the numbers grow the padding
-             will still be beyond the end of the number
-       -- Added more to the manual
-       
-Dec 29th, 2002
-v0.04  -- Fixed a memory leak in mp_to_unsigned_bin
-       -- optimized invmod code
-       -- Fixed bug in mp_div
-       -- use exchange instead of copy for results
-       -- added a bit more to the manual
-       
-Dec 27th, 2002
-v0.03  -- Sped up s_mp_mul_high_digs by not computing the carries of the lower digits
-       -- Fixed a bug where mp_set_int wouldn't zero the value first and set the used member.
-       -- fixed a bug in s_mp_mul_high_digs where the limit placed on the result digits was not calculated properly
-       -- fixed bugs in add/sub/mul/sqr_mod functions where if the modulus and dest were the same it wouldn't work
-       -- fixed a bug in mp_mod and mp_mod_d concerning negative inputs
-       -- mp_mul_d didn't preserve sign
-       -- Many many many many fixes
-       -- Works in LibTomCrypt now :-)
-       -- Added iterations to the timing demos... more accurate.
-       -- Tom needs a job.       
-
-Dec 26th, 2002
-v0.02  -- Fixed a few "slips" in the manual.  This is "LibTomMath" afterall :-)
-       -- Added mp_cmp_mag, mp_neg, mp_abs and mp_radix_size that were missing.
-       -- Sped up the fast [comba] multipliers more [yahoo!]
-
-Dec 25th,2002
-v0.01  -- Initial release.  Gimme a break.
-       -- Todo list, 
-           add details to manual [e.g. algorithms]
-           more comments in code
-           example programs
\ No newline at end of file
+Jan 15th, 2003
+v0.11  -- More subtle fixes
+       -- Moved to gentoo linux [hurrah!] so made *nix specific fixes to the make process
+       -- Sped up the montgomery reduction code quite a bit
+       -- fixed up demo so when building timing for the x86 it assumes ELF format now
+       
+Jan 9th, 2003
+v0.10  -- Pekka Riikonen suggested fixes to the radix conversion code.  
+       -- Added baseline montgomery and comba montgomery reductions, sped up exptmods
+          [to a point, see bn.h for MONTGOMERY_EXPT_CUTOFF]
+       
+Jan 6th, 2003
+v0.09  -- Updated the manual to reflect recent changes.  :-)
+       -- Added Jacobi function (mp_jacobi) to supplement the number theory side of the lib
+       -- Added a Mersenne prime finder demo in ./etc/mersenne.c
+
+Jan 2nd, 2003
+v0.08  -- Sped up the multipliers by moving the inner loop variables into a smaller scope
+       -- Corrected a bunch of small "warnings"
+       -- Added more comments
+       -- Made "mtest" be able to use /dev/random, /dev/urandom or stdin for RNG data
+       -- Corrected some bugs where error messages were potentially ignored
+       -- add etc/pprime.c program which makes numbers which are provably prime.
+       
+Jan 1st, 2003
+v0.07  -- Removed alot of heap operations from core functions to speed them up
+       -- Added a root finding function [and mp_sqrt macro like from MPI]
+       -- Added more to manual 
+
+Dec 31st, 2002
+v0.06  -- Sped up the s_mp_add, s_mp_sub which inturn sped up mp_invmod, mp_exptmod, etc...
+       -- Cleaned up the header a bit more
+       
+Dec 30th, 2002
+v0.05  -- Builds with MSVC out of the box
+       -- Fixed a bug in mp_invmod w.r.t. even moduli
+       -- Made mp_toradix and mp_read_radix use char instead of unsigned char arrays
+       -- Fixed up exptmod to use fewer multiplications
+       -- Fixed up mp_init_size to use only one heap operation
+          -- Note there is a slight "off-by-one" bug in the library somewhere
+             without the padding (see the source for comment) the library 
+             crashes in libtomcrypt.  Anyways a reasonable workaround is to pad the
+             numbers which will always correct it since as the numbers grow the padding
+             will still be beyond the end of the number
+       -- Added more to the manual
+       
+Dec 29th, 2002
+v0.04  -- Fixed a memory leak in mp_to_unsigned_bin
+       -- optimized invmod code
+       -- Fixed bug in mp_div
+       -- use exchange instead of copy for results
+       -- added a bit more to the manual
+       
+Dec 27th, 2002
+v0.03  -- Sped up s_mp_mul_high_digs by not computing the carries of the lower digits
+       -- Fixed a bug where mp_set_int wouldn't zero the value first and set the used member.
+       -- fixed a bug in s_mp_mul_high_digs where the limit placed on the result digits was not calculated properly
+       -- fixed bugs in add/sub/mul/sqr_mod functions where if the modulus and dest were the same it wouldn't work
+       -- fixed a bug in mp_mod and mp_mod_d concerning negative inputs
+       -- mp_mul_d didn't preserve sign
+       -- Many many many many fixes
+       -- Works in LibTomCrypt now :-)
+       -- Added iterations to the timing demos... more accurate.
+       -- Tom needs a job.       
+
+Dec 26th, 2002
+v0.02  -- Fixed a few "slips" in the manual.  This is "LibTomMath" afterall :-)
+       -- Added mp_cmp_mag, mp_neg, mp_abs and mp_radix_size that were missing.
+       -- Sped up the fast [comba] multipliers more [yahoo!]
+
+Dec 25th,2002
+v0.01  -- Initial release.  Gimme a break.
+       -- Todo list, 
+           add details to manual [e.g. algorithms]
+           more comments in code
+           example programs
diff --git a/demo.c b/demo.c
index ab92707..f482120 100644
--- a/demo.c
+++ b/demo.c
@@ -19,8 +19,10 @@
 
 #ifdef TIMER_X86
 #define TIMER
-extern ulong64 rdtsc(void);
-extern void reset(void);
+extern ulong64 _rdtsc(void);              /* defined outside this file; timer.asm exports a label with this name */
+extern void _reset(void);                 /* defined outside this file; timer.asm exports a label with this name */
+ulong64 rdtsc(void) { return _rdtsc(); }  /* thin wrapper: forwards to _rdtsc() and returns its value unchanged */
+void reset(void) { _reset(); }            /* thin wrapper: forwards to _reset() */
 #endif
 
 #ifdef TIMER
@@ -85,7 +87,6 @@ int main(void)
    mp_int a, b, c, d, e, f;
    unsigned long expt_n, add_n, sub_n, mul_n, div_n, sqr_n, mul2d_n, div2d_n, gcd_n, lcm_n, inv_n;
    int rr;
-   mp_digit tom;
    
 #ifdef TIMER
    int n;
@@ -99,42 +100,33 @@ int main(void)
    mp_init(&e);
    mp_init(&f);
    
-   mp_read_radix(&a, "59994534535345535344389423", 10);
-   mp_read_radix(&b, "49993453555234234565675534", 10);
-   mp_read_radix(&c, "62398923474472948723847281", 10);
-    
-   mp_mulmod(&a, &b, &c, &f);
-   
-   /* setup mont */
-   mp_montgomery_setup(&c, &tom);
-   mp_mul(&a, &b, &a);
-   mp_montgomery_reduce(&a, &c, tom);
-   mp_montgomery_reduce(&a, &c, tom);
-   mp_lshd(&a, c.used*2);
-   mp_mod(&a, &c, &a);
-   
-   mp_toradix(&a, cmd, 10);
-   printf("%s\n\n", cmd);
-   mp_toradix(&f, cmd, 10);
-   printf("%s\n", cmd);
-   
-/*   return 0; */
-   
-   
-   mp_read_radix(&a, "V//////////////////////////////////////////////////////////////////////////////////////", 64);
-   mp_reduce_setup(&b, &a);
-   printf("\n\n----\n\n");
-   mp_toradix(&b, buf, 10);
-   printf("b == %s\n\n\n", buf);
-
-   mp_read_radix(&b, "4982748972349724892742", 10);
-   mp_sub_d(&a, 1, &c);
-   mp_exptmod(&b, &c, &a, &d);
-   mp_toradix(&d, buf, 10);
-   printf("b^p-1 == %s\n", buf);
-   
+#ifdef DEBUG
+   mp_read_radix(&a, "347743159439876626079252796797422223177535447388206607607181663903045907591201940478223621722118173270898487582987137708656414344685816179420855160986340457973820182883508387588163122354089264395604796675278966117567294812714812796820596564876450716066283126720010859041484786529056457896367683122960411136319", 10);
+   mp_read_radix(&b, "347743159439876626079252796797422223177535447388206607607181663903045907591201940478223621722118173270898487582987137708656414344685816179420855160986340457973820182883508387588163122354089264395604796675278966117567294812714812796820596564876450716066283126720010859041484786529056457896367683122960411136318", 10);
+   mp_set(&c, 1);
+   reset_timings();
+   mp_exptmod(&c, &b, &a, &d);
+   mp_exptmod(&c, &b, &a, &d);
+   mp_exptmod(&c, &b, &a, &d);
+   mp_exptmod(&c, &b, &a, &d);
+   mp_exptmod(&c, &b, &a, &d);
+   mp_exptmod(&c, &b, &a, &d);
+   mp_exptmod(&c, &b, &a, &d);
+   mp_exptmod(&c, &b, &a, &d);
+   mp_exptmod(&c, &b, &a, &d);
+   mp_exptmod(&c, &b, &a, &d);
+   mp_exptmod(&c, &b, &a, &d);
+   mp_exptmod(&c, &b, &a, &d);
+   mp_exptmod(&c, &b, &a, &d);
+   mp_exptmod(&c, &b, &a, &d);
+   mp_exptmod(&c, &b, &a, &d);
+   mp_exptmod(&c, &b, &a, &d);
+   dump_timings();
+   return 0;
+#endif   
       
 #ifdef TIMER      
+goto expt;
       mp_read_radix(&a, "340282366920938463463374607431768211455", 10);
       mp_read_radix(&b, "340282366920938463463574607431768211455", 10);
       while (a.used * DIGIT_BIT < 8192) {
@@ -182,7 +174,7 @@ int main(void)
       printf("Multiplying %d-bit took %llu cycles\n", mp_count_bits(&a), tt / ((ulong64)100000));
       mp_copy(&b, &a);
    }
-
+expt:
    {
       char *primes[] = {
          "17933601194860113372237070562165128350027320072176844226673287945873370751245439587792371960615073855669274087805055507977323024886880985062002853331424203",
@@ -206,7 +198,7 @@ int main(void)
       mp_mod(&b, &c, &b);
       mp_set(&c, 3);
       reset();
-      for (rr = 0; rr < 35; rr++) {
+      for (rr = 0; rr < 100; rr++) {
           mp_exptmod(&c, &b, &a, &d);
       }
       tt = rdtsc();
@@ -219,7 +211,7 @@ int main(void)
          draw(&d);
          exit(0);
       }
-      printf("Exponentiating %d-bit took %llu cycles\n", mp_count_bits(&a), tt / ((ulong64)35));
+      printf("Exponentiating %d-bit took %llu cycles\n", mp_count_bits(&a), tt / ((ulong64)100));
    }
    }   
 
diff --git a/makefile b/makefile
index edaf773..7567d22 100644
--- a/makefile
+++ b/makefile
@@ -1,13 +1,13 @@
 CC = gcc
-CFLAGS  +=  -Wall -W -Wshadow -ansi -O3 -fomit-frame-pointer -funroll-loops 
+CFLAGS  +=  -Wall -W -Wshadow -ansi -O3 -fomit-frame-pointer -funroll-loops
 
-VERSION=0.10
+VERSION=0.11
 
 default: test
 
 test: bn.o demo.o
 	$(CC) bn.o demo.o -o demo
-	cd mtest ; gcc $(CFLAGS) mtest.c -o mtest.exe -s
+	cd mtest ; gcc $(CFLAGS) mtest.c -o mtest -s
 
 # builds the x86 demo
 test86:
@@ -22,9 +22,9 @@ docs:	docdvi
 	rm -f bn.log bn.aux bn.dvi
 	
 clean:
-	rm -f *.pdf *.o *.exe mtest/*.exe etc/*.exe bn.log bn.aux bn.dvi *.s 
+	rm -f *.pdf *.o *.exe demo mtest/mtest mtest/*.exe etc/*.exe bn.log bn.aux bn.dvi *.log *.s etc/pprime etc/mersenne
 
 zipup: clean docs
-	chdir .. ; rm -rf ltm* libtommath-$(VERSION) ; mkdir libtommath-$(VERSION) ; \
+	cd .. ; rm -rf ltm* libtommath-$(VERSION) ; mkdir libtommath-$(VERSION) ; \
 	cp -R ./libtommath/* ./libtommath-$(VERSION)/ ; tar -c libtommath-$(VERSION)/* > ltm-$(VERSION).tar ; \
-	bzip2 -9vv ltm-$(VERSION).tar ; zip -9 -r ltm-$(VERSION).zip libtommath-$(VERSION)/*
\ No newline at end of file
+	bzip2 -9vv ltm-$(VERSION).tar ; zip -9 -r ltm-$(VERSION).zip libtommath-$(VERSION)/*
diff --git a/timer.asm b/timer.asm
index 2393250..b317e3e 100644
--- a/timer.asm
+++ b/timer.asm
@@ -1,34 +1,34 @@
-; Simple RDTSC reader for NASM
-;
-; build with "nasm -f ___ timer.asm" where ___ is coff or elf [or whatever]
-;
-; Most *nix installs use elf so it would be "nasm -f elf timer.asm"
-;
-; Tom St Denis
-[bits 32]
-[section .data]
-timer dd 0, 0
-[section .text]
-
-[global _gettsc]
-_gettsc:
-   rdtsc
-   ret
-
-[global _rdtsc]
-_rdtsc:
-   rdtsc
-   sub eax,[timer]
-   sbb edx,[timer+4]
-   ret
-
-[global _reset]
-_reset:
-   push eax
-   push edx
-   rdtsc
-   mov [timer],eax 
-   mov [timer+4],edx
-   pop edx
-   pop eax
-   ret
\ No newline at end of file
+; Simple RDTSC reader for NASM
+;
+; build with "nasm -f ___ timer.asm" where ___ is coff or elf [or whatever]
+;
+; Most *nix installs use elf so it would be "nasm -f elf timer.asm"
+;
+; Tom St Denis
+[bits 32]
+[section .data]
+timer dd 0, 0
+[section .text]
+
+[global _gettsc]          ; export the label so other object files can link to it
+_gettsc:                  ; leaves the raw time stamp counter in EDX:EAX
+   rdtsc                  ; EDX:EAX = current time stamp counter
+   ret
+
+[global _rdtsc]           ; export the label so other object files can link to it
+_rdtsc:                   ; leaves (current time stamp counter - value stored in timer) in EDX:EAX
+   rdtsc                  ; EDX:EAX = current time stamp counter
+   sub eax,[timer]        ; subtract the low dword of the stored value
+   sbb edx,[timer+4]      ; subtract the high dword, including the borrow from the low half
+   ret
+
+[global _reset]           ; export the label so other object files can link to it
+_reset:                   ; stores the current time stamp counter in timer; EAX and EDX are preserved
+   push eax               ; save EAX and EDX, which rdtsc overwrites
+   push edx
+   rdtsc                  ; EDX:EAX = current time stamp counter
+   mov [timer],eax        ; low dword of the stored value
+   mov [timer+4],edx      ; high dword of the stored value
+   pop edx                ; restore in the reverse order of the pushes
+   pop eax
+   ret