added libtommath-0.30

2004-04-11 20:46:22 +00:00 · 2004-04-11 20:46:22 +00:00 · 350578d400
commit 350578d400
parent 6c48a9b3a6
66 changed files with 7073 additions and 5858 deletions
--- a/bn.ilg
+++ b/bn.ilg
@ -1,6 +1,6 @@
 This is makeindex, version 2.14 [02-Oct-2002] (kpathsea + Thai support).
-Scanning input file bn.idx....done (57 entries accepted, 0 rejected).
-Sorting entries....done (342 comparisons).
-Generating output file bn.ind....done (60 lines written, 0 warnings).
+Scanning input file bn.idx....done (79 entries accepted, 0 rejected).
+Sorting entries....done (511 comparisons).
+Generating output file bn.ind....done (82 lines written, 0 warnings).
 Output written in bn.ind.
 Transcript written in bn.ilg.
--- a/bn.ind
+++ b/bn.ind
@ -1,60 +1,82 @@
 \begin{theindex}

-  \item mp\_add, \hyperpage{23}
-  \item mp\_and, \hyperpage{23}
+  \item mp\_add, \hyperpage{25}
+  \item mp\_add\_d, \hyperpage{48}
+  \item mp\_and, \hyperpage{25}
  \item mp\_clear, \hyperpage{7}
  \item mp\_clear\_multi, \hyperpage{8}
-  \item mp\_cmp, \hyperpage{18}
-  \item mp\_cmp\_d, \hyperpage{20}
-  \item mp\_cmp\_mag, \hyperpage{17}
-  \item mp\_div, \hyperpage{29}
-  \item mp\_div\_2, \hyperpage{21}
-  \item mp\_div\_2d, \hyperpage{22}
-  \item MP\_EQ, \hyperpage{17}
+  \item mp\_cmp, \hyperpage{20}
+  \item mp\_cmp\_d, \hyperpage{21}
+  \item mp\_cmp\_mag, \hyperpage{19}
+  \item mp\_div, \hyperpage{26}
+  \item mp\_div\_2, \hyperpage{22}
+  \item mp\_div\_2d, \hyperpage{24}
+  \item mp\_div\_d, \hyperpage{48}
+  \item mp\_dr\_reduce, \hyperpage{36}
+  \item mp\_dr\_setup, \hyperpage{36}
+  \item MP\_EQ, \hyperpage{18}
  \item mp\_error\_to\_string, \hyperpage{6}
-  \item mp\_expt\_d, \hyperpage{31}
-  \item mp\_exptmod, \hyperpage{31}
-  \item mp\_exteuclid, \hyperpage{39}
-  \item mp\_gcd, \hyperpage{39}
+  \item mp\_expt\_d, \hyperpage{39}
+  \item mp\_exptmod, \hyperpage{39}
+  \item mp\_exteuclid, \hyperpage{47}
+  \item mp\_gcd, \hyperpage{47}
+  \item mp\_get\_int, \hyperpage{16}
  \item mp\_grow, \hyperpage{12}
-  \item MP\_GT, \hyperpage{17}
+  \item MP\_GT, \hyperpage{18}
  \item mp\_init, \hyperpage{7}
  \item mp\_init\_copy, \hyperpage{9}
  \item mp\_init\_multi, \hyperpage{8}
+  \item mp\_init\_set, \hyperpage{17}
+  \item mp\_init\_set\_int, \hyperpage{17}
  \item mp\_init\_size, \hyperpage{10}
  \item mp\_int, \hyperpage{6}
-  \item mp\_invmod, \hyperpage{40}
-  \item mp\_jacobi, \hyperpage{40}
-  \item mp\_lcm, \hyperpage{39}
-  \item mp\_lshd, \hyperpage{23}
-  \item MP\_LT, \hyperpage{17}
+  \item mp\_invmod, \hyperpage{48}
+  \item mp\_jacobi, \hyperpage{48}
+  \item mp\_lcm, \hyperpage{47}
+  \item mp\_lshd, \hyperpage{24}
+  \item MP\_LT, \hyperpage{18}
  \item MP\_MEM, \hyperpage{5}
-  \item mp\_mul, \hyperpage{25}
-  \item mp\_mul\_2, \hyperpage{21}
-  \item mp\_mul\_2d, \hyperpage{22}
-  \item mp\_n\_root, \hyperpage{31}
-  \item mp\_neg, \hyperpage{24}
+  \item mp\_mod, \hyperpage{31}
+  \item mp\_mod\_d, \hyperpage{48}
+  \item mp\_montgomery\_calc\_normalization, \hyperpage{34}
+  \item mp\_montgomery\_reduce, \hyperpage{33}
+  \item mp\_montgomery\_setup, \hyperpage{33}
+  \item mp\_mul, \hyperpage{27}
+  \item mp\_mul\_2, \hyperpage{22}
+  \item mp\_mul\_2d, \hyperpage{24}
+  \item mp\_mul\_d, \hyperpage{48}
+  \item mp\_n\_root, \hyperpage{40}
+  \item mp\_neg, \hyperpage{25}
  \item MP\_NO, \hyperpage{5}
  \item MP\_OKAY, \hyperpage{5}
-  \item mp\_or, \hyperpage{23}
-  \item mp\_prime\_fermat, \hyperpage{33}
-  \item mp\_prime\_is\_divisible, \hyperpage{33}
-  \item mp\_prime\_is\_prime, \hyperpage{34}
-  \item mp\_prime\_miller\_rabin, \hyperpage{33}
-  \item mp\_prime\_next\_prime, \hyperpage{34}
-  \item mp\_prime\_rabin\_miller\_trials, \hyperpage{34}
-  \item mp\_prime\_random, \hyperpage{35}
-  \item mp\_radix\_size, \hyperpage{37}
-  \item mp\_read\_radix, \hyperpage{37}
-  \item mp\_rshd, \hyperpage{23}
+  \item mp\_or, \hyperpage{25}
+  \item mp\_prime\_fermat, \hyperpage{41}
+  \item mp\_prime\_is\_divisible, \hyperpage{41}
+  \item mp\_prime\_is\_prime, \hyperpage{42}
+  \item mp\_prime\_miller\_rabin, \hyperpage{41}
+  \item mp\_prime\_next\_prime, \hyperpage{42}
+  \item mp\_prime\_rabin\_miller\_trials, \hyperpage{42}
+  \item mp\_prime\_random, \hyperpage{43}
+  \item mp\_prime\_random\_ex, \hyperpage{43}
+  \item mp\_radix\_size, \hyperpage{45}
+  \item mp\_read\_radix, \hyperpage{45}
+  \item mp\_read\_unsigned\_bin, \hyperpage{46}
+  \item mp\_reduce, \hyperpage{32}
+  \item mp\_reduce\_2k, \hyperpage{37}
+  \item mp\_reduce\_2k\_setup, \hyperpage{37}
+  \item mp\_reduce\_setup, \hyperpage{32}
+  \item mp\_rshd, \hyperpage{24}
  \item mp\_set, \hyperpage{15}
  \item mp\_set\_int, \hyperpage{16}
  \item mp\_shrink, \hyperpage{11}
-  \item mp\_sqr, \hyperpage{25}
-  \item mp\_sub, \hyperpage{23}
-  \item mp\_toradix, \hyperpage{37}
+  \item mp\_sqr, \hyperpage{29}
+  \item mp\_sub, \hyperpage{25}
+  \item mp\_sub\_d, \hyperpage{48}
+  \item mp\_to\_unsigned\_bin, \hyperpage{46}
+  \item mp\_toradix, \hyperpage{45}
+  \item mp\_unsigned\_bin\_size, \hyperpage{46}
  \item MP\_VAL, \hyperpage{5}
-  \item mp\_xor, \hyperpage{23}
+  \item mp\_xor, \hyperpage{25}
  \item MP\_YES, \hyperpage{5}

 \end{theindex}
--- a/bn.pdf
+++ b/bn.pdf
--- a/bn.tex
+++ b/bn.tex
@ -49,7 +49,7 @@
 \begin{document}
 \frontmatter
 \pagestyle{empty}
-\title{LibTomMath User Manual \\ v0.28}
+\title{LibTomMath User Manual \\ v0.30}
 \author{Tom St Denis \\ tomstdenis@iahu.ca}
 \maketitle
 This text, the library and the accompanying textbook are all hereby placed in the public domain.  This book has been 
@ -87,7 +87,7 @@ release the textbook ``Implementing Multiple Precision Arithmetic'' has been pla
 release as well.  This textbook is meant to compliment the project by providing a more solid walkthrough of the development
 algorithms used in the library.

-Since both\footnote{Note that the MPI files under mtest/ are copyrighted by Michael Fromberger.} are in the 
+Since both\footnote{Note that the MPI files under mtest/ are copyrighted by Michael Fromberger.  They are not required to use LibTomMath.} are in the 
 public domain everyone is entitled to do with them as they see fit.

 \section{Building LibTomMath}
@ -114,7 +114,7 @@ nmake -f makefile.msvc
 This will build the library and archive the object files in ``tommath.lib''.  This has been tested with MSVC version 6.00
 with service pack 5.  

-Tbere is limited support for making a ``DLL'' in windows via the ``makefile.cygwin\_dll'' makefile.  It requires Cygwin
+There is limited support for making a ``DLL'' in windows via the ``makefile.cygwin\_dll'' makefile.  It requires Cygwin
 to work with since it requires the auto-export/import functionality.  The resulting DLL and imprt library ``libtomcrypt.dll.a''
 can be used to link LibTomMath dynamically to any Windows program using Cygwin.

@ -385,7 +385,7 @@ To initialized and make a copy of an mp\_int the mp\_init\_copy() function has b
 int mp_init_copy (mp_int * a, mp_int * b);
 \end{alltt}

-This function will initialize ``a'' and make it a copy of ``b'' if all goes well.
+This function will initialize $a$ and make it a copy of $b$ if all goes well.

 \begin{small} \begin{alltt}
 int main(void)
@ -420,8 +420,8 @@ you override this behaviour.
 int mp_init_size (mp_int * a, int size);
 \end{alltt}

-The ``size'' parameter must be greater than zero.  If the function succeeds the mp\_int ``a'' will be initialized
-to have ``size'' digits (which are all initially zero).  
+The $size$ parameter must be greater than zero.  If the function succeeds the mp\_int $a$ will be initialized
+to have $size$ digits (which are all initially zero).  

 \begin{small} \begin{alltt}
 int main(void)
@ -453,7 +453,7 @@ digits can be removed to return memory to the heap with the mp\_shrink() functio
 int mp_shrink (mp_int * a);
 \end{alltt}

-This will remove excess digits of the mp\_int ``a''.  If the operation fails the mp\_int should be intact without the
+This will remove excess digits of the mp\_int $a$.  If the operation fails the mp\_int should be intact without the
 excess digits being removed.  Note that you can use a shrunk mp\_int in further computations, however, such operations
 will require heap operations which can be slow.  It is not ideal to shrink mp\_int variables that you will further
 modify in the system (unless you are seriously low on memory).  
@ -502,8 +502,8 @@ your desired size.
 int mp_grow (mp_int * a, int size);
 \end{alltt}

-This will grow the array of digits of ``a'' to ``size''.  If the \textit{alloc} parameter is already bigger than
-``size'' the function will not do anything.
+This will grow the array of digits of $a$ to $size$.  If the \textit{alloc} parameter is already bigger than
+$size$ the function will not do anything.

 \begin{small} \begin{alltt}
 int main(void)
@ -552,7 +552,7 @@ Setting a single digit can be accomplished with the following function.
 void mp_set (mp_int * a, mp_digit b);
 \end{alltt}

-This will zero the contents of ``a'' and make it represent an integer equal to the value of ``b''.  Note that this
+This will zero the contents of $a$ and make it represent an integer equal to the value of $b$.  Note that this
 function has a return type of \textbf{void}.  It cannot cause an error so it is safe to assume the function
 succeeded.

@ -578,20 +578,29 @@ int main(void)
 \}
 \end{alltt} \end{small}

-\subsection{Long Constant}
+\subsection{Long Constants}

-When you want to set a constant that is the size of an ISO C ``unsigned long'' and larger than a single
-digit the following function is provided.
+To set a constant that is the size of an ISO C ``unsigned long'' and larger than a single digit the following function 
+can be used.

 \index{mp\_set\_int}
 \begin{alltt}
 int mp_set_int (mp_int * a, unsigned long b);
 \end{alltt}

-This will assign the value of the 32-bit variable ``b'' to the mp\_int ``a''.  Unlike mp\_set() this function will always
+This will assign the value of the 32-bit variable $b$ to the mp\_int $a$.  Unlike mp\_set() this function will always
 accept a 32-bit input regardless of the size of a single digit.  However, since the value may span several digits 
 this function can fail if it runs out of heap memory.

+To get the ``unsigned long'' copy of an mp\_int the following function can be used.
+
+\index{mp\_get\_int}
+\begin{alltt}
+unsigned long mp_get_int (mp_int * a);
+\end{alltt}
+
+This will return the 32 least significant bits of the mp\_int $a$.  
+
 \begin{small} \begin{alltt}
 int main(void)
 \{
@ -610,6 +619,9 @@ int main(void)
             mp_error_to_string(result));
      return EXIT_FAILURE;
   \}
+
+   printf("number == \%lu", mp_get_int(&number));
+
   /* we're done with it. */ 
   mp_clear(&number);

@ -617,6 +629,58 @@ int main(void)
 \}
 \end{alltt} \end{small}

+This should output the following if the program succeeds.
+
+\begin{alltt}
+number == 654321
+\end{alltt}
+
+\subsection{Initialize and Setting Constants}
+To both initialize and set small constants the following two functions are available.
+\index{mp\_init\_set} \index{mp\_init\_set\_int}
+\begin{alltt}
+int mp_init_set (mp_int * a, mp_digit b);
+int mp_init_set_int (mp_int * a, unsigned long b);
+\end{alltt}
+
+Both functions work like the previous counterparts except they first mp\_init $a$ before setting the values.  
+
+\begin{alltt}
+int main(void)
+\{
+   mp_int number1, number2;
+   int    result;
+
+   /* initialize and set a single digit */
+   if ((result = mp_init_set(&number1, 100)) != MP_OKAY) \{
+      printf("Error setting number1: \%s", 
+             mp_error_to_string(result));
+      return EXIT_FAILURE;
+   \}             
+
+   /* initialize and set a long */
+   if ((result = mp_init_set_int(&number2, 1023)) != MP_OKAY) \{
+      printf("Error setting number2: \%s", 
+             mp_error_to_string(result));
+      return EXIT_FAILURE;
+   \}
+
+   /* display */
+   printf("Number1, Number2 == \%lu, \%lu",
+          mp_get_int(&number1), mp_get_int(&number2));
+
+   /* clear */
+   mp_clear_multi(&number1, &number2, NULL);
+
+   return EXIT_SUCCESS;
+\}
+\end{alltt}
+
+If this program succeeds it shall output.
+\begin{alltt}
+Number1, Number2 == 100, 1023
+\end{alltt}
+
 \section{Comparisons}

 Comparisons in LibTomMath are always performed in a ``left to right'' fashion.  There are three possible return codes
@ -644,13 +708,13 @@ $b$.

 An unsigned comparison considers only the digits themselves and not the associated \textit{sign} flag of the 
 mp\_int structures.  This is analogous to an absolute comparison.  The function mp\_cmp\_mag() will compare two
-mp\_int variables based on their digits only.
+mp\_int variables based on their digits only. 

 \index{mp\_cmp\_mag}
 \begin{alltt}
 int mp_cmp(mp_int * a, mp_int * b);
 \end{alltt}
-This will compare ``a'' to ``b'' placing ``a'' to the left of ``b''.  This function cannot fail and will return one of the
+This will compare $a$ to $b$ placing $a$ to the left of $b$.  This function cannot fail and will return one of the
 three compare codes listed in figure \ref{fig:CMP}.

 \begin{small} \begin{alltt}
@ -707,7 +771,7 @@ To compare two mp\_int variables based on their signed value the mp\_cmp() funct
 int mp_cmp(mp_int * a, mp_int * b);
 \end{alltt}

-This will compare ``a'' to the left of ``b''.  It will first compare the signs of the two mp\_int variables.  If they
+This will compare $a$ to the left of $b$.  It will first compare the signs of the two mp\_int variables.  If they
 differ it will return immediately based on their signs.  If the signs are equal then it will compare the digits
 individually.  This function will return one of the compare conditions codes listed in figure \ref{fig:CMP}.

@ -763,7 +827,7 @@ To compare a single digit against an mp\_int the following function has been pro
 int mp_cmp_d(mp_int * a, mp_digit b);
 \end{alltt}

-This will compare ``a'' to the left of ``b'' using a signed comparison.  Note that it will always treat ``b'' as 
+This will compare $a$ to the left of $b$ using a signed comparison.  Note that it will always treat $b$ as 
 positive.  This function is rather handy when you have to compare against small values such as $1$ (which often
 comes up in cryptography).  The function cannot fail and will return one of the tree compare condition codes
 listed in figure \ref{fig:CMP}.
@ -820,7 +884,7 @@ int mp_mul_2(mp_int * a, mp_int * b);
 int mp_div_2(mp_int * a, mp_int * b);
 \end{alltt}

-The former will assign twice ``a'' to ``b'' while the latter will assign half ``a'' to ``b''.  These functions are fast
+The former will assign twice $a$ to $b$ while the latter will assign half $a$ to $b$.  These functions are fast
 since the shift counts and maskes are hardcoded into the routines.

 \begin{small} \begin{alltt}
@ -883,8 +947,8 @@ Since $10 > 7$ and $5 < 7$.  To multiply by a power of two the following functio
 int mp_mul_2d(mp_int * a, int b, mp_int * c);
 \end{alltt}

-This will multiply ``a'' by $2^b$ and store the result in ``c''.  If the value of $b$ is less than or equal to 
-zero the function will copy ``a'' to ``c'' without performing any further actions.  
+This will multiply $a$ by $2^b$ and store the result in ``c''.  If the value of $b$ is less than or equal to 
+zero the function will copy $a$ to ``c'' without performing any further actions.  

 To divide by a power of two use the following.

@ -892,8 +956,8 @@ To divide by a power of two use the following.
 \begin{alltt}
 int mp_div_2d (mp_int * a, int b, mp_int * c, mp_int * d);
 \end{alltt}
-Which will divide ``a'' by $2^b$, store the quotient in ``c'' and the remainder in ``d'.  If $b \le 0$ then the
-function simply copies ``a'' over to ``c'' and zeroes ``d''.  The variable ``d'' may be passed as a \textbf{NULL}
+Which will divide $a$ by $2^b$, store the quotient in ``c'' and the remainder in ``d'.  If $b \le 0$ then the
+function simply copies $a$ over to ``c'' and zeroes $d$.  The variable $d$ may be passed as a \textbf{NULL}
 value to signal that the remainder is not desired.

 \subsection{Polynomial Basis Operations}
@ -911,14 +975,14 @@ following function provides this operation.
 int mp_lshd (mp_int * a, int b);
 \end{alltt}

-This will multiply ``a'' in place by $x^b$ which is equivalent to shifting the digits left $b$ places and inserting zeroes
+This will multiply $a$ in place by $x^b$ which is equivalent to shifting the digits left $b$ places and inserting zeroes
 in the least significant digits.  Similarly to divide by a power of $x$ the following function is provided.

 \index{mp\_rshd}
 \begin{alltt}
 void mp_rshd (mp_int * a, int b)
 \end{alltt}
-This will divide ``a'' in place by $x^b$ and discard the remainder.  This function cannot fail as it performs the operations
+This will divide $a$ in place by $x^b$ and discard the remainder.  This function cannot fail as it performs the operations
 in place and no new digits are required to complete it.

 \subsection{AND, OR and XOR Operations}
@ -948,7 +1012,6 @@ int mp_sub (mp_int * a, mp_int * b, mp_int * c)
 Which perform $c = a \odot b$ where $\odot$ is one of signed addition or subtraction.  The operations are fully sign
 aware.

-
 \section{Sign Manipulation}
 \subsection{Negation}
 \label{sec:NEG}
@ -959,7 +1022,7 @@ Simple integer negation can be performed with the following.
 int mp_neg (mp_int * a, mp_int * b);
 \end{alltt}

-Which assigns $-b$ to $a$.  
+Which assigns $-a$ to $b$.  

 \subsection{Absolute}
 Simple integer absolutes can be performed with the following.
@ -969,7 +1032,20 @@ Simple integer absolutes can be performed with the following.
 int mp_abs (mp_int * a, mp_int * b);
 \end{alltt}

-Which assigns $\vert b \vert$ to $a$.  
+Which assigns $\vert a \vert$ to $b$.  
+
+\section{Integer Division and Remainder}
+To perform a complete and general integer division with remainder use the following function.
+
+\index{mp\_div}
+\begin{alltt}
+int mp_div (mp_int * a, mp_int * b, mp_int * c, mp_int * d);
+\end{alltt}
+                                                        
+This divides $a$ by $b$ and stores the quotient in $c$ and $d$.  The signed quotient is computed such that 
+$bc + d = a$.  Note that either of $c$ or $d$ can be set to \textbf{NULL} if their value is not required.  If 
+$b$ is zero the function returns \textbf{MP\_VAL}.  
+

 \chapter{Multiplication and Squaring}
 \section{Multiplication}
@ -986,6 +1062,57 @@ sized inputs.  Then followed by the Comba and baseline multipliers.
 Fortunately for the developer you don't really need to know this unless you really want to fine tune the system.  mp\_mul()
 will determine on its own\footnote{Some tweaking may be required.} what routine to use automatically when it is called.

+\begin{alltt}
+int main(void)
+\{
+   mp_int number1, number2;
+   int result;
+
+   /* Initialize the numbers */
+   if ((result = mp_init_multi(&number1, 
+                               &number2, NULL)) != MP_OKAY) \{
+      printf("Error initializing the numbers.  \%s", 
+             mp_error_to_string(result));
+      return EXIT_FAILURE;
+   \}
+
+   /* set the terms */
+   if ((result = mp_set_int(&number, 257)) != MP_OKAY) \{
+      printf("Error setting number1.  \%s", 
+             mp_error_to_string(result));
+      return EXIT_FAILURE;
+   \}
+ 
+   if ((result = mp_set_int(&number2, 1023)) != MP_OKAY) \{
+      printf("Error setting number2.  \%s", 
+             mp_error_to_string(result));
+      return EXIT_FAILURE;
+   \}
+
+   /* multiply them */
+   if ((result = mp_mul(&number1, &number2,
+                        &number1)) != MP_OKAY) \{
+      printf("Error multiplying terms.  \%s", 
+             mp_error_to_string(result));
+      return EXIT_FAILURE;
+   \}
+
+   /* display */
+   printf("number1 * number2 == \%lu", mp_get_int(&number1));
+
+   /* free terms and return */
+   mp_clear_multi(&number1, &number2, NULL);
+
+   return EXIT_SUCCESS;
+\}
+\end{alltt}   
+
+If this program succeeds it shall output the following.
+
+\begin{alltt}
+number1 * number2 == 262911
+\end{alltt}
+
 \section{Squaring}
 Since squaring can be performed faster than multiplication it is performed it's own function instead of just using
 mp\_mul().
@ -995,12 +1122,12 @@ mp\_mul().
 int mp_sqr (mp_int * a, mp_int * b);
 \end{alltt}

-Will square ``a'' and store it in ``b''.  Like the case of multiplication there are four different squaring
-algorithms all which can be called from mp\_sqr().
+Will square $a$ and store it in $b$.  Like the case of multiplication there are four different squaring
+algorithms all which can be called from mp\_sqr().  It is ideal to use mp\_sqr over mp\_mul when squaring terms.

 \section{Tuning Polynomial Basis Routines}

-Both Toom-Cook and Karatsuba multiplication algorithms are faster than the traditional $O(n^2)$ approach that
+Both of the Toom-Cook and Karatsuba multiplication algorithms are faster than the traditional $O(n^2)$ approach that
 the Comba and baseline algorithms use.  At $O(n^{1.464973})$ and $O(n^{1.584962})$ running times respectfully they require 
 considerably less work.  For example, a 10000-digit multiplication would take roughly 724,000 single precision
 multiplications with Toom-Cook or 100,000,000 single precision multiplications with the standard Comba (a factor
@ -1044,30 +1171,286 @@ good Karatsuba squaring and multiplication points.  Then it proceeds to find Too
 tuning takes a very long time as the cutoff points are likely to be very high.

 \chapter{Modular Reduction}
-\section{Integer Division and Remainder}
-To perform a complete and general integer division with remainder use the following function.

-\index{mp\_div}
+Modular reduction is process of taking the remainder of one quantity divided by another.  Expressed 
+as (\ref{eqn:mod}) the modular reduction is equivalent to the remainder of $b$ divided by $c$.  
+
+\begin{equation}
+a \equiv b \mbox{ (mod }c\mbox{)}
+\label{eqn:mod}
+\end{equation}
+
+Of particular interest to cryptography are reductions where $b$ is limited to the range $0 \le b < c^2$ since particularly 
+fast reduction algorithms can be written for the limited range.  
+
+Note that one of the four optimized reduction algorithms are automatically chosen in the modular exponentiation
+algorithm mp\_exptmod when an appropriate modulus is detected.  
+
+\section{Straight Division}
+In order to effect an arbitrary modular reduction the following algorithm is provided.
+
+\index{mp\_mod}
 \begin{alltt}
-int mp_div (mp_int * a, mp_int * b, mp_int * c, mp_int * d);
+int mp_mod(mp_int *a, mp_int *b, mp_int *c);
 \end{alltt}

-This divides ``a'' by ``b'' and stores the quotient in ``c'' and ``d''.  The signed quotient is computed such that 
-$bc + d = a$.  Note that either of ``c'' or ``d'' can be set to \textbf{NULL} if their value is not required.
+This reduces $a$ modulo $b$ and stores the result in $c$.  The sign of $c$ shall agree with the sign 
+of $b$.  This algorithm accepts an input $a$ of any range and is not limited by $0 \le a < b^2$.

 \section{Barrett Reduction}
+
+Barrett reduction is a generic optimized reduction algorithm that requires pre--computation to achieve
+a decent speedup over straight division.  First a $mu$ value must be precomputed with the following function.
+
+\index{mp\_reduce\_setup}
+\begin{alltt}
+int mp_reduce_setup(mp_int *a, mp_int *b);
+\end{alltt}
+
+Given a modulus in $b$ this produces the required $mu$ value in $a$.  For any given modulus this only has to
+be computed once.  Modular reduction can now be performed with the following.
+
+\index{mp\_reduce}
+\begin{alltt}
+int mp_reduce(mp_int *a, mp_int *b, mp_int *c);
+\end{alltt}
+
+This will reduce $a$ in place modulo $b$ with the precomputed $mu$ value in $c$.  $a$ must be in the range
+$0 \le a < b^2$.
+
+\begin{alltt}
+int main(void)
+\{
+   mp_int   a, b, c, mu;
+   int      result;
+
+   /* initialize a,b to desired values, mp_init mu, 
+    * c and set c to 1...we want to compute a^3 mod b 
+    */
+
+   /* get mu value */
+   if ((result = mp_reduce_setup(&mu, b)) != MP_OKAY) \{
+      printf("Error getting mu.  \%s", 
+             mp_error_to_string(result));
+      return EXIT_FAILURE;
+   \}
+
+   /* square a to get c = a^2 */
+   if ((result = mp_sqr(&a, &c)) != MP_OKAY) \{
+      printf("Error squaring.  \%s", 
+             mp_error_to_string(result));
+      return EXIT_FAILURE;
+   \}
+
+   /* now reduce `c' modulo b */
+   if ((result = mp_reduce(&c, &b, &mu)) != MP_OKAY) \{
+      printf("Error reducing.  \%s", 
+             mp_error_to_string(result));
+      return EXIT_FAILURE;
+   \}
+   
+   /* multiply a to get c = a^3 */
+   if ((result = mp_mul(&a, &c, &c)) != MP_OKAY) \{
+      printf("Error reducing.  \%s", 
+             mp_error_to_string(result));
+      return EXIT_FAILURE;
+   \}
+
+   /* now reduce `c' modulo b  */
+   if ((result = mp_reduce(&c, &b, &mu)) != MP_OKAY) \{
+      printf("Error reducing.  \%s", 
+             mp_error_to_string(result));
+      return EXIT_FAILURE;
+   \}
+  
+   /* c now equals a^3 mod b */
+
+   return EXIT_SUCCESS;
+\}
+\end{alltt} 
+
+This program will calculate $a^3 \mbox{ mod }b$ if all the functions succeed.  
+
 \section{Montgomery Reduction}
+
+Montgomery is a specialized reduction algorithm for any odd moduli.  Like Barrett reduction a pre--computation
+step is required.  This is accomplished with the following.
+
+\index{mp\_montgomery\_setup}
+\begin{alltt}
+int mp_montgomery_setup(mp_int *a, mp_digit *mp);
+\end{alltt}
+
+For the given odd moduli $a$ the precomputation value is placed in $mp$.  The reduction is computed with the 
+following.
+
+\index{mp\_montgomery\_reduce}
+\begin{alltt}
+int mp_montgomery_reduce(mp_int *a, mp_int *m, mp_digit mp);
+\end{alltt}
+This reduces $a$ in place modulo $m$ with the pre--computed value $mp$.   $a$ must be in the range
+$0 \le a < b^2$.
+
+Montgomery reduction is faster than Barrett reduction for moduli smaller than the ``comba'' limit.  With the default
+setup for instance, the limit is $127$ digits ($3556$--bits).   Note that this function is not limited to
+$127$ digits just that it falls back to a baseline algorithm after that point.  
+
+An important observation is that this reduction does not return $a \mbox{ mod }m$ but $aR^{-1} \mbox{ mod }m$ 
+where $R = \beta^n$, $n$ is the n number of digits in $m$ and $\beta$ is radix used (default is $2^{28}$).  
+
+To quickly calculate $R$ the following function was provided.
+
+\index{mp\_montgomery\_calc\_normalization}
+\begin{alltt}
+int mp_montgomery_calc_normalization(mp_int *a, mp_int *b);
+\end{alltt}
+Which calculates $a = R$ for the odd moduli $b$ without using multiplication or division.  
+
+The normal modus operandi for Montgomery reductions is to normalize the integers before entering the system.  For
+example, to calculate $a^3 \mbox { mod }b$ using Montgomery reduction the value of $a$ can be normalized by
+multiplying it by $R$.  Consider the following code snippet.
+
+\begin{alltt}
+int main(void)
+\{
+   mp_int   a, b, c, R;
+   mp_digit mp;
+   int      result;
+
+   /* initialize a,b to desired values, 
+    * mp_init R, c and set c to 1.... 
+    */
+
+   /* get normalization */
+   if ((result = mp_montgomery_calc_normalization(&R, b)) != MP_OKAY) \{
+      printf("Error getting norm.  \%s", 
+             mp_error_to_string(result));
+      return EXIT_FAILURE;
+   \}
+
+   /* get mp value */
+   if ((result = mp_montgomery_setup(&c, &mp)) != MP_OKAY) \{
+      printf("Error setting up montgomery.  \%s", 
+             mp_error_to_string(result));
+      return EXIT_FAILURE;
+   \}
+
+   /* normalize `a' so now a is equal to aR */
+   if ((result = mp_mulmod(&a, &R, &b, &a)) != MP_OKAY) \{
+      printf("Error computing aR.  \%s", 
+             mp_error_to_string(result));
+      return EXIT_FAILURE;
+   \}
+
+   /* square a to get c = a^2R^2 */
+   if ((result = mp_sqr(&a, &c)) != MP_OKAY) \{
+      printf("Error squaring.  \%s", 
+             mp_error_to_string(result));
+      return EXIT_FAILURE;
+   \}
+
+   /* now reduce `c' back down to c = a^2R^2 * R^-1 == a^2R */
+   if ((result = mp_montgomery_reduce(&c, &b, mp)) != MP_OKAY) \{
+      printf("Error reducing.  \%s", 
+             mp_error_to_string(result));
+      return EXIT_FAILURE;
+   \}
+   
+   /* multiply a to get c = a^3R^2 */
+   if ((result = mp_mul(&a, &c, &c)) != MP_OKAY) \{
+      printf("Error reducing.  \%s", 
+             mp_error_to_string(result));
+      return EXIT_FAILURE;
+   \}
+
+   /* now reduce `c' back down to c = a^3R^2 * R^-1 == a^3R */
+   if ((result = mp_montgomery_reduce(&c, &b, mp)) != MP_OKAY) \{
+      printf("Error reducing.  \%s", 
+             mp_error_to_string(result));
+      return EXIT_FAILURE;
+   \}
+   
+   /* now reduce (again) `c' back down to c = a^3R * R^-1 == a^3 */
+   if ((result = mp_montgomery_reduce(&c, &b, mp)) != MP_OKAY) \{
+      printf("Error reducing.  \%s", 
+             mp_error_to_string(result));
+      return EXIT_FAILURE;
+   \}
+
+   /* c now equals a^3 mod b */
+
+   return EXIT_SUCCESS;
+\}
+\end{alltt} 
+
+This particular example does not look too efficient but it demonstrates the point of the algorithm.  By 
+normalizing the inputs the reduced results are always of the form $aR$ for some variable $a$.  This allows
+a single final reduction to correct for the normalization and the fast reduction used within the algorithm.
+
+For more details consider examining the file \textit{bn\_mp\_exptmod\_fast.c}.
+
 \section{Restricted Dimminished Radix}
+
+``Dimminished Radix'' reduction refers to reduction with respect to moduli that are ameniable to simple
+digit shifting and small multiplications.  In this case the ``restricted'' variant refers to moduli of the
+form $\beta^k - p$ for some $k \ge 0$ and $0 < p < \beta$ where $\beta$ is the radix (default to $2^{28}$).  
+
+As in the case of Montgomery reduction there is a pre--computation phase required for a given modulus.
+
+\index{mp\_dr\_setup}
+\begin{alltt}
+void mp_dr_setup(mp_int *a, mp_digit *d);
+\end{alltt}
+
+This computes the value required for the modulus $a$ and stores it in $d$.  This function cannot fail
+and does not return any error codes.  After the pre--computation a reduction can be performed with the
+following.
+
+\index{mp\_dr\_reduce}
+\begin{alltt}
+int mp_dr_reduce(mp_int *a, mp_int *b, mp_digit mp);
+\end{alltt}
+
+This reduces $a$ in place modulo $b$ with the pre--computed value $mp$.  $b$ must be of a restricted
+dimminished radix form and $a$ must be in the range $0 \le a < b^2$.  Dimminished radix reductions are 
+much faster than both Barrett and Montgomery reductions as they have a much lower asymtotic running time.  
+
+Since the moduli are restricted this algorithm is not particularly useful for something like Rabin, RSA or
+BBS cryptographic purposes.  This reduction algorithm is useful for Diffie-Hellman and ECC where fixed
+primes are acceptable.  
+
+Note that unlike Montgomery reduction there is no normalization process.  The result of this function is
+equal to the correct residue.
+
 \section{Unrestricted Dimminshed Radix}

+Unrestricted reductions work much like the restricted counterparts except in this case the moduli is of the 
+form $2^k - p$ for $0 < p < \beta$.  In this sense the unrestricted reductions are more flexible as they 
+can be applied to a wider range of numbers.  
+
+\index{mp\_reduce\_2k\_setup}
+\begin{alltt}
+int mp_reduce_2k_setup(mp_int *a, mp_digit *d);
+\end{alltt}
+
+This will compute the required $d$ value for the given moduli $a$.  
+
+\index{mp\_reduce\_2k}
+\begin{alltt}
+int mp_reduce_2k(mp_int *a, mp_int *n, mp_digit d);
+\end{alltt}
+
+This will reduce $a$ in place modulo $n$ with the pre--computed value $d$.  From my experience this routine is 
+slower than mp\_dr\_reduce but faster for most moduli sizes than the Montgomery reduction.  
+
 \chapter{Exponentiation}
 \section{Single Digit Exponentiation}
 \index{mp\_expt\_d}
 \begin{alltt}
 int mp_expt_d (mp_int * a, mp_digit b, mp_int * c)
 \end{alltt}
-This computes $c = a^b$ using a simple binary left-to-write algorithm.  It is faster than repeated multiplications for
-all values of $b$ greater than three.  
+This computes $c = a^b$ using a simple binary left-to-right algorithm.  It is faster than repeated multiplications by 
+$a$ for all values of $b$ greater than three.  

 \section{Modular Exponentiation}
 \index{mp\_exptmod}
@ -1077,7 +1460,12 @@ int mp_exptmod (mp_int * G, mp_int * X, mp_int * P, mp_int * Y)
 This computes $Y \equiv G^X \mbox{ (mod }P\mbox{)}$ using a variable width sliding window algorithm.  This function
 will automatically detect the fastest modular reduction technique to use during the operation.  For negative values of 
 $X$ the operation is performed as $Y \equiv (G^{-1} \mbox{ mod }P)^{\vert X \vert} \mbox{ (mod }P\mbox{)}$ provided that 
-$gcd(G, P) = 1$.  
+$gcd(G, P) = 1$.
+
+This function is actually a shell around the two internal exponentiation functions.  This routine will automatically
+detect when Barrett, Montgomery, Restricted and Unrestricted Dimminished Radix based exponentiation can be used.  Generally
+moduli of the a ``restricted dimminished radix'' form lead to the fastest modular exponentiations.  Followed by Montgomery
+and the other two algorithms.

 \section{Root Finding}
 \index{mp\_n\_root}
@ -1090,6 +1478,11 @@ numbers (less than 1000 bits) I'd avoid $b > 3$ situations.  Will return a posit
 a root with the sign of the input for odd roots.  For example, performing $4^{1/2}$ will return $2$ whereas $(-8)^{1/3}$ 
 will return $-2$.  

+This algorithm uses the ``Newton Approximation'' method and will converge on the correct root fairly quickly.  Since
+the algorithm requires raising $a$ to the power of $b$ it is not ideal to attempt to find roots for large
+values of $b$.  If particularly large roots are required then a factor method could be used instead.  For example,
+$a^{1/16}$ is equivalent to $\left (a^{1/4} \right)^{1/4}$.
+
 \chapter{Prime Numbers}
 \section{Trial Division}
 \index{mp\_prime\_is\_divisible}
@ -1168,10 +1561,44 @@ typedef int ltm_prime_callback(unsigned char *dst, int len, void *dat);
 \end{alltt}

 Which is a function that must read $len$ bytes (and return the amount stored) into $dst$.  The $dat$ variable is simply
-copied from the original input.  It can be used to pass RNG context data to the callback.  
+copied from the original input.  It can be used to pass RNG context data to the callback.  The function 
+mp\_prime\_random() is more suitable for generating primes which must be secret (as in the case of RSA) since there 
+is no skew on the least significant bits.

-The function mp\_prime\_random() is more suitable for generating primes which must be secret (as in the case of RSA) since
-there is no skew on the least significant bits.
+\textit{Note:}  As of v0.30 of the LibTomMath library this function has been deprecated.  It is still available
+but users are encouraged to use the new mp\_prime\_random\_ex() function instead.
+
+\subsection{Extended Generation}
+\index{mp\_prime\_random\_ex}
+\begin{alltt}
+int mp_prime_random_ex(mp_int *a,    int t, 
+                       int     size, int flags, 
+                       ltm_prime_callback cb, void *dat);
+\end{alltt}
+This will generate a prime in $a$ using $t$ tests of the primality testing algorithms.  The variable $size$
+specifies the bit length of the prime desired.  The variable $flags$ specifies one of several options available
+(see fig. \ref{fig:primeopts}) which can be OR'ed together.  The callback parameters are used as in 
+mp\_prime\_random().
+
+\begin{figure}[here]
+\begin{center}
+\begin{small}
+\begin{tabular}{|r|l|}
+\hline \textbf{Flag}         & \textbf{Meaning} \\
+\hline LTM\_PRIME\_BBS       & Make the prime congruent to $3$ modulo $4$ \\
+\hline LTM\_PRIME\_SAFE      & Make a prime $p$ such that $(p - 1)/2$ is also prime. \\
+                             & This option implies LTM\_PRIME\_BBS as well. \\
+\hline LTM\_PRIME\_2MSB\_OFF & Makes sure that the bit adjacent to the most significant bit \\
+                             & Is forced to zero.  \\
+\hline LTM\_PRIME\_2MSB\_ON  & Makes sure that the bit adjacent to the most significant bit \\
+                             & Is forced to one. \\
+\hline
+\end{tabular}
+\end{small}
+\end{center}
+\caption{Primality Generation Options}
+\label{fig:primeopts}
+\end{figure}

 \chapter{Input and Output}
 \section{ASCII Conversions}
@ -1180,7 +1607,7 @@ there is no skew on the least significant bits.
 \begin{alltt}
 int mp_toradix (mp_int * a, char *str, int radix);
 \end{alltt}
-This still store ``a'' in ``str'' as a base-``radix'' string of ASCII chars.  This function appends a NUL character
+This still store $a$ in ``str'' as a base-``radix'' string of ASCII chars.  This function appends a NUL character
 to terminate the string.  Valid values of ``radix'' line in the range $[2, 64]$.  To determine the size (exact) required
 by the conversion before storing any data use the following function.

@ -1196,12 +1623,46 @@ function returns an error code and ``size'' will be zero.
 \begin{alltt}
 int mp_read_radix (mp_int * a, char *str, int radix);
 \end{alltt}
-This will read the base-``radix'' NUL terminated string from ``str'' into ``a''.  It will stop reading when it reads a
+This will read the base-``radix'' NUL terminated string from ``str'' into $a$.  It will stop reading when it reads a
 character it does not recognize (which happens to include th NUL char... imagine that...).  A single leading $-$ sign
 can be used to denote a negative number.

 \section{Binary Conversions}
-\section{Stream Functions}
+
+Converting an mp\_int to and from binary is another keen idea.
+
+\index{mp\_unsigned\_bin\_size}
+\begin{alltt}
+int mp_unsigned_bin_size(mp_int *a);
+\end{alltt}
+
+This will return the number of bytes (octets) required to store the unsigned copy of the integer $a$.
+
+\index{mp\_to\_unsigned\_bin}
+\begin{alltt}
+int mp_to_unsigned_bin(mp_int *a, unsigned char *b);
+\end{alltt}
+This will store $a$ into the buffer $b$ in big--endian format.  Fortunately this is exactly what DER (or is it ASN?)
+requires.  It does not store the sign of the integer.
+
+\index{mp\_read\_unsigned\_bin}
+\begin{alltt}
+int mp_read_unsigned_bin(mp_int *a, unsigned char *b, int c);
+\end{alltt}
+This will read in an unsigned big--endian array of bytes (octets) from $b$ of length $c$ into $a$.  The resulting
+integer $a$ will always be positive.
+
+For those who acknowledge the existence of negative numbers (heretic!) there are ``signed'' versions of the
+previous functions.
+
+\begin{alltt}
+int mp_signed_bin_size(mp_int *a);
+int mp_read_signed_bin(mp_int *a, unsigned char *b, int c);
+int mp_to_signed_bin(mp_int *a, unsigned char *b);
+\end{alltt}
+They operate essentially the same as the unsigned copies except they prefix the data with zero or non--zero
+byte depending on the sign.  If the sign is zpos (e.g. not negative) the prefix is zero, otherwise the prefix
+is non--zero.  

 \chapter{Algebraic Functions}
 \section{Extended Euclidean Algorithm}
@ -1217,6 +1678,8 @@ This finds the triple U1/U2/U3 using the Extended Euclidean algorithm such that
 a \cdot U1 + b \cdot U2 = U3
 \end{equation}

+Any of the U1/U2/U3 paramters can be set to \textbf{NULL} if they are not desired.  
+
 \section{Greatest Common Divisor}
 \index{mp\_gcd}
 \begin{alltt}
@ -1248,10 +1711,23 @@ int mp_invmod (mp_int * a, mp_int * b, mp_int * c)
 \end{alltt}
 Computes the multiplicative inverse of $a$ modulo $b$ and stores the result in $c$ such that $ac \equiv 1 \mbox{ (mod }b\mbox{)}$.

-
-
 \section{Single Digit Functions}

+For those using small numbers (\textit{snicker snicker}) there are several ``helper'' functions
+
+\index{mp\_add\_d} \index{mp\_sub\_d} \index{mp\_mul\_d} \index{mp\_div\_d} \index{mp\_mod\_d}
+\begin{alltt}
+int mp_add_d(mp_int *a, mp_digit b, mp_int *c);
+int mp_sub_d(mp_int *a, mp_digit b, mp_int *c);
+int mp_mul_d(mp_int *a, mp_digit b, mp_int *c);
+int mp_div_d(mp_int *a, mp_digit b, mp_int *c, mp_digit *d);
+int mp_mod_d(mp_int *a, mp_digit b, mp_digit *c);
+\end{alltt}
+
+These work like the full mp\_int capable variants except the second parameter $b$ is a mp\_digit.  These
+functions fairly handy if you have to work with relatively small numbers since you will not have to allocate
+an entire mp\_int to store a number like $1$ or $2$.
+
 \input{bn.ind}

 \end{document}
--- a/bn_fast_s_mp_sqr.c
+++ b/bn_fast_s_mp_sqr.c
@ -31,8 +31,7 @@
 * Based on Algorithm 14.16 on pp.597 of HAC.
 *
 */
-int
-fast_s_mp_sqr (mp_int * a, mp_int * b)
+int fast_s_mp_sqr (mp_int * a, mp_int * b)
 {
  int     olduse, newused, res, ix, pa;
  mp_word W2[MP_WARRAY], W[MP_WARRAY];
--- a/bn_mp_cnt_lsb.c
+++ b/bn_mp_cnt_lsb.c
@ -14,11 +14,15 @@
 */
 #include <tommath.h>

+static const int lnz[16] = { 
+   4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0
+};
+
 /* Counts the number of lsbs which are zero before the first zero bit */
 int mp_cnt_lsb(mp_int *a)
 {
   int x;
-   mp_digit q;
+   mp_digit q, qq;

   /* easy out */
   if (mp_iszero(a) == 1) {
@ -31,11 +35,13 @@ int mp_cnt_lsb(mp_int *a)
   x *= DIGIT_BIT;

   /* now scan this digit until a 1 is found */
-   while ((q & 1) == 0) {
-      q >>= 1;
-      x  += 1;
+   if ((q & 1) == 0) {
+      do {
+         qq  = q & 15;
+         x  += lnz[qq];
+         q >>= 4;
+      } while (qq == 0);
   }
-
   return x;
 }

--- a/bn_mp_exteuclid.c
+++ b/bn_mp_exteuclid.c
@ -1,69 +1,69 @@
-/* LibTomMath, multiple-precision integer library -- Tom St Denis
- *
- * LibTomMath is a library that provides multiple-precision
- * integer arithmetic as well as number theoretic functionality.
- *
- * The library was designed directly after the MPI library by
- * Michael Fromberger but has been written from scratch with
- * additional optimizations in place.
- *
- * The library is free for all purposes without any express
- * guarantee it works.
- *
- * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
- */
-#include <tommath.h>
-
-/* Extended euclidean algorithm of (a, b) produces 
-   a*u1 + b*u2 = u3
- */
-int mp_exteuclid(mp_int *a, mp_int *b, mp_int *U1, mp_int *U2, mp_int *U3)
-{
-   mp_int u1,u2,u3,v1,v2,v3,t1,t2,t3,q,tmp;
-   int err;
-
-   if ((err = mp_init_multi(&u1, &u2, &u3, &v1, &v2, &v3, &t1, &t2, &t3, &q, &tmp, NULL)) != MP_OKAY) {
-      return err;
-   }
-
-   /* initialize, (u1,u2,u3) = (1,0,a) */
-   mp_set(&u1, 1);
-   if ((err = mp_copy(a, &u3)) != MP_OKAY)                                        { goto _ERR; }
-
-   /* initialize, (v1,v2,v3) = (0,1,b) */
-   mp_set(&v2, 1);
-   if ((err = mp_copy(b, &v3)) != MP_OKAY)                                        { goto _ERR; }
-
-   /* loop while v3 != 0 */
-   while (mp_iszero(&v3) == MP_NO) {
-       /* q = u3/v3 */
-       if ((err = mp_div(&u3, &v3, &q, NULL)) != MP_OKAY)                         { goto _ERR; }
-
-       /* (t1,t2,t3) = (u1,u2,u3) - (v1,v2,v3)q */
-       if ((err = mp_mul(&v1, &q, &tmp)) != MP_OKAY)                              { goto _ERR; }
-       if ((err = mp_sub(&u1, &tmp, &t1)) != MP_OKAY)                             { goto _ERR; }
-       if ((err = mp_mul(&v2, &q, &tmp)) != MP_OKAY)                              { goto _ERR; }
-       if ((err = mp_sub(&u2, &tmp, &t2)) != MP_OKAY)                             { goto _ERR; }
-       if ((err = mp_mul(&v3, &q, &tmp)) != MP_OKAY)                              { goto _ERR; }
-       if ((err = mp_sub(&u3, &tmp, &t3)) != MP_OKAY)                             { goto _ERR; }
-
-       /* (u1,u2,u3) = (v1,v2,v3) */
-       if ((err = mp_copy(&v1, &u1)) != MP_OKAY)                                  { goto _ERR; }
-       if ((err = mp_copy(&v2, &u2)) != MP_OKAY)                                  { goto _ERR; }
-       if ((err = mp_copy(&v3, &u3)) != MP_OKAY)                                  { goto _ERR; }
-
-       /* (v1,v2,v3) = (t1,t2,t3) */
-       if ((err = mp_copy(&t1, &v1)) != MP_OKAY)                                  { goto _ERR; }
-       if ((err = mp_copy(&t2, &v2)) != MP_OKAY)                                  { goto _ERR; }
-       if ((err = mp_copy(&t3, &v3)) != MP_OKAY)                                  { goto _ERR; }
-   }
-
-   /* copy result out */
-   if (U1 != NULL) { mp_exch(U1, &u1); }
-   if (U2 != NULL) { mp_exch(U2, &u2); }
-   if (U3 != NULL) { mp_exch(U3, &u3); }
-
-   err = MP_OKAY;
-_ERR: mp_clear_multi(&u1, &u2, &u3, &v1, &v2, &v3, &t1, &t2, &t3, &q, &tmp, NULL);
-   return err;
-}
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is a library that provides multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library was designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+#include <tommath.h>
+
+/* Extended euclidean algorithm of (a, b) produces 
+   a*u1 + b*u2 = u3
+ */
+int mp_exteuclid(mp_int *a, mp_int *b, mp_int *U1, mp_int *U2, mp_int *U3)
+{
+   mp_int u1,u2,u3,v1,v2,v3,t1,t2,t3,q,tmp;
+   int err;
+
+   if ((err = mp_init_multi(&u1, &u2, &u3, &v1, &v2, &v3, &t1, &t2, &t3, &q, &tmp, NULL)) != MP_OKAY) {
+      return err;
+   }
+
+   /* initialize, (u1,u2,u3) = (1,0,a) */
+   mp_set(&u1, 1);
+   if ((err = mp_copy(a, &u3)) != MP_OKAY)                                        { goto _ERR; }
+
+   /* initialize, (v1,v2,v3) = (0,1,b) */
+   mp_set(&v2, 1);
+   if ((err = mp_copy(b, &v3)) != MP_OKAY)                                        { goto _ERR; }
+
+   /* loop while v3 != 0 */
+   while (mp_iszero(&v3) == MP_NO) {
+       /* q = u3/v3 */
+       if ((err = mp_div(&u3, &v3, &q, NULL)) != MP_OKAY)                         { goto _ERR; }
+
+       /* (t1,t2,t3) = (u1,u2,u3) - (v1,v2,v3)q */
+       if ((err = mp_mul(&v1, &q, &tmp)) != MP_OKAY)                              { goto _ERR; }
+       if ((err = mp_sub(&u1, &tmp, &t1)) != MP_OKAY)                             { goto _ERR; }
+       if ((err = mp_mul(&v2, &q, &tmp)) != MP_OKAY)                              { goto _ERR; }
+       if ((err = mp_sub(&u2, &tmp, &t2)) != MP_OKAY)                             { goto _ERR; }
+       if ((err = mp_mul(&v3, &q, &tmp)) != MP_OKAY)                              { goto _ERR; }
+       if ((err = mp_sub(&u3, &tmp, &t3)) != MP_OKAY)                             { goto _ERR; }
+
+       /* (u1,u2,u3) = (v1,v2,v3) */
+       if ((err = mp_copy(&v1, &u1)) != MP_OKAY)                                  { goto _ERR; }
+       if ((err = mp_copy(&v2, &u2)) != MP_OKAY)                                  { goto _ERR; }
+       if ((err = mp_copy(&v3, &u3)) != MP_OKAY)                                  { goto _ERR; }
+
+       /* (v1,v2,v3) = (t1,t2,t3) */
+       if ((err = mp_copy(&t1, &v1)) != MP_OKAY)                                  { goto _ERR; }
+       if ((err = mp_copy(&t2, &v2)) != MP_OKAY)                                  { goto _ERR; }
+       if ((err = mp_copy(&t3, &v3)) != MP_OKAY)                                  { goto _ERR; }
+   }
+
+   /* copy result out */
+   if (U1 != NULL) { mp_exch(U1, &u1); }
+   if (U2 != NULL) { mp_exch(U2, &u2); }
+   if (U3 != NULL) { mp_exch(U3, &u3); }
+
+   err = MP_OKAY;
+_ERR: mp_clear_multi(&u1, &u2, &u3, &v1, &v2, &v3, &t1, &t2, &t3, &q, &tmp, NULL);
+   return err;
+}
--- a/bn_mp_fwrite.c
+++ b/bn_mp_fwrite.c
@ -23,7 +23,7 @@ int mp_fwrite(mp_int *a, int radix, FILE *stream)
      return err;
   }

-   buf = XMALLOC (len);
+   buf = OPT_CAST(char) XMALLOC (len);
   if (buf == NULL) {
      return MP_MEM;
   }
--- a/bn_mp_get_int.c
+++ b/bn_mp_get_int.c
@ -0,0 +1,39 @@
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is a library that provides multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library was designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+#include <tommath.h>
+
+/* get the lower 32-bits of an mp_int */
+unsigned long mp_get_int(mp_int * a) 
+{
+  int i;
+  unsigned long res;
+
+  if (a->used == 0) {
+     return 0;
+  }
+
+  /* get number of digits of the lsb we have to read */
+  i = MIN(a->used,(int)((sizeof(unsigned long)*CHAR_BIT+DIGIT_BIT-1)/DIGIT_BIT))-1;
+
+  /* get most significant digit of result */
+  res = DIGIT(a,i);
+   
+  while (--i >= 0) {
+    res = (res << DIGIT_BIT) | DIGIT(a,i);
+  }
+
+  /* force result to 32-bits always so it is consistent on non 32-bit platforms */
+  return res & 0xFFFFFFFFUL;
+}
--- a/bn_mp_grow.c
+++ b/bn_mp_grow.c
@ -31,7 +31,7 @@ int mp_grow (mp_int * a, int size)
     * in case the operation failed we don't want
     * to overwrite the dp member of a.
     */
-    tmp = OPT_CAST XREALLOC (a->dp, sizeof (mp_digit) * size);
+    tmp = OPT_CAST(mp_digit) XREALLOC (a->dp, sizeof (mp_digit) * size);
    if (tmp == NULL) {
      /* reallocation failed but "a" is still valid [can be freed] */
      return MP_MEM;
--- a/bn_mp_init.c
+++ b/bn_mp_init.c
@ -18,7 +18,7 @@
 int mp_init (mp_int * a)
 {
  /* allocate memory required and clear it */
-  a->dp = OPT_CAST XCALLOC (sizeof (mp_digit), MP_PREC);
+  a->dp = OPT_CAST(mp_digit) XCALLOC (sizeof (mp_digit), MP_PREC);
  if (a->dp == NULL) {
    return MP_MEM;
  }
--- a/bn_mp_init_set.c
+++ b/bn_mp_init_set.c
@ -0,0 +1,26 @@
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is a library that provides multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library was designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+#include <tommath.h>
+
+/* initialize and set a digit */
+int mp_init_set (mp_int * a, mp_digit b)
+{
+  int err;
+  if ((err = mp_init(a)) != MP_OKAY) {
+     return err;
+  }
+  mp_set(a, b);
+  return err;
+}
--- a/bn_mp_init_set_int.c
+++ b/bn_mp_init_set_int.c
@ -0,0 +1,25 @@
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is a library that provides multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library was designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+#include <tommath.h>
+
+/* initialize and set a digit */
+int mp_init_set_int (mp_int * a, unsigned long b)
+{
+  int err;
+  if ((err = mp_init(a)) != MP_OKAY) {
+     return err;
+  }
+  return mp_set_int(a, b);
+}
--- a/bn_mp_init_size.c
+++ b/bn_mp_init_size.c
@ -21,7 +21,7 @@ int mp_init_size (mp_int * a, int size)
  size += (MP_PREC * 2) - (size % MP_PREC);	
  
  /* alloc mem */
-  a->dp = OPT_CAST XCALLOC (sizeof (mp_digit), size);
+  a->dp = OPT_CAST(mp_digit) XCALLOC (sizeof (mp_digit), size);
  if (a->dp == NULL) {
    return MP_MEM;
  }
--- a/bn_mp_is_square.c
+++ b/bn_mp_is_square.c
@ -0,0 +1,103 @@
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is a library that provides multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library was designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+#include <tommath.h>
+
+/* Check if remainders are possible squares - fast exclude non-squares */
+static const char rem_128[128] = {
+ 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1,
+ 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1,
+ 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1,
+ 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1,
+ 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1,
+ 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1,
+ 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1,
+ 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1
+};
+
+static const char rem_105[105] = {
+ 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1,
+ 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1,
+ 0, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1,
+ 1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1,
+ 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1,
+ 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1,
+ 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1
+};
+
+/* Store non-zero to ret if arg is square, and zero if not */
+int mp_is_square(mp_int *arg,int *ret) 
+{
+  int           res;
+  mp_digit      c;
+  mp_int        t;
+  unsigned long r;
+
+  /* Default to Non-square :) */
+  *ret = MP_NO; 
+
+  if (arg->sign == MP_NEG) {
+    return MP_VAL;
+  }
+
+  /* digits used?  (TSD) */
+  if (arg->used == 0) {
+     return MP_OKAY;
+  }
+
+  /* First check mod 128 (suppose that DIGIT_BIT is at least 7) */
+  if (rem_128[127 & DIGIT(arg,0)] == 1) {
+     return MP_OKAY;
+  }
+
+  /* Next check mod 105 (3*5*7) */
+  if ((res = mp_mod_d(arg,105,&c)) != MP_OKAY) {
+     return res;
+  }
+  if (rem_105[c] == 1) {
+     return MP_OKAY;
+  }
+
+  /* product of primes less than 2^31 */
+  if ((res = mp_init_set_int(&t,11L*13L*17L*19L*23L*29L*31L)) != MP_OKAY) {
+     return res;
+  }
+  if ((res = mp_mod(arg,&t,&t)) != MP_OKAY) {
+     goto ERR;
+  }
+  r = mp_get_int(&t);
+  /* Check for other prime modules, note it's not an ERROR but we must
+   * free "t" so the easiest way is to goto ERR.  We know that res
+   * is already equal to MP_OKAY from the mp_mod call 
+   */ 
+  if ( (1L<<(r%11)) & 0x5C4L )             goto ERR;
+  if ( (1L<<(r%13)) & 0x9E4L )             goto ERR;
+  if ( (1L<<(r%17)) & 0x5CE8L )            goto ERR;
+  if ( (1L<<(r%19)) & 0x4F50CL )           goto ERR;
+  if ( (1L<<(r%23)) & 0x7ACCA0L )          goto ERR;
+  if ( (1L<<(r%29)) & 0xC2EDD0CL )         goto ERR;
+  if ( (1L<<(r%31)) & 0x6DE2B848L )        goto ERR;
+
+  /* Final check - is sqr(sqrt(arg)) == arg ? */
+  if ((res = mp_sqrt(arg,&t)) != MP_OKAY) {
+     goto ERR;
+  }
+  if ((res = mp_sqr(&t,&t)) != MP_OKAY) {
+     goto ERR;
+  }
+
+  *ret = (mp_cmp_mag(&t,arg) == MP_EQ) ? MP_YES : MP_NO;
+ERR:mp_clear(&t);
+  return res;
+}
--- a/bn_mp_karatsuba_mul.c
+++ b/bn_mp_karatsuba_mul.c
@ -43,8 +43,7 @@
 * Generally though the overhead of this method doesn't pay off 
 * until a certain size (N ~ 80) is reached.
 */
-int
-mp_karatsuba_mul (mp_int * a, mp_int * b, mp_int * c)
+int mp_karatsuba_mul (mp_int * a, mp_int * b, mp_int * c)
 {
  mp_int  x0, x1, y0, y1, t1, x0y0, x1y1;
  int     B, err;
@ -56,7 +55,7 @@ mp_karatsuba_mul (mp_int * a, mp_int * b, mp_int * c)
  B = MIN (a->used, b->used);

  /* now divide in two */
-  B = B / 2;
+  B = B >> 1;

  /* init copy all the temps */
  if (mp_init_size (&x0, B) != MP_OKAY)
--- a/bn_mp_karatsuba_sqr.c
+++ b/bn_mp_karatsuba_sqr.c
@ -21,8 +21,7 @@
 * is essentially the same algorithm but merely 
 * tuned to perform recursive squarings.
 */
-int
-mp_karatsuba_sqr (mp_int * a, mp_int * b)
+int mp_karatsuba_sqr (mp_int * a, mp_int * b)
 {
  mp_int  x0, x1, t1, t2, x0x0, x1x1;
  int     B, err;
@ -33,7 +32,7 @@ mp_karatsuba_sqr (mp_int * a, mp_int * b)
  B = a->used;

  /* now divide in two */
-  B = B / 2;
+  B = B >> 1;

  /* init copy all the temps */
  if (mp_init_size (&x0, B) != MP_OKAY)
--- a/bn_mp_mod.c
+++ b/bn_mp_mod.c
@ -21,7 +21,6 @@ mp_mod (mp_int * a, mp_int * b, mp_int * c)
  mp_int  t;
  int     res;

-
  if ((res = mp_init (&t)) != MP_OKAY) {
    return res;
  }
@ -31,7 +30,7 @@ mp_mod (mp_int * a, mp_int * b, mp_int * c)
    return res;
  }

-  if (t.sign == MP_NEG) {
+  if (t.sign != b->sign) {
    res = mp_add (b, &t, c);
  } else {
    res = MP_OKAY;
--- a/bn_mp_n_root.c
+++ b/bn_mp_n_root.c
@ -101,7 +101,7 @@ int mp_n_root (mp_int * a, mp_digit b, mp_int * c)

    if (mp_cmp (&t2, a) == MP_GT) {
      if ((res = mp_sub_d (&t1, 1, &t1)) != MP_OKAY) {
-    goto __T3;
+         goto __T3;
      }
    } else {
      break;
--- a/bn_mp_neg.c
+++ b/bn_mp_neg.c
@ -21,7 +21,7 @@ int mp_neg (mp_int * a, mp_int * b)
  if ((res = mp_copy (a, b)) != MP_OKAY) {
    return res;
  }
-  if (mp_iszero(b) != 1) {
+  if (mp_iszero(b) != MP_YES) {
     b->sign = (a->sign == MP_ZPOS) ? MP_NEG : MP_ZPOS;
  }
  return MP_OKAY;
--- a/bn_mp_prime_random.c
+++ b/bn_mp_prime_random.c
@ -1,74 +0,0 @@
-/* LibTomMath, multiple-precision integer library -- Tom St Denis
- *
- * LibTomMath is a library that provides multiple-precision
- * integer arithmetic as well as number theoretic functionality.
- *
- * The library was designed directly after the MPI library by
- * Michael Fromberger but has been written from scratch with
- * additional optimizations in place.
- *
- * The library is free for all purposes without any express
- * guarantee it works.
- *
- * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
- */
-#include <tommath.h>
-
-/* makes a truly random prime of a given size (bytes),
- * call with bbs = 1 if you want it to be congruent to 3 mod 4 
- *
- * You have to supply a callback which fills in a buffer with random bytes.  "dat" is a parameter you can
- * have passed to the callback (e.g. a state or something).  This function doesn't use "dat" itself
- * so it can be NULL
- *
- * The prime generated will be larger than 2^(8*size).
- */
-
-/* this sole function may hold the key to enslaving all mankind! */
-int mp_prime_random(mp_int *a, int t, int size, int bbs, ltm_prime_callback cb, void *dat)
-{
-   unsigned char *tmp;
-   int res, err;
-
-   /* sanity check the input */
-   if (size <= 0) {
-      return MP_VAL;
-   }
-
-   /* we need a buffer of size+1 bytes */
-   tmp = XMALLOC(size+1);
-   if (tmp == NULL) {
-      return MP_MEM;
-   }
-
-   /* fix MSB */
-   tmp[0] = 1;
-
-   do {
-      /* read the bytes */
-      if (cb(tmp+1, size, dat) != size) {
-         err = MP_VAL;
-         goto error;
-      }
- 
-      /* fix the LSB */
-      tmp[size] |= (bbs ? 3 : 1);
-
-      /* read it in */
-      if ((err = mp_read_unsigned_bin(a, tmp, size+1)) != MP_OKAY) {
-         goto error;
-      }
-
-      /* is it prime? */
-      if ((err = mp_prime_is_prime(a, t, &res)) != MP_OKAY) {
-         goto error;
-      }
-   } while (res == MP_NO);
-
-   err = MP_OKAY;
-error:
-   XFREE(tmp);
-   return err;
-}
-
-
--- a/bn_mp_prime_random_ex.c
+++ b/bn_mp_prime_random_ex.c
@ -0,0 +1,118 @@
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is a library that provides multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library was designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+#include <tommath.h>
+
+/* makes a truly random prime of a given size (bits),
+ *
+ * Flags are as follows:
+ * 
+ *   LTM_PRIME_BBS      - make prime congruent to 3 mod 4
+ *   LTM_PRIME_SAFE     - make sure (p-1)/2 is prime as well (implies LTM_PRIME_BBS)
+ *   LTM_PRIME_2MSB_OFF - make the 2nd highest bit zero
+ *   LTM_PRIME_2MSB_ON  - make the 2nd highest bit one
+ *
+ * You have to supply a callback which fills in a buffer with random bytes.  "dat" is a parameter you can
+ * have passed to the callback (e.g. a state or something).  This function doesn't use "dat" itself
+ * so it can be NULL
+ *
+ */
+
+/* This is possibly the mother of all prime generation functions, muahahahahaha! */
+int mp_prime_random_ex(mp_int *a, int t, int size, int flags, ltm_prime_callback cb, void *dat)
+{
+   unsigned char *tmp, maskAND, maskOR_msb, maskOR_lsb;
+   int res, err, bsize, maskOR_msb_offset;
+
+   /* sanity check the input */
+   if (size <= 1 || t <= 0) {
+      return MP_VAL;
+   }
+
+   /* LTM_PRIME_SAFE implies LTM_PRIME_BBS */
+   if (flags & LTM_PRIME_SAFE) {
+      flags |= LTM_PRIME_BBS;
+   }
+
+   /* calc the byte size */
+   bsize = (size>>3)+(size&7?1:0);
+
+   /* we need a buffer of bsize bytes */
+   tmp = OPT_CAST(unsigned char) XMALLOC(bsize);
+   if (tmp == NULL) {
+      return MP_MEM;
+   }
+
+   /* calc the maskAND value for the MSbyte*/
+   maskAND = 0xFF >> (8 - (size & 7));
+
+   /* calc the maskOR_msb */
+   maskOR_msb        = 0;
+   maskOR_msb_offset = (size - 2) >> 3;
+   if (flags & LTM_PRIME_2MSB_ON) {
+      maskOR_msb     |= 1 << ((size - 2) & 7);
+   } else if (flags & LTM_PRIME_2MSB_OFF) {
+      maskAND        &= ~(1 << ((size - 2) & 7));
+   }
+
+   /* get the maskOR_lsb */
+   maskOR_lsb         = 0;
+   if (flags & LTM_PRIME_BBS) {
+      maskOR_lsb     |= 3;
+   }
+
+   do {
+      /* read the bytes */
+      if (cb(tmp, bsize, dat) != bsize) {
+         err = MP_VAL;
+         goto error;
+      }
+ 
+      /* work over the MSbyte */
+      tmp[0]    &= maskAND;
+      tmp[0]    |= 1 << ((size - 1) & 7);
+
+      /* mix in the maskORs */
+      tmp[maskOR_msb_offset]   |= maskOR_msb;
+      tmp[bsize-1]             |= maskOR_lsb;
+
+      /* read it in */
+      if ((err = mp_read_unsigned_bin(a, tmp, bsize)) != MP_OKAY)     { goto error; }
+
+      /* is it prime? */
+      if ((err = mp_prime_is_prime(a, t, &res)) != MP_OKAY)           { goto error; }
+
+      if (flags & LTM_PRIME_SAFE) {
+         /* see if (a-1)/2 is prime */
+         if ((err = mp_sub_d(a, 1, a)) != MP_OKAY)                    { goto error; }
+         if ((err = mp_div_2(a, a)) != MP_OKAY)                       { goto error; }
+ 
+         /* is it prime? */
+         if ((err = mp_prime_is_prime(a, t, &res)) != MP_OKAY)        { goto error; }
+      }
+   } while (res == MP_NO);
+
+   if (flags & LTM_PRIME_SAFE) {
+      /* restore a to the original value */
+      if ((err = mp_mul_2(a, a)) != MP_OKAY)                          { goto error; }
+      if ((err = mp_add_d(a, 1, a)) != MP_OKAY)                       { goto error; }
+   }
+
+   err = MP_OKAY;
+error:
+   XFREE(tmp);
+   return err;
+}
+
+
--- a/bn_mp_reduce_2k.c
+++ b/bn_mp_reduce_2k.c
@ -14,9 +14,9 @@
 */
 #include <tommath.h>

-/* reduces a modulo n where n is of the form 2**p - k */
+/* reduces a modulo n where n is of the form 2**p - d */
 int
-mp_reduce_2k(mp_int *a, mp_int *n, mp_digit k)
+mp_reduce_2k(mp_int *a, mp_int *n, mp_digit d)
 {
   mp_int q;
   int    p, res;
@ -32,9 +32,9 @@ top:
      goto ERR;
   }
   
-   if (k != 1) {
-      /* q = q * k */
-      if ((res = mp_mul_d(&q, k, &q)) != MP_OKAY) { 
+   if (d != 1) {
+      /* q = q * d */
+      if ((res = mp_mul_d(&q, d, &q)) != MP_OKAY) { 
         goto ERR;
      }
   }
--- a/bn_mp_shrink.c
+++ b/bn_mp_shrink.c
@ -19,7 +19,7 @@ int mp_shrink (mp_int * a)
 {
  mp_digit *tmp;
  if (a->alloc != a->used && a->used > 0) {
-    if ((tmp = OPT_CAST XREALLOC (a->dp, sizeof (mp_digit) * a->used)) == NULL) {
+    if ((tmp = OPT_CAST(mp_digit) XREALLOC (a->dp, sizeof (mp_digit) * a->used)) == NULL) {
      return MP_MEM;
    }
    a->dp    = tmp;
--- a/bn_mp_sqrt.c
+++ b/bn_mp_sqrt.c
@ -0,0 +1,75 @@
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is a library that provides multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library was designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+#include <tommath.h>
+
+/* this function is less generic than mp_n_root, simpler and faster */
+int mp_sqrt(mp_int *arg, mp_int *ret) 
+{
+  int res;
+  mp_int t1,t2;
+
+  /* must be positive */
+  if (arg->sign == MP_NEG) {
+    return MP_VAL;
+  }
+
+  /* easy out */
+  if (mp_iszero(arg) == MP_YES) {
+    mp_zero(ret);
+    return MP_OKAY;
+  }
+
+  if ((res = mp_init_copy(&t1, arg)) != MP_OKAY) {
+    return res;
+  }
+
+  if ((res = mp_init(&t2)) != MP_OKAY) {
+    goto E2;
+  }
+
+  /* First approx. (not very bad for large arg) */
+  mp_rshd (&t1,t1.used/2);
+
+  /* t1 > 0  */ 
+  if ((res = mp_div(arg,&t1,&t2,NULL)) != MP_OKAY) {
+    goto E1;
+  }
+  if ((res = mp_add(&t1,&t2,&t1)) != MP_OKAY) {
+    goto E1;
+  }
+  if ((res = mp_div_2(&t1,&t1)) != MP_OKAY) {
+    goto E1;
+  }
+  /* And now t1 > sqrt(arg) */
+  do { 
+    if ((res = mp_div(arg,&t1,&t2,NULL)) != MP_OKAY) {
+      goto E1;
+    }
+    if ((res = mp_add(&t1,&t2,&t1)) != MP_OKAY) {
+      goto E1;
+    }
+    if ((res = mp_div_2(&t1,&t1)) != MP_OKAY) {
+      goto E1;
+    }
+    /* t1 >= sqrt(arg) >= t2 at this point */
+  } while (mp_cmp_mag(&t1,&t2) == MP_GT);
+
+  mp_exch(&t1,ret);
+
+E1: mp_clear(&t2);
+E2: mp_clear(&t1);
+  return res;
+}
+
--- a/bn_mp_toom_mul.c
+++ b/bn_mp_toom_mul.c
@ -15,8 +15,7 @@
 #include <tommath.h>

 /* multiplication using the Toom-Cook 3-way algorithm */
-int 
-mp_toom_mul(mp_int *a, mp_int *b, mp_int *c)
+int mp_toom_mul(mp_int *a, mp_int *b, mp_int *c)
 {
    mp_int w0, w1, w2, w3, w4, tmp1, tmp2, a0, a1, a2, b0, b1, b2;
    int res, B;
--- a/bn_mp_toradix_n.c
+++ b/bn_mp_toradix_n.c
@ -0,0 +1,83 @@
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is a library that provides multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library was designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+#include <tommath.h>
+
+/* stores a bignum as a ASCII string in a given radix (2..64) 
+ *
+ * Stores upto maxlen-1 chars and always a NULL byte 
+ */
+int mp_toradix_n(mp_int * a, char *str, int radix, int maxlen)
+{
+  int     res, digs;
+  mp_int  t;
+  mp_digit d;
+  char   *_s = str;
+
+  /* check range of the maxlen, radix */
+  if (maxlen < 3 || radix < 2 || radix > 64) {
+    return MP_VAL;
+  }
+
+  /* quick out if its zero */
+  if (mp_iszero(a) == 1) {
+     *str++ = '0';
+     *str = '\0';
+     return MP_OKAY;
+  }
+
+  if ((res = mp_init_copy (&t, a)) != MP_OKAY) {
+    return res;
+  }
+
+  /* if it is negative output a - */
+  if (t.sign == MP_NEG) {
+    /* we have to reverse our digits later... but not the - sign!! */
+    ++_s;
+
+    /* store the flag and mark the number as positive */
+    *str++ = '-';
+    t.sign = MP_ZPOS;
+ 
+    /* subtract a char */
+    --maxlen;
+  }
+
+  digs = 0;
+  while (mp_iszero (&t) == 0) {
+    if ((res = mp_div_d (&t, (mp_digit) radix, &t, &d)) != MP_OKAY) {
+      mp_clear (&t);
+      return res;
+    }
+    *str++ = mp_s_rmap[d];
+    ++digs;
+
+    if (--maxlen == 1) {
+       /* no more room */
+       break;
+    }
+  }
+
+  /* reverse the digits of the string.  In this case _s points
+   * to the first digit [exluding the sign] of the number]
+   */
+  bn_reverse ((unsigned char *)_s, digs);
+
+  /* append a NULL so the string is properly terminated */
+  *str = '\0';
+
+  mp_clear (&t);
+  return MP_OKAY;
+}
+
--- a/bn_s_mp_sqr.c
+++ b/bn_s_mp_sqr.c
@ -26,8 +26,8 @@ s_mp_sqr (mp_int * a, mp_int * b)
  pa = a->used;
  if ((res = mp_init_size (&t, 2*pa + 1)) != MP_OKAY) {
    return res;
-  }
-
+  }
+
  /* default used is maximum possible size */
  t.used = 2*pa + 1;

--- a/changes.txt
+++ b/changes.txt
@ -1,13 +1,32 @@
+April 11th, 2004
+v0.30  -- Added "mp_toradix_n" which stores upto "n-1" least significant digits of an mp_int
+       -- Johan Lindh sent a patch so MSVC wouldn't whine about redefining malloc [in weird dll modes]
+       -- Henrik Goldman spotted a missing OPT_CAST in mp_fwrite()
+       -- Tuned tommath.h so that when MP_LOW_MEM is defined MP_PREC shall be reduced.
+          [I also allow MP_PREC to be externally defined now]
+       -- Sped up mp_cnt_lsb() by using a 4x4 table [e.g. 4x speedup]
+       -- Added mp_prime_random_ex() which is a more versatile prime generator accurate to
+          exact bit lengths (unlike the deprecated but still available mp_prime_random() which
+          is only accurate to byte lengths).  See the new LTM_PRIME_* flags ;-)
+       -- Alex Polushin contributed an optimized mp_sqrt() as well as mp_get_int() and mp_is_square().
+          I've cleaned them all up to be a little more consistent [along with one bug fix] for this release.
+       -- Added mp_init_set and mp_init_set_int to initialize and set small constants with one function
+          call.
+       -- Removed /etclib directory [um LibTomPoly deprecates this].
+       -- Fixed mp_mod() so the sign of the result agrees with the sign of the modulus.
+       ++ N.B.  My semester is almost up so expect updates to the textbook to be posted to the libtomcrypt.org 
+          website.  
+
 Jan 25th, 2004
 v0.29  ++ Note: "Henrik" from the v0.28 changelog refers to Henrik Goldman ;-)
       -- Added fix to mp_shrink to prevent a realloc when used == 0 [e.g. realloc zero bytes???]
-       -- Made the mp_prime_rabin_miller_trials() function internal table smaller and also 
+       -- Made the mp_prime_rabin_miller_trials() function internal table smaller and also
          set the minimum number of tests to two (sounds a bit safer).
       -- Added a mp_exteuclid() which computes the extended euclidean algorithm.
       -- Fixed a memory leak in s_mp_exptmod() [called when Barrett reduction is to be used] which would arise
-          if a multiplication or subsequent reduction failed [would not free the temp result].  
-       -- Made an API change to mp_radix_size().  It now returns an error code and stores the required size 
-          through an "int star" passed to it.  
+          if a multiplication or subsequent reduction failed [would not free the temp result].
+       -- Made an API change to mp_radix_size().  It now returns an error code and stores the required size
+          through an "int star" passed to it.

 Dec 24th, 2003
 v0.28  -- Henrik Goldman suggested I add casts to the montomgery code [stores into mu...] so compilers wouldn't
@ -18,17 +37,17 @@ v0.28  -- Henrik Goldman suggested I add casts to the montomgery code [stores in
          (idea from chat with Niels Ferguson at Crypto'03)
       -- Picked up a second wind.  I'm filled with Gooo.  Mission Gooo!
       -- Removed divisions from mp_reduce_is_2k()
-       -- Sped up mp_div_d() [general case] to use only one division per digit instead of two.  
+       -- Sped up mp_div_d() [general case] to use only one division per digit instead of two.
       -- Added the heap macros from LTC to LTM.  Now you can easily [by editing four lines of tommath.h]
          change the name of the heap functions used in LTM [also compatible with LTC via MPI mode]
       -- Added bn_prime_rabin_miller_trials() which gives the number of Rabin-Miller trials to achieve
          a failure rate of less than 2^-96
       -- fixed bug in fast_mp_invmod().  The initial testing logic was wrong.  An invalid input is not when
-          "a" and "b" are even it's when "b" is even [the algo is for odd moduli only].  
+          "a" and "b" are even it's when "b" is even [the algo is for odd moduli only].
       -- Started a new manual [finally].  It is incomplete and will be finished as time goes on.  I had to stop
-          adding full demos around half way in chapter three so I could at least get a good portion of the 
+          adding full demos around half way in chapter three so I could at least get a good portion of the
          manual done.   If you really need help using the library you can always email me!
-       -- My Textbook is now included as part of the package [all Public Domain]       
+       -- My Textbook is now included as part of the package [all Public Domain]

 Sept 19th, 2003
 v0.27  -- Removed changes.txt~ which was made by accident since "kate" decided it was
@ -41,7 +60,7 @@ v0.27  -- Removed changes.txt~ which was made by accident since "kate" decided i

 Aug 29th, 2003
 v0.26  -- Fixed typo that caused warning with GCC 3.2
-       -- Martin Marcel noticed a bug in mp_neg() that allowed negative zeroes.  
+       -- Martin Marcel noticed a bug in mp_neg() that allowed negative zeroes.
          Also, Martin is the fellow who noted the bugs in mp_gcd() of 0.24/0.25.
       -- Martin Marcel noticed an optimization [and slight bug] in mp_lcm().
       -- Added fix to mp_read_unsigned_bin to prevent a buffer overflow.
@ -102,18 +121,18 @@ v0.22  -- Fixed up mp_invmod so the result is properly in range now [was always

 June 19th, 2003
 v0.21  -- Fixed bug in mp_mul_d which would not handle sign correctly [would not always forward it]
-       -- Removed the #line lines from gen.pl [was in violation of ISO C]       
+       -- Removed the #line lines from gen.pl [was in violation of ISO C]

 June 8th, 2003
-v0.20  -- Removed the book from the package.  Added the TDCAL license document.  
+v0.20  -- Removed the book from the package.  Added the TDCAL license document.
       -- This release is officially pure-bred TDCAL again [last officially TDCAL based release was v0.16]

 June 6th, 2003
 v0.19  -- Fixed a bug in mp_montgomery_reduce() which was introduced when I tweaked mp_rshd() in the previous release.
          Essentially the digits were not trimmed before the compare which cause a subtraction to occur all the time.
-       -- Fixed up etc/tune.c a bit to stop testing new cutoffs after 16 failures [to find more optimal points]. 
+       -- Fixed up etc/tune.c a bit to stop testing new cutoffs after 16 failures [to find more optimal points].
          Brute force ho!
-          
+

 May 29th, 2003
 v0.18  -- Fixed a bug in s_mp_sqr which would handle carries properly just not very elegantly.
@ -121,7 +140,7 @@ v0.18  -- Fixed a bug in s_mp_sqr which would handle carries properly just not v
       -- Fixed bug in mp_sqr which still had a 512 constant instead of MP_WARRAY
       -- Added Toom-Cook multipliers [needs tuning!]
       -- Added efficient divide by 3 algorithm mp_div_3
-       -- Re-wrote mp_div_d to be faster than calling mp_div 
+       -- Re-wrote mp_div_d to be faster than calling mp_div
       -- Added in a donated BCC makefile and a single page LTM poster (ahalhabsi@sbcglobal.net)
       -- Added mp_reduce_2k which reduces an input modulo n = 2**p - k for any single digit k
       -- Made the exptmod system be aware of the 2k reduction algorithms.
@ -134,13 +153,13 @@ v0.17  -- Benjamin Goldberg submitted optimized mp_add and mp_sub routines.  A n
       -- Fixed bug in mp_exptmod that would cause it to fail for odd moduli when DIGIT_BIT != 28
       -- mp_exptmod now also returns errors if the modulus is negative and will handle negative exponents
       -- mp_prime_is_prime will now return true if the input is one of the primes in the prime table
-       -- Damian M Gryski (dgryski@uwaterloo.ca) found a index out of bounds error in the 
-          mp_fast_s_mp_mul_high_digs function which didn't come up before.  (fixed) 
+       -- Damian M Gryski (dgryski@uwaterloo.ca) found a index out of bounds error in the
+          mp_fast_s_mp_mul_high_digs function which didn't come up before.  (fixed)
       -- Refactored the DR reduction code so there is only one function per file.
       -- Fixed bug in the mp_mul() which would erroneously avoid the faster multiplier [comba] when it was
          allowed.  The bug would not cause the incorrect value to be produced just less efficient (fixed)
       -- Fixed similar bug in the Montgomery reduction code.
-       -- Added tons of (mp_digit) casts so the 7/15/28/31 bit digit code will work flawlessly out of the box. 
+       -- Added tons of (mp_digit) casts so the 7/15/28/31 bit digit code will work flawlessly out of the box.
          Also added limited support for 64-bit machines with a 60-bit digit.  Both thanks to Tom Wu (tom@arcot.com)
       -- Added new comments here and there, cleaned up some code [style stuff]
       -- Fixed a lingering typo in mp_exptmod* that would set bitcnt to zero then one.  Very silly stuff :-)
@ -148,19 +167,19 @@ v0.17  -- Benjamin Goldberg submitted optimized mp_add and mp_sub routines.  A n
          saves quite a few calls and if statements.
       -- Added etc/mont.c a test of the Montgomery reduction [assuming all else works :-| ]
       -- Fixed up etc/tune.c to use a wider test range [more appropriate] also added a x86 based addition which
-          uses RDTSC for high precision timing.  
-       -- Updated demo/demo.c to remove MPI stuff [won't work anyways], made the tests run for 2 seconds each so its 
+          uses RDTSC for high precision timing.
+       -- Updated demo/demo.c to remove MPI stuff [won't work anyways], made the tests run for 2 seconds each so its
          not so insanely slow.  Also made the output space delimited [and fixed up various errors]
-       -- Added logs directory, logs/graph.dem which will use gnuplot to make a series of PNG files 
-          that go with the pre-made index.html.  You have to build [via make timing] and run ltmtest first in the 
+       -- Added logs directory, logs/graph.dem which will use gnuplot to make a series of PNG files
+          that go with the pre-made index.html.  You have to build [via make timing] and run ltmtest first in the
          root of the package.
-       -- Fixed a bug in mp_sub and mp_add where "-a - -a" or "-a + a" would produce -0 as the result [obviously invalid].  
+       -- Fixed a bug in mp_sub and mp_add where "-a - -a" or "-a + a" would produce -0 as the result [obviously invalid].
       -- Fixed a bug in mp_rshd.  If the count == a.used it should zero/return [instead of shifting]
       -- Fixed a "off-by-one" bug in mp_mul2d.  The initial size check on alloc would be off by one if the residue
-          shifting caused a carry.  
+          shifting caused a carry.
       -- Fixed a bug where s_mp_mul_digs() would not call the Comba based routine if allowed.  This made Barrett reduction
          slower than it had to be.
-          
+
 Mar 29th, 2003
 v0.16  -- Sped up mp_div by making normalization one shift call
       -- Sped up mp_mul_2d/mp_div_2d by aliasing pointers :-)
@ -190,9 +209,9 @@ v0.14  -- Tons of manual updates
          also fixed up timing demo to use a finer granularity for various functions
       -- fixed up demo/demo.c testing to pause during testing so my Duron won't catch on fire
          [stays around 31-35C during testing :-)]
-       
+
 Feb 13th, 2003
-v0.13  -- tons of minor speed-ups in low level add, sub, mul_2 and div_2 which propagate 
+v0.13  -- tons of minor speed-ups in low level add, sub, mul_2 and div_2 which propagate
          to other functions like mp_invmod, mp_div, etc...
       -- Sped up mp_exptmod_fast by using new code to find R mod m [e.g. B^n mod m]
       -- minor fixes
@ -212,12 +231,12 @@ v0.11  -- More subtle fixes
       -- Moved to gentoo linux [hurrah!] so made *nix specific fixes to the make process
       -- Sped up the montgomery reduction code quite a bit
       -- fixed up demo so when building timing for the x86 it assumes ELF format now
-       
+
 Jan 9th, 2003
-v0.10  -- Pekka Riikonen suggested fixes to the radix conversion code.  
+v0.10  -- Pekka Riikonen suggested fixes to the radix conversion code.
       -- Added baseline montgomery and comba montgomery reductions, sped up exptmods
          [to a point, see bn.h for MONTGOMERY_EXPT_CUTOFF]
-       
+
 Jan 6th, 2003
 v0.09  -- Updated the manual to reflect recent changes.  :-)
       -- Added Jacobi function (mp_jacobi) to supplement the number theory side of the lib
@ -230,16 +249,16 @@ v0.08  -- Sped up the multipliers by moving the inner loop variables into a smal
       -- Made "mtest" be able to use /dev/random, /dev/urandom or stdin for RNG data
       -- Corrected some bugs where error messages were potentially ignored
       -- add etc/pprime.c program which makes numbers which are provably prime.
-       
+
 Jan 1st, 2003
 v0.07  -- Removed alot of heap operations from core functions to speed them up
       -- Added a root finding function [and mp_sqrt macro like from MPI]
-       -- Added more to manual 
+       -- Added more to manual

 Dec 31st, 2002
 v0.06  -- Sped up the s_mp_add, s_mp_sub which inturn sped up mp_invmod, mp_exptmod, etc...
       -- Cleaned up the header a bit more
-       
+
 Dec 30th, 2002
 v0.05  -- Builds with MSVC out of the box
       -- Fixed a bug in mp_invmod w.r.t. even moduli
@ -247,19 +266,19 @@ v0.05  -- Builds with MSVC out of the box
       -- Fixed up exptmod to use fewer multiplications
       -- Fixed up mp_init_size to use only one heap operation
          -- Note there is a slight "off-by-one" bug in the library somewhere
-             without the padding (see the source for comment) the library 
+             without the padding (see the source for comment) the library
             crashes in libtomcrypt.  Anyways a reasonable workaround is to pad the
             numbers which will always correct it since as the numbers grow the padding
             will still be beyond the end of the number
       -- Added more to the manual
-       
+
 Dec 29th, 2002
 v0.04  -- Fixed a memory leak in mp_to_unsigned_bin
       -- optimized invmod code
       -- Fixed bug in mp_div
       -- use exchange instead of copy for results
       -- added a bit more to the manual
-       
+
 Dec 27th, 2002
 v0.03  -- Sped up s_mp_mul_high_digs by not computing the carries of the lower digits
       -- Fixed a bug where mp_set_int wouldn't zero the value first and set the used member.
@ -270,7 +289,7 @@ v0.03  -- Sped up s_mp_mul_high_digs by not computing the carries of the lower d
       -- Many many many many fixes
       -- Works in LibTomCrypt now :-)
       -- Added iterations to the timing demos... more accurate.
-       -- Tom needs a job.       
+       -- Tom needs a job.

 Dec 26th, 2002
 v0.02  -- Fixed a few "slips" in the manual.  This is "LibTomMath" afterall :-)
@ -279,7 +298,7 @@ v0.02  -- Fixed a few "slips" in the manual.  This is "LibTomMath" afterall :-)

 Dec 25th,2002
 v0.01  -- Initial release.  Gimme a break.
-       -- Todo list, 
+       -- Todo list,
           add details to manual [e.g. algorithms]
           more comments in code
           example programs
--- a/demo/demo.c
+++ b/demo/demo.c
@ -1,5 +1,7 @@
 #include <time.h>

+#define TESTING
+
 #ifdef IOWNANATHLON
 #include <unistd.h>
 #define SLEEP sleep(4)
@ -11,8 +13,45 @@

 #ifdef TIMER
 ulong64 _tt;
-void reset(void) { _tt = clock(); }
-ulong64 rdtsc(void) { return clock() - _tt; }
+
+#if defined(__i386__) || defined(_M_IX86) || defined(_M_AMD64)
+/* RDTSC from Scott Duplichan */
+static ulong64 TIMFUNC (void)
+   {
+   #if defined __GNUC__
+      #ifdef __i386__
+         ulong64 a;
+         __asm__ __volatile__ ("rdtsc ":"=A" (a));
+         return a;
+      #else /* gcc-IA64 version */
+         unsigned long result;
+         __asm__ __volatile__("mov %0=ar.itc" : "=r"(result) :: "memory");
+         while (__builtin_expect ((int) result == -1, 0))
+         __asm__ __volatile__("mov %0=ar.itc" : "=r"(result) :: "memory");
+         return result;
+      #endif
+
+   // Microsoft and Intel Windows compilers
+   #elif defined _M_IX86
+     __asm rdtsc
+   #elif defined _M_AMD64
+     return __rdtsc ();
+   #elif defined _M_IA64
+     #if defined __INTEL_COMPILER
+       #include <ia64intrin.h>
+     #endif
+      return __getReg (3116);
+   #else
+     #error need rdtsc function for this build
+   #endif
+   }
+#else
+#define TIMFUNC clock
+#endif
+
+ulong64 rdtsc(void) { return TIMFUNC() - _tt; }
+void reset(void) { _tt = TIMFUNC(); }
+
 #endif

 void ndraw(mp_int *a, char *name)
@ -42,6 +81,13 @@ int lbit(void)
   }
 }

+int myrng(unsigned char *dst, int len, void *dat)
+{
+   int x;
+   for (x = 0; x < len; x++) dst[x] = rand() & 0xFF;
+   return len;
+}
+

 #define DO2(x) x; x;
 #define DO4(x) DO2(x); DO2(x);
@ -53,13 +99,12 @@ int main(void)
 {
   mp_int a, b, c, d, e, f;
   unsigned long expt_n, add_n, sub_n, mul_n, div_n, sqr_n, mul2d_n, div2d_n, gcd_n, lcm_n, inv_n,
-                 div2_n, mul2_n, add_d_n, sub_d_n;
+                 div2_n, mul2_n, add_d_n, sub_d_n, t;
   unsigned rr;
-   int cnt, ix, old_kara_m, old_kara_s;
+   int i, n, err, cnt, ix, old_kara_m, old_kara_s;

 #ifdef TIMER
-   int n;
-   ulong64 tt;
+   ulong64 tt, CLK_PER_SEC;
   FILE *log, *logb, *logc;
 #endif

@ -72,6 +117,127 @@ int main(void)

   srand(time(NULL));

+#ifdef TESTING
+  // test mp_get_int
+  printf("Testing: mp_get_int\n");
+  for(i=0;i<1000;++i) {
+    t = (unsigned long)rand()*rand()+1;
+    mp_set_int(&a,t);
+    if (t!=mp_get_int(&a)) { 
+      printf("mp_get_int() bad result!\n");
+      return 1;
+    }
+  }
+  mp_set_int(&a,0);
+  if (mp_get_int(&a)!=0)
+  { printf("mp_get_int() bad result!\n");
+    return 1;
+  }
+  mp_set_int(&a,0xffffffff);
+  if (mp_get_int(&a)!=0xffffffff)
+  { printf("mp_get_int() bad result!\n");
+    return 1;
+  }
+
+  // test mp_sqrt
+  printf("Testing: mp_sqrt\n");
+  for (i=0;i<10000;++i) { 
+    printf("%6d\r", i); fflush(stdout);
+    n = (rand()&15)+1;
+    mp_rand(&a,n);
+    if (mp_sqrt(&a,&b) != MP_OKAY)
+    { printf("mp_sqrt() error!\n");
+      return 1;
+    }
+    mp_n_root(&a,2,&a);
+    if (mp_cmp_mag(&b,&a) != MP_EQ)
+    { printf("mp_sqrt() bad result!\n");
+      return 1;
+    }
+  }
+
+  printf("\nTesting: mp_is_square\n");
+  for (i=0;i<100000;++i) {
+    printf("%6d\r", i); fflush(stdout);
+
+    /* test mp_is_square false negatives */
+    n = (rand()&7)+1;
+    mp_rand(&a,n);
+    mp_sqr(&a,&a);
+    if (mp_is_square(&a,&n)!=MP_OKAY) { 
+      printf("fn:mp_is_square() error!\n");
+      return 1;
+    }
+    if (n==0) { 
+      printf("fn:mp_is_square() bad result!\n");
+      return 1;
+    }
+
+    /* test for false positives */
+    mp_add_d(&a, 1, &a);
+    if (mp_is_square(&a,&n)!=MP_OKAY) { 
+      printf("fp:mp_is_square() error!\n");
+      return 1;
+    }
+    if (n==1) { 
+      printf("fp:mp_is_square() bad result!\n");
+      return 1;
+    }
+
+  }
+  printf("\n\n");
+#endif
+
+#ifdef TESTING 
+   /* test for size */
+   for (ix = 16; ix < 512; ix++) {
+       printf("Testing (not safe-prime): %9d bits    \r", ix); fflush(stdout);
+       err = mp_prime_random_ex(&a, 8, ix, (rand()&1)?LTM_PRIME_2MSB_OFF:LTM_PRIME_2MSB_ON, myrng, NULL);
+       if (err != MP_OKAY) {
+          printf("failed with err code %d\n", err);
+          return EXIT_FAILURE;
+       }
+       if (mp_count_bits(&a) != ix) {
+          printf("Prime is %d not %d bits!!!\n", mp_count_bits(&a), ix);
+          return EXIT_FAILURE;
+       }
+   }
+
+   for (ix = 16; ix < 512; ix++) {
+       printf("Testing (   safe-prime): %9d bits    \r", ix); fflush(stdout);
+       err = mp_prime_random_ex(&a, 8, ix, ((rand()&1)?LTM_PRIME_2MSB_OFF:LTM_PRIME_2MSB_ON)|LTM_PRIME_SAFE, myrng, NULL);
+       if (err != MP_OKAY) {
+          printf("failed with err code %d\n", err);
+          return EXIT_FAILURE;
+       }
+       if (mp_count_bits(&a) != ix) {
+          printf("Prime is %d not %d bits!!!\n", mp_count_bits(&a), ix);
+          return EXIT_FAILURE;
+       }
+       /* let's see if it's really a safe prime */
+       mp_sub_d(&a, 1, &a);
+       mp_div_2(&a, &a);
+       mp_prime_is_prime(&a, 8, &cnt);
+       if (cnt != MP_YES) {
+          printf("sub is not prime!\n");
+          return EXIT_FAILURE;
+       }
+   }
+
+   printf("\n\n");
+#endif
+
+#ifdef TESTING
+   mp_read_radix(&a, "123456", 10);
+   mp_toradix_n(&a, buf, 10, 3);
+   printf("a == %s\n", buf);
+   mp_toradix_n(&a, buf, 10, 4);
+   printf("a == %s\n", buf);
+   mp_toradix_n(&a, buf, 10, 30);
+   printf("a == %s\n", buf);
+#endif
+
+
 #if 0
   for (;;) {
      fgets(buf, sizeof(buf), stdin);
@ -97,12 +263,13 @@ int main(void)
 }
 #endif

-#if 0
+#ifdef TESTING
   /* test mp_cnt_lsb */
+   printf("testing mp_cnt_lsb...\n");
   mp_set(&a, 1);
-   for (ix = 0; ix < 128; ix++) {
+   for (ix = 0; ix < 1024; ix++) {
       if (mp_cnt_lsb(&a) != ix) {
-          printf("Failed at %d\n", ix);
+          printf("Failed at %d, %d\n", ix, mp_cnt_lsb(&a));
          return 0;
       }
       mp_mul_2(&a, &a);
@ -110,7 +277,8 @@ int main(void)
 #endif

 /* test mp_reduce_2k */
-#if 0
+#ifdef TESTING
+   printf("Testing mp_reduce_2k...\n");
   for (cnt = 3; cnt <= 384; ++cnt) {
       mp_digit tmp;
       mp_2expt(&a, cnt);
@ -137,7 +305,8 @@ int main(void)


 /* test mp_div_3  */
-#if 0
+#ifdef TESTING
+   printf("Testing mp_div_3...\n");
   mp_set(&d, 3);
   for (cnt = 0; cnt < 1000000; ) {
      mp_digit r1, r2;
@ -155,7 +324,8 @@ int main(void)
 #endif

 /* test the DR reduction */
-#if 0
+#ifdef TESTING
+   printf("testing mp_dr_reduce...\n");
   for (cnt = 2; cnt < 128; cnt++) {
       printf("%d digit modulus\n", cnt);
       mp_grow(&a, cnt);
@ -190,7 +360,13 @@ int main(void)
 #ifdef TIMER
      /* temp. turn off TOOM */
      TOOM_MUL_CUTOFF = TOOM_SQR_CUTOFF = 100000;
-      printf("CLOCKS_PER_SEC == %lu\n", CLOCKS_PER_SEC);
+
+      reset();
+      sleep(1);
+      CLK_PER_SEC = rdtsc();
+
+      printf("CLK_PER_SEC == %lu\n", CLK_PER_SEC);
+      

      log = fopen("logs/add.log", "w");
      for (cnt = 8; cnt <= 128; cnt += 8) {
@ -202,10 +378,10 @@ int main(void)
         do {
            DO(mp_add(&a,&b,&c));
            rr += 16;
-         } while (rdtsc() < (CLOCKS_PER_SEC * 2));
+         } while (rdtsc() < (CLK_PER_SEC * 2));
         tt = rdtsc();
-         printf("Adding\t\t%4d-bit => %9llu/sec, %9llu ticks\n", mp_count_bits(&a), (((ulong64)rr)*CLOCKS_PER_SEC)/tt, tt);
-         fprintf(log, "%d %9llu\n", cnt*DIGIT_BIT, (((ulong64)rr)*CLOCKS_PER_SEC)/tt);
+         printf("Adding\t\t%4d-bit => %9llu/sec, %9llu ticks\n", mp_count_bits(&a), (((ulong64)rr)*CLK_PER_SEC)/tt, tt);
+         fprintf(log, "%d %9llu\n", cnt*DIGIT_BIT, (((ulong64)rr)*CLK_PER_SEC)/tt); fflush(log);
      }
      fclose(log);

@ -219,10 +395,10 @@ int main(void)
         do {
            DO(mp_sub(&a,&b,&c));
            rr += 16;
-         } while (rdtsc() < (CLOCKS_PER_SEC * 2));
+         } while (rdtsc() < (CLK_PER_SEC * 2));
         tt = rdtsc();
-         printf("Subtracting\t\t%4d-bit => %9llu/sec, %9llu ticks\n", mp_count_bits(&a), (((ulong64)rr)*CLOCKS_PER_SEC)/tt, tt);
-         fprintf(log, "%d %9llu\n", cnt*DIGIT_BIT, (((ulong64)rr)*CLOCKS_PER_SEC)/tt);
+         printf("Subtracting\t\t%4d-bit => %9llu/sec, %9llu ticks\n", mp_count_bits(&a), (((ulong64)rr)*CLK_PER_SEC)/tt, tt);
+         fprintf(log, "%d %9llu\n", cnt*DIGIT_BIT, (((ulong64)rr)*CLK_PER_SEC)/tt);  fflush(log);
      }
      fclose(log);

@ -237,7 +413,7 @@ mult_test:
      KARATSUBA_SQR_CUTOFF = (ix==0)?9999:old_kara_s;

      log = fopen((ix==0)?"logs/mult.log":"logs/mult_kara.log", "w");
-      for (cnt = 32; cnt <= 288; cnt += 16) {
+      for (cnt = 32; cnt <= 288; cnt += 8) {
         SLEEP;
         mp_rand(&a, cnt);
         mp_rand(&b, cnt);
@ -246,15 +422,15 @@ mult_test:
         do {
            DO(mp_mul(&a, &b, &c));
            rr += 16;
-         } while (rdtsc() < (CLOCKS_PER_SEC * 2));
+         } while (rdtsc() < (CLK_PER_SEC * 2));
         tt = rdtsc();
-         printf("Multiplying\t%4d-bit => %9llu/sec, %9llu ticks\n", mp_count_bits(&a), (((ulong64)rr)*CLOCKS_PER_SEC)/tt, tt);
-         fprintf(log, "%d %9llu\n", mp_count_bits(&a), (((ulong64)rr)*CLOCKS_PER_SEC)/tt);
+         printf("Multiplying\t%4d-bit => %9llu/sec, %9llu ticks\n", mp_count_bits(&a), (((ulong64)rr)*CLK_PER_SEC)/tt, tt);
+         fprintf(log, "%d %9llu\n", mp_count_bits(&a), (((ulong64)rr)*CLK_PER_SEC)/tt);  fflush(log);
      }
      fclose(log);

      log = fopen((ix==0)?"logs/sqr.log":"logs/sqr_kara.log", "w");
-      for (cnt = 32; cnt <= 288; cnt += 16) {
+      for (cnt = 32; cnt <= 288; cnt += 8) {
         SLEEP;
         mp_rand(&a, cnt);
         reset();
@ -262,14 +438,15 @@ mult_test:
         do {
            DO(mp_sqr(&a, &b));
            rr += 16;
-         } while (rdtsc() < (CLOCKS_PER_SEC * 2));
+         } while (rdtsc() < (CLK_PER_SEC * 2));
         tt = rdtsc();
-         printf("Squaring\t%4d-bit => %9llu/sec, %9llu ticks\n", mp_count_bits(&a), (((ulong64)rr)*CLOCKS_PER_SEC)/tt, tt);
-         fprintf(log, "%d %9llu\n", mp_count_bits(&a), (((ulong64)rr)*CLOCKS_PER_SEC)/tt);
+         printf("Squaring\t%4d-bit => %9llu/sec, %9llu ticks\n", mp_count_bits(&a), (((ulong64)rr)*CLK_PER_SEC)/tt, tt);
+         fprintf(log, "%d %9llu\n", mp_count_bits(&a), (((ulong64)rr)*CLK_PER_SEC)/tt);  fflush(log);
      }
      fclose(log);

   }
+expt_test:
  {
      char *primes[] = {
         /* 2K moduli mersenne primes */
@ -299,14 +476,12 @@ mult_test:
         "1214855636816562637502584060163403830270705000634713483015101384881871978446801224798536155406895823305035467591632531067547890948695117172076954220727075688048751022421198712032848890056357845974246560748347918630050853933697792254955890439720297560693579400297062396904306270145886830719309296352765295712183040773146419022875165382778007040109957609739589875590885701126197906063620133954893216612678838507540777138437797705602453719559017633986486649523611975865005712371194067612263330335590526176087004421363598470302731349138773205901447704682181517904064735636518462452242791676541725292378925568296858010151852326316777511935037531017413910506921922450666933202278489024521263798482237150056835746454842662048692127173834433089016107854491097456725016327709663199738238442164843147132789153725513257167915555162094970853584447993125488607696008169807374736711297007473812256272245489405898470297178738029484459690836250560495461579533254473316340608217876781986188705928270735695752830825527963838355419762516246028680280988020401914551825487349990306976304093109384451438813251211051597392127491464898797406789175453067960072008590614886532333015881171367104445044718144312416815712216611576221546455968770801413440778423979",
         NULL
      };
-expt_test:
   log = fopen("logs/expt.log", "w");
   logb = fopen("logs/expt_dr.log", "w");
   logc = fopen("logs/expt_2k.log", "w");
   for (n = 0; primes[n]; n++) {
      SLEEP;
      mp_read_radix(&a, primes[n], 10);
-         printf("Different (%d)!!!\n", mp_count_bits(&a));
      mp_zero(&b);
      for (rr = 0; rr < mp_count_bits(&a); rr++) {
         mp_mul_2(&b, &b);
@ -321,7 +496,7 @@ expt_test:
      do {
         DO(mp_exptmod(&c, &b, &a, &d));
         rr += 16;
-      } while (rdtsc() < (CLOCKS_PER_SEC * 2));
+      } while (rdtsc() < (CLK_PER_SEC * 2));
      tt = rdtsc();
      mp_sub_d(&a, 1, &e);
      mp_sub(&e, &b, &b);
@ -332,8 +507,8 @@ expt_test:
         draw(&d);
         exit(0);
      }
-      printf("Exponentiating\t%4d-bit => %9llu/sec, %9llu ticks\n", mp_count_bits(&a), (((ulong64)rr)*CLOCKS_PER_SEC)/tt, tt);
-      fprintf((n < 6) ? logc : (n < 13) ? logb : log, "%d %9llu\n", mp_count_bits(&a), (((ulong64)rr)*CLOCKS_PER_SEC)/tt);
+      printf("Exponentiating\t%4d-bit => %9llu/sec, %9llu ticks\n", mp_count_bits(&a), (((ulong64)rr)*CLK_PER_SEC)/tt, tt);
+      fprintf((n < 6) ? logc : (n < 13) ? logb : log, "%d %9llu\n", mp_count_bits(&a), (((ulong64)rr)*CLK_PER_SEC)/tt);
   }
   }
   fclose(log);
@ -356,15 +531,15 @@ expt_test:
      do {
         DO(mp_invmod(&b, &a, &c));
         rr += 16;
-      } while (rdtsc() < (CLOCKS_PER_SEC * 2));
+      } while (rdtsc() < (CLK_PER_SEC * 2));
      tt = rdtsc();
      mp_mulmod(&b, &c, &a, &d);
      if (mp_cmp_d(&d, 1) != MP_EQ) {
         printf("Failed to invert\n");
         return 0;
      }
-      printf("Inverting mod\t%4d-bit => %9llu/sec, %9llu ticks\n", mp_count_bits(&a), (((ulong64)rr)*CLOCKS_PER_SEC)/tt, tt);
-      fprintf(log, "%d %9llu\n", cnt*DIGIT_BIT, (((ulong64)rr)*CLOCKS_PER_SEC)/tt);
+      printf("Inverting mod\t%4d-bit => %9llu/sec, %9llu ticks\n", mp_count_bits(&a), (((ulong64)rr)*CLK_PER_SEC)/tt, tt);
+      fprintf(log, "%d %9llu\n", cnt*DIGIT_BIT, (((ulong64)rr)*CLK_PER_SEC)/tt);
   }
   fclose(log);

--- a/etc/mont.c
+++ b/etc/mont.c
@ -1,46 +1,46 @@
-/* tests the montgomery routines */
-#include <tommath.h>
-
-int main(void)
-{
-   mp_int modulus, R, p, pp;
-   mp_digit mp;
-   long x, y;
-
-   srand(time(NULL));
-   mp_init_multi(&modulus, &R, &p, &pp, NULL);
-
-   /* loop through various sizes */
-   for (x = 4; x < 256; x++) {
-       printf("DIGITS == %3ld...", x); fflush(stdout);
-       
-       /* make up the odd modulus */
-       mp_rand(&modulus, x);
-       modulus.dp[0] |= 1;
-       
-       /* now find the R value */
-       mp_montgomery_calc_normalization(&R, &modulus);
-       mp_montgomery_setup(&modulus, &mp);
-       
-       /* now run through a bunch tests */
-       for (y = 0; y < 1000; y++) {
-           mp_rand(&p, x/2);        /* p = random */
-           mp_mul(&p, &R, &pp);     /* pp = R * p */
-           mp_montgomery_reduce(&pp, &modulus, mp);
-           
-           /* should be equal to p */
-           if (mp_cmp(&pp, &p) != MP_EQ) {
-              printf("FAILURE!\n");
-              exit(-1);
-           }
-       }
-       printf("PASSED\n");
-    }
-    
-    return 0;
-}
-
-
-
-
-
+/* tests the montgomery routines */
+#include <tommath.h>
+
+int main(void)
+{
+   mp_int modulus, R, p, pp;
+   mp_digit mp;
+   long x, y;
+
+   srand(time(NULL));
+   mp_init_multi(&modulus, &R, &p, &pp, NULL);
+
+   /* loop through various sizes */
+   for (x = 4; x < 256; x++) {
+       printf("DIGITS == %3ld...", x); fflush(stdout);
+       
+       /* make up the odd modulus */
+       mp_rand(&modulus, x);
+       modulus.dp[0] |= 1;
+       
+       /* now find the R value */
+       mp_montgomery_calc_normalization(&R, &modulus);
+       mp_montgomery_setup(&modulus, &mp);
+       
+       /* now run through a bunch tests */
+       for (y = 0; y < 1000; y++) {
+           mp_rand(&p, x/2);        /* p = random */
+           mp_mul(&p, &R, &pp);     /* pp = R * p */
+           mp_montgomery_reduce(&pp, &modulus, mp);
+           
+           /* should be equal to p */
+           if (mp_cmp(&pp, &p) != MP_EQ) {
+              printf("FAILURE!\n");
+              exit(-1);
+           }
+       }
+       printf("PASSED\n");
+    }
+    
+    return 0;
+}
+
+
+
+
+
--- a/etclib/poly.c
+++ b/etclib/poly.c
@ -1,302 +0,0 @@
-/* LibTomMath, multiple-precision integer library -- Tom St Denis
- *
- * LibTomMath is library that provides for multiple-precision 
- * integer arithmetic as well as number theoretic functionality.
- *
- * This file "poly.c" provides GF(p^k) functionality on top of the 
- * libtommath library.
- * 
- * The library is designed directly after the MPI library by
- * Michael Fromberger but has been written from scratch with 
- * additional optimizations in place.  
- *
- * The library is free for all purposes without any express
- * guarantee it works.
- *
- * Tom St Denis, tomstdenis@iahu.ca, http://libtommath.iahu.ca
- */
-#include "poly.h"
-
-#undef MIN
-#define MIN(x,y) ((x)<(y)?(x):(y))
-#undef MAX
-#define MAX(x,y) ((x)>(y)?(x):(y))
-
-static void s_free(mp_poly *a)
-{
-   int k;
-   for (k = 0; k < a->alloc; k++) {
-       mp_clear(&(a->co[k]));
-   }
-}
-
-static int s_setup(mp_poly *a, int low, int high)
-{
-   int res, k, j;
-   for (k = low; k < high; k++) {
-       if ((res = mp_init(&(a->co[k]))) != MP_OKAY) {
-          for (j = low; j < k; j++) {
-             mp_clear(&(a->co[j]));
-          }
-          return MP_MEM;
-       }
-   }
-   return MP_OKAY;
-}   
-
-int mp_poly_init(mp_poly *a, mp_int *cha)
-{
-   return mp_poly_init_size(a, cha, MP_POLY_PREC);
-}
-
-/* init a poly of a given (size) degree */
-int mp_poly_init_size(mp_poly *a, mp_int *cha, int size)
-{
-   int res;
-   
-   /* allocate array of mp_ints for coefficients */
-   a->co = malloc(size * sizeof(mp_int));
-   if (a->co == NULL) {
-      return MP_MEM;
-   }
-   a->used  = 0;
-   a->alloc = size;
-   
-   /* now init the range */
-   if ((res = s_setup(a, 0, size)) != MP_OKAY) {
-      free(a->co);
-      return res;
-   }
-   
-   /* copy characteristic */
-   if ((res = mp_init_copy(&(a->cha), cha)) != MP_OKAY) {
-      s_free(a);
-      free(a->co);
-      return res;
-   }
-   
-   /* return ok at this point */
-   return MP_OKAY;
-}
-
-/* grow the size of a poly */
-static int mp_poly_grow(mp_poly *a, int size)
-{
-  int res;
-  
-  if (size > a->alloc) {
-     /* resize the array of coefficients */
-     a->co = realloc(a->co, sizeof(mp_int) * size);
-     if (a->co == NULL) {
-        return MP_MEM;
-     }
-     
-     /* now setup the coefficients */
-     if ((res = s_setup(a, a->alloc, a->alloc + size)) != MP_OKAY) {
-        return res;
-     }
-     
-     a->alloc += size;
-  }
-  return MP_OKAY;
-}
-
-/* copy, b = a */
-int mp_poly_copy(mp_poly *a, mp_poly *b)
-{
-   int res, k;
-   
-   /* resize b */
-   if ((res = mp_poly_grow(b, a->used)) != MP_OKAY) {
-      return res;
-   }
-   
-   /* now copy the used part */
-   b->used = a->used;
-   
-   /* now the cha */
-   if ((res = mp_copy(&(a->cha), &(b->cha))) != MP_OKAY) {
-      return res;
-   }
-   
-   /* now all the coefficients */
-   for (k = 0; k < b->used; k++) {
-       if ((res = mp_copy(&(a->co[k]), &(b->co[k]))) != MP_OKAY) {
-          return res;
-       }
-   }
-   
-   /* now zero the top */
-   for (k = b->used; k < b->alloc; k++) {
-       mp_zero(&(b->co[k]));
-   }
-   
-   return MP_OKAY;
-}
-
-/* init from a copy, a = b */
-int mp_poly_init_copy(mp_poly *a, mp_poly *b)
-{
-   int res;
-   
-   if ((res = mp_poly_init(a, &(b->cha))) != MP_OKAY) {
-      return res;
-   }
-   return mp_poly_copy(b, a);
-}
-
-/* free a poly from ram */
-void mp_poly_clear(mp_poly *a)
-{
-   s_free(a);
-   mp_clear(&(a->cha));
-   free(a->co);
-   
-   a->co = NULL;
-   a->used = a->alloc = 0;
-}
-
-/* exchange two polys */
-void mp_poly_exch(mp_poly *a, mp_poly *b)
-{
-   mp_poly t;
-   t = *a; *a = *b; *b = t;
-}
-
-/* clamp the # of used digits */
-static void mp_poly_clamp(mp_poly *a)
-{
-   while (a->used > 0 && mp_cmp_d(&(a->co[a->used]), 0) == MP_EQ) --(a->used);
-}  
-
-/* add two polynomials, c(x) = a(x) + b(x) */
-int mp_poly_add(mp_poly *a, mp_poly *b, mp_poly *c)
-{
-   mp_poly t, *x, *y;
-   int res, k;
-   
-   /* ensure char's are the same */
-   if (mp_cmp(&(a->cha), &(b->cha)) != MP_EQ) {
-      return MP_VAL;
-   }
-   
-   /* now figure out the sizes such that x is the 
-      largest degree poly and y is less or equal in degree 
-    */
-   if (a->used > b->used) {
-      x = a;
-      y = b;
-   } else {
-      x = b;
-      y = a;
-   }
-   
-   /* now init the result to be a copy of the largest */
-   if ((res = mp_poly_init_copy(&t, x)) != MP_OKAY) {
-      return res;
-   }
-   
-   /* now add the coeffcients of the smaller one */
-   for (k = 0; k < y->used; k++) {
-       if ((res = mp_addmod(&(a->co[k]), &(b->co[k]), &(a->cha), &(t.co[k]))) != MP_OKAY) {
-          goto __T;
-       }
-   }
-   
-   mp_poly_clamp(&t);
-   mp_poly_exch(&t, c);
-   res = MP_OKAY;
-       
-__T:  mp_poly_clear(&t);
-   return res;
-}
-
-/* subtracts two polynomials, c(x) = a(x) - b(x) */
-int mp_poly_sub(mp_poly *a, mp_poly *b, mp_poly *c)
-{
-   mp_poly t, *x, *y;
-   int res, k;
-   
-   /* ensure char's are the same */
-   if (mp_cmp(&(a->cha), &(b->cha)) != MP_EQ) {
-      return MP_VAL;
-   }
-   
-   /* now figure out the sizes such that x is the 
-      largest degree poly and y is less or equal in degree 
-    */
-   if (a->used > b->used) {
-      x = a;
-      y = b;
-   } else {
-      x = b;
-      y = a;
-   }
-   
-   /* now init the result to be a copy of the largest */
-   if ((res = mp_poly_init_copy(&t, x)) != MP_OKAY) {
-      return res;
-   }
-   
-   /* now add the coeffcients of the smaller one */
-   for (k = 0; k < y->used; k++) {
-       if ((res = mp_submod(&(a->co[k]), &(b->co[k]), &(a->cha), &(t.co[k]))) != MP_OKAY) {
-          goto __T;
-       }
-   }
-   
-   mp_poly_clamp(&t);
-   mp_poly_exch(&t, c);
-   res = MP_OKAY;
-       
-__T:  mp_poly_clear(&t);
-   return res;
-}
-
-/* multiplies two polynomials, c(x) = a(x) * b(x) */
-int mp_poly_mul(mp_poly *a, mp_poly *b, mp_poly *c)
-{
-   mp_poly t;
-   mp_int  tt;
-   int res, pa, pb, ix, iy;
-   
-   /* ensure char's are the same */
-   if (mp_cmp(&(a->cha), &(b->cha)) != MP_EQ) {
-      return MP_VAL;
-   }
-   
-   /* degrees of a and b */
-   pa = a->used;
-   pb = b->used;
-   
-   /* now init the temp polynomial to be of degree pa+pb */
-   if ((res = mp_poly_init_size(&t, &(a->cha), pa+pb)) != MP_OKAY) {
-      return res;
-   }
-   
-   /* now init our temp int */
-   if ((res = mp_init(&tt)) != MP_OKAY) {
-      goto __T;
-   }
-   
-   /* now loop through all the digits */
-   for (ix = 0; ix < pa; ix++) {
-       for (iy = 0; iy < pb; iy++) {
-          if ((res = mp_mul(&(a->co[ix]), &(b->co[iy]), &tt)) != MP_OKAY) {
-             goto __TT;
-          }
-          if ((res = mp_addmod(&tt, &(t.co[ix+iy]), &(a->cha), &(t.co[ix+iy]))) != MP_OKAY) {
-             goto __TT;
-          }
-       }
-   }
-   
-   mp_poly_clamp(&t);
-   mp_poly_exch(&t, c);
-   res = MP_OKAY;
-   
-__TT: mp_clear(&tt);
-__T:  mp_poly_clear(&t);
-   return res;
-}
-
--- a/etclib/poly.h
+++ b/etclib/poly.h
@ -1,73 +0,0 @@
-/* LibTomMath, multiple-precision integer library -- Tom St Denis
- *
- * LibTomMath is library that provides for multiple-precision 
- * integer arithmetic as well as number theoretic functionality.
- *
- * This file "poly.h" provides GF(p^k) functionality on top of the 
- * libtommath library.
- * 
- * The library is designed directly after the MPI library by
- * Michael Fromberger but has been written from scratch with 
- * additional optimizations in place.  
- *
- * The library is free for all purposes without any express
- * guarantee it works.
- *
- * Tom St Denis, tomstdenis@iahu.ca, http://libtommath.iahu.ca
- */
-
-#ifndef POLY_H_
-#define POLY_H_
-
-#include "bn.h"
-
-/* a mp_poly is basically a derived "class" of a mp_int
- * it uses the same technique of growing arrays via 
- * used/alloc parameters except the base unit or "digit"
- * is in fact a mp_int.  These hold the coefficients
- * of the polynomial 
- */
-typedef struct {
-    int    used,    /* coefficients used */
-           alloc;   /* coefficients allocated (and initialized) */
-    mp_int *co,     /* coefficients */
-           cha;     /* characteristic */
-    
-} mp_poly;
-
-
-#define MP_POLY_PREC     16             /* default coefficients allocated */
-
-/* init a poly */
-int mp_poly_init(mp_poly *a, mp_int *cha);
-
-/* init a poly of a given (size) degree  */
-int mp_poly_init_size(mp_poly *a, mp_int *cha, int size);
-
-/* copy, b = a */
-int mp_poly_copy(mp_poly *a, mp_poly *b);
-
-/* init from a copy, a = b */
-int mp_poly_init_copy(mp_poly *a, mp_poly *b);
-
-/* free a poly from ram */
-void mp_poly_clear(mp_poly *a);
-
-/* exchange two polys */
-void mp_poly_exch(mp_poly *a, mp_poly *b);
-
-/* ---> Basic Arithmetic <--- */
-
-/* add two polynomials, c(x) = a(x) + b(x) */
-int mp_poly_add(mp_poly *a, mp_poly *b, mp_poly *c);
-
-/* subtracts two polynomials, c(x) = a(x) - b(x) */
-int mp_poly_sub(mp_poly *a, mp_poly *b, mp_poly *c);
-
-/* multiplies two polynomials, c(x) = a(x) * b(x) */
-int mp_poly_mul(mp_poly *a, mp_poly *b, mp_poly *c);
-
-
-
-#endif
-
--- a/logs/add.log
+++ b/logs/add.log
@ -1,16 +1,16 @@
-224   8622881
-448   7241417
-672   5844712
-896   4938016
-1120   4256543
-1344   3728000
-1568   3328263
-1792   3012161
-2016   2743688
-2240   2512095
-2464   2234464
-2688   1960139
-2912   2013395
-3136   1879636
-3360   1756301
-3584   1680982
+224  20297071
+448  15151383
+672  13088682
+896  11111587
+1120   9240621
+1344   8221878
+1568   7227434
+1792   6718051
+2016   6042524
+2240   5685200
+2464   5240465
+2688   4818032
+2912   4412794
+3136   4155883
+3360   3927078
+3584   3722138
--- a/logs/addsub.png
+++ b/logs/addsub.png
--- a/logs/expt.log
+++ b/logs/expt.log
@ -0,0 +1,7 @@
+513       745
+769       282
+1025       130
+2049        20
+2561        11
+3073         6
+4097         2
--- a/logs/expt.png
+++ b/logs/expt.png
--- a/logs/expt_2k.log
+++ b/logs/expt_2k.log
@ -0,0 +1,6 @@
+521       783
+607       585
+1279       138
+2203        39
+3217        15
+4253         6
--- a/logs/expt_dr.log
+++ b/logs/expt_dr.log
@ -0,0 +1,7 @@
+532      1296
+784       551
+1036       283
+1540       109
+2072        52
+3080        18
+4116         7
--- a/logs/graphs.dem
+++ b/logs/graphs.dem
@ -1,4 +1,4 @@
-set terminal png color
+set terminal png
 set size 1.75
 set ylabel "Operations per Second"
 set xlabel "Operand size (bits)"
--- a/logs/index.html
+++ b/logs/index.html
@ -1,24 +1,24 @@
-<html>
-<head>
-<title>LibTomMath Log Plots</title>
-</head>
-<body>
-
-<h1>Addition and Subtraction</h1>
-<center><img src=addsub.png></center>
-<hr>
-
-<h1>Multipliers</h1>
-<center><img src=mult.png></center>
-<hr>
-
-<h1>Exptmod</h1>
-<center><img src=expt.png></center>
-<hr>
-
-<h1>Modular Inverse</h1>
-<center><img src=invmod.png></center>
-<hr>
-
-</body>
+<html>
+<head>
+<title>LibTomMath Log Plots</title>
+</head>
+<body>
+
+<h1>Addition and Subtraction</h1>
+<center><img src=addsub.png></center>
+<hr>
+
+<h1>Multipliers</h1>
+<center><img src=mult.png></center>
+<hr>
+
+<h1>Exptmod</h1>
+<center><img src=expt.png></center>
+<hr>
+
+<h1>Modular Inverse</h1>
+<center><img src=invmod.png></center>
+<hr>
+
+</body>
 </html>
--- a/logs/invmod.log
+++ b/logs/invmod.log
@ -1,32 +1,32 @@
-112     14936
-224      7208
-336      6864
-448      5000
-560      3648
-672      1832
-784      1480
-896      1232
-1008      1010
-1120      1360
-1232       728
-1344       632
-1456       544
-1568       800
-1680       704
-1792       396
-1904       584
-2016       528
-2128       483
-2240       448
-2352       250
-2464       378
-2576       350
-2688       198
-2800       300
-2912       170
-3024       265
-3136       150
-3248       142
-3360       134
-3472       126
-3584       118
+112     17364
+224      8643
+336      8867
+448      6228
+560      4737
+672      2259
+784      2899
+896      1497
+1008      1238
+1120      1010
+1232       870
+1344      1265
+1456      1102
+1568       981
+1680       539
+1792       484
+1904       722
+2016       392
+2128       604
+2240       551
+2352       511
+2464       469
+2576       263
+2688       247
+2800       227
+2912       354
+3024       336
+3136       312
+3248       296
+3360       166
+3472       155
+3584       248
--- a/logs/invmod.png
+++ b/logs/invmod.png
--- a/logs/k7/index.html
+++ b/logs/k7/index.html
@ -1,24 +1,24 @@
-<html>
-<head>
-<title>LibTomMath Log Plots</title>
-</head>
-<body>
-
-<h1>Addition and Subtraction</h1>
-<center><img src=addsub.png></center>
-<hr>
-
-<h1>Multipliers</h1>
-<center><img src=mult.png></center>
-<hr>
-
-<h1>Exptmod</h1>
-<center><img src=expt.png></center>
-<hr>
-
-<h1>Modular Inverse</h1>
-<center><img src=invmod.png></center>
-<hr>
-
-</body>
+<html>
+<head>
+<title>LibTomMath Log Plots</title>
+</head>
+<body>
+
+<h1>Addition and Subtraction</h1>
+<center><img src=addsub.png></center>
+<hr>
+
+<h1>Multipliers</h1>
+<center><img src=mult.png></center>
+<hr>
+
+<h1>Exptmod</h1>
+<center><img src=expt.png></center>
+<hr>
+
+<h1>Modular Inverse</h1>
+<center><img src=invmod.png></center>
+<hr>
+
+</body>
 </html>
--- a/logs/mult.log
+++ b/logs/mult.log
@ -1,17 +1,33 @@
-896    348504
-1344    165040
-1792     98696
-2240     65400
-2688     46672
-3136     34968
-3584     27144
-4032     21648
-4480     17672
-4928     14768
-5376     12416
-5824     10696
-6272      9184
-6720      8064
-7168      1896
-7616      1680
-8064      1504
+920    374785
+1142    242737
+1371    176704
+1596    134341
+1816    105537
+2044     85089
+2268     70051
+2490     58671
+2716     49851
+2937     42881
+3162     37288
+3387     32697
+3608     28915
+3836     25759
+4057     23088
+4284     20800
+4508     18827
+4730     17164
+4956     15689
+5180     14397
+5398     13260
+5628     12249
+5852     11346
+6071     10537
+6298      9812
+6522      9161
+6742      8572
+6971      8038
+7195      2915
+7419      2744
+7644      2587
+7866      2444
+8090      2311
--- a/logs/mult.png
+++ b/logs/mult.png
--- a/logs/mult_kara.log
+++ b/logs/mult_kara.log
@ -1,17 +1,33 @@
-896    301784
-1344    141568
-1792     84592
-2240     55864
-2688     39576
-3136     30088
-3584     24032
-4032     19760
-4480     16536
-4928     13376
-5376     11880
-5824     10256
-6272      9160
-6720      8208
-7168      7384
-7616      6664
-8064      6112
+924    374171
+1147    243163
+1371    177111
+1596    134465
+1819    105619
+2044     85145
+2266     70086
+2488     58717
+2715     49869
+2939     42894
+3164     37389
+3387     33510
+3610     29993
+3836     27205
+4060     24751
+4281     22576
+4508     20670
+4732     19019
+4954     17527
+5180     16217
+5404     15044
+5624     14003
+5849     13051
+6076     12067
+6300     11438
+6524     10772
+6748     10298
+6972      9715
+7195      9330
+7416      8836
+7644      8465
+7864      8042
+8091      7735
--- a/logs/p4/index.html
+++ b/logs/p4/index.html
@ -1,24 +1,24 @@
-<html>
-<head>
-<title>LibTomMath Log Plots</title>
-</head>
-<body>
-
-<h1>Addition and Subtraction</h1>
-<center><img src=addsub.png></center>
-<hr>
-
-<h1>Multipliers</h1>
-<center><img src=mult.png></center>
-<hr>
-
-<h1>Exptmod</h1>
-<center><img src=expt.png></center>
-<hr>
-
-<h1>Modular Inverse</h1>
-<center><img src=invmod.png></center>
-<hr>
-
-</body>
+<html>
+<head>
+<title>LibTomMath Log Plots</title>
+</head>
+<body>
+
+<h1>Addition and Subtraction</h1>
+<center><img src=addsub.png></center>
+<hr>
+
+<h1>Multipliers</h1>
+<center><img src=mult.png></center>
+<hr>
+
+<h1>Exptmod</h1>
+<center><img src=expt.png></center>
+<hr>
+
+<h1>Modular Inverse</h1>
+<center><img src=invmod.png></center>
+<hr>
+
+</body>
 </html>
--- a/logs/sqr.log
+++ b/logs/sqr.log
@ -1,17 +1,33 @@
-911    167013
-1359     83796
-1807     50308
-2254     33637
-2703     24067
-3151     17997
-3599      5751
-4047      4561
-4490      3714
-4943      3067
-5391      2597
-5839      2204
-6286      1909
-6735      1637
-7183      1461
-7631      1302
-8078      1158
+922    471095
+1147    337137
+1366    254327
+1596    199732
+1819    161225
+2044    132852
+2268    111493
+2490     94864
+2715     81745
+2940     71187
+3162     62575
+3387     55418
+3612     14540
+3836     12944
+4060     11627
+4281     10546
+4508      9502
+4730      8688
+4954      7937
+5180      7273
+5402      6701
+5627      6189
+5850      5733
+6076      5310
+6300      4933
+6522      4631
+6748      4313
+6971      4064
+7196      3801
+7420      3576
+7642      3388
+7868      3191
+8092      3020
--- a/logs/sqr_kara.log
+++ b/logs/sqr_kara.log
@ -1,17 +1,33 @@
-910    165312
-1358     84355
-1806     50316
-2255     33661
-2702     24027
-3151     18068
-3599     14721
-4046     12101
-4493     10112
-4942      8591
-5390      7364
-5839      6398
-6285      5607
-6735      4952
-7182      4625
-7631      4193
-8079      3810
+922    470930
+1148    337217
+1372    254433
+1596    199827
+1820    161204
+2043    132871
+2267    111522
+2488     94932
+2714     81814
+2939     71231
+3164     62616
+3385     55467
+3611     44426
+3836     40695
+4060     37391
+4283     34371
+4508     31779
+4732     29499
+4956     27426
+5177     25598
+5403     23944
+5628     22416
+5851     21052
+6076     19781
+6299     18588
+6523     17539
+6746     16618
+6972     15705
+7196     13582
+7420     13004
+7643     12496
+7868     11963
+8092     11497
--- a/logs/sub.log
+++ b/logs/sub.log
@ -1,16 +1,16 @@
-224  10295756
-448   7577910
-672   6279588
-896   5345182
-1120   4646989
-1344   4101759
-1568   3685447
-1792   3337497
-2016   3051095
-2240   2811900
-2464   2605371
-2688   2420561
-2912   2273174
-3136   2134662
-3360   2014354
-3584   1901723
+224  16370431
+448  13327848
+672  11009401
+896   9125342
+1120   7930419
+1344   7114040
+1568   6506998
+1792   5899346
+2016   5435327
+2240   5038931
+2464   4696364
+2688   4425678
+2912   4134476
+3136   3913280
+3360   3692536
+3584   3505219
--- a/26
+++ b/26
@ -1,7 +1,7 @@
 #Makefile for GCC
 #
 #Tom St Denis
-CFLAGS  +=  -I./ -Wall -W -Wshadow 
+CFLAGS  +=  -I./ -Wall -W -Wshadow -Wsign-compare

 #for speed 
 CFLAGS += -O3 -funroll-loops
@ -12,7 +12,7 @@ CFLAGS += -O3 -funroll-loops
 #x86 optimizations [should be valid for any GCC install though]
 CFLAGS  += -fomit-frame-pointer

-VERSION=0.29
+VERSION=0.30

 default: libtommath.a

@ -50,7 +50,9 @@ bn_mp_toom_mul.o bn_mp_toom_sqr.o bn_mp_div_3.o bn_s_mp_exptmod.o \
 bn_mp_reduce_2k.o bn_mp_reduce_is_2k.o bn_mp_reduce_2k_setup.o \
 bn_mp_radix_smap.o bn_mp_read_radix.o bn_mp_toradix.o bn_mp_radix_size.o \
 bn_mp_fread.o bn_mp_fwrite.o bn_mp_cnt_lsb.o bn_error.o \
-bn_mp_init_multi.o bn_mp_clear_multi.o bn_mp_prime_random.o bn_prime_sizes_tab.o bn_mp_exteuclid.o
+bn_mp_init_multi.o bn_mp_clear_multi.o bn_prime_sizes_tab.o bn_mp_exteuclid.o bn_mp_toradix_n.o \
+bn_mp_prime_random_ex.o bn_mp_get_int.o bn_mp_sqrt.o bn_mp_is_square.o bn_mp_init_set.o \
+bn_mp_init_set_int.o

 libtommath.a:  $(OBJECTS)
 	$(AR) $(ARFLAGS) libtommath.a $(OBJECTS)
@ -64,6 +66,8 @@ install: libtommath.a

 test: libtommath.a demo/demo.o
 	$(CC) demo/demo.o libtommath.a -o test
+	
+mtest: test	
 	cd mtest ; $(CC) $(CFLAGS) mtest.c -o mtest -s
        
 timing: libtommath.a
@ -75,23 +79,18 @@ docdvi: tommath.src
 	echo "hello" > tommath.ind
 	perl booker.pl
 	latex tommath > /dev/null
+	latex tommath > /dev/null
 	makeindex tommath
 	latex tommath > /dev/null
-		
+
 # poster, makes the single page PDF poster
 poster: poster.tex
 	pdflatex poster
 	rm -f poster.aux poster.log 

-# makes the LTM book PS/PDF file, requires tetex, cleans up the LaTeX temp files
-docs:	
-	cd pics ; make pdfes
-	echo "hello" > tommath.ind
-	perl booker.pl PDF 
-	latex tommath > /dev/null
-	makeindex tommath
-	latex tommath > /dev/null
-	pdflatex tommath
+# makes the LTM book PDF file, requires tetex, cleans up the LaTeX temp files
+docs:   docdvi
+	dvipdf tommath
 	rm -f tommath.log tommath.aux tommath.dvi tommath.idx tommath.toc tommath.lof tommath.ind tommath.ilg
 	cd pics ; make clean
 	
@ -99,6 +98,7 @@ docs:
 mandvi: bn.tex
 	echo "hello" > bn.ind
 	latex bn > /dev/null
+	latex bn > /dev/null
 	makeindex bn
 	latex bn > /dev/null

--- a/makefile.bcc
+++ b/makefile.bcc
@ -29,12 +29,13 @@ bn_mp_toom_mul.obj bn_mp_toom_sqr.obj bn_mp_div_3.obj bn_s_mp_exptmod.obj \
 bn_mp_reduce_2k.obj bn_mp_reduce_is_2k.obj bn_mp_reduce_2k_setup.obj \
 bn_mp_radix_smap.obj bn_mp_read_radix.obj bn_mp_toradix.obj bn_mp_radix_size.obj \
 bn_mp_fread.obj bn_mp_fwrite.obj bn_mp_cnt_lsb.obj bn_error.obj \
-bn_mp_init_multi.obj bn_mp_clear_multi.obj bn_mp_prime_random.obj bn_prime_sizes_tab.obj bn_mp_exteuclid.obj
+bn_mp_init_multi.obj bn_mp_clear_multi.obj bn_prime_sizes_tab.obj bn_mp_exteuclid.obj bn_mp_toradix_n.obj \
+bn_mp_prime_random_ex.obj bn_mp_get_int.obj bn_mp_sqrt.obj bn_mp_is_square.obj

 TARGET = libtommath.lib

 $(TARGET): $(OBJECTS)

-.c.objbj:
+.c.objbjbjbj:
 	$(CC) $(CFLAGS) $<
 	$(LIB) $(TARGET) -+$@
--- a/makefile.cygwin_dll
+++ b/makefile.cygwin_dll
@ -34,7 +34,8 @@ bn_mp_toom_mul.o bn_mp_toom_sqr.o bn_mp_div_3.o bn_s_mp_exptmod.o \
 bn_mp_reduce_2k.o bn_mp_reduce_is_2k.o bn_mp_reduce_2k_setup.o \
 bn_mp_radix_smap.o bn_mp_read_radix.o bn_mp_toradix.o bn_mp_radix_size.o \
 bn_mp_fread.o bn_mp_fwrite.o bn_mp_cnt_lsb.o bn_error.o \
-bn_mp_init_multi.o bn_mp_clear_multi.o bn_mp_prime_random.o bn_prime_sizes_tab.o bn_mp_exteuclid.o
+bn_mp_init_multi.o bn_mp_clear_multi.o bn_prime_sizes_tab.o bn_mp_exteuclid.o bn_mp_toradix_n.o \
+bn_mp_prime_random_ex.o bn_mp_get_int.o bn_mp_sqrt.o bn_mp_is_square.o

 # make a Windows DLL via Cygwin
 windll:  $(OBJECTS)
--- a/makefile.msvc
+++ b/makefile.msvc
@ -28,7 +28,8 @@ bn_mp_toom_mul.obj bn_mp_toom_sqr.obj bn_mp_div_3.obj bn_s_mp_exptmod.obj \
 bn_mp_reduce_2k.obj bn_mp_reduce_is_2k.obj bn_mp_reduce_2k_setup.obj \
 bn_mp_radix_smap.obj bn_mp_read_radix.obj bn_mp_toradix.obj bn_mp_radix_size.obj \
 bn_mp_fread.obj bn_mp_fwrite.obj bn_mp_cnt_lsb.obj bn_error.obj \
-bn_mp_init_multi.obj bn_mp_clear_multi.obj bn_mp_prime_random.obj bn_prime_sizes_tab.obj bn_mp_exteuclid.obj
+bn_mp_init_multi.obj bn_mp_clear_multi.obj bn_prime_sizes_tab.obj bn_mp_exteuclid.obj bn_mp_toradix_n.obj \
+bn_mp_prime_random_ex.obj bn_mp_get_int.obj bn_mp_sqrt.obj bn_mp_is_square.obj

 library: $(OBJECTS)
 	lib /out:tommath.lib $(OBJECTS)
--- a/mtest/logtab.h
+++ b/mtest/logtab.h
@ -1,20 +1,20 @@
-const float s_logv_2[] = {
-   0.000000000, 0.000000000, 1.000000000, 0.630929754, 	/*  0  1  2  3 */
-   0.500000000, 0.430676558, 0.386852807, 0.356207187, 	/*  4  5  6  7 */
-   0.333333333, 0.315464877, 0.301029996, 0.289064826, 	/*  8  9 10 11 */
-   0.278942946, 0.270238154, 0.262649535, 0.255958025, 	/* 12 13 14 15 */
-   0.250000000, 0.244650542, 0.239812467, 0.235408913, 	/* 16 17 18 19 */
-   0.231378213, 0.227670249, 0.224243824, 0.221064729, 	/* 20 21 22 23 */
-   0.218104292, 0.215338279, 0.212746054, 0.210309918, 	/* 24 25 26 27 */
-   0.208014598, 0.205846832, 0.203795047, 0.201849087, 	/* 28 29 30 31 */
-   0.200000000, 0.198239863, 0.196561632, 0.194959022, 	/* 32 33 34 35 */
-   0.193426404, 0.191958720, 0.190551412, 0.189200360, 	/* 36 37 38 39 */
-   0.187901825, 0.186652411, 0.185449023, 0.184288833, 	/* 40 41 42 43 */
-   0.183169251, 0.182087900, 0.181042597, 0.180031327, 	/* 44 45 46 47 */
-   0.179052232, 0.178103594, 0.177183820, 0.176291434, 	/* 48 49 50 51 */
-   0.175425064, 0.174583430, 0.173765343, 0.172969690, 	/* 52 53 54 55 */
-   0.172195434, 0.171441601, 0.170707280, 0.169991616, 	/* 56 57 58 59 */
-   0.169293808, 0.168613099, 0.167948779, 0.167300179, 	/* 60 61 62 63 */
-   0.166666667
-};
-
+const float s_logv_2[] = {
+   0.000000000, 0.000000000, 1.000000000, 0.630929754, 	/*  0  1  2  3 */
+   0.500000000, 0.430676558, 0.386852807, 0.356207187, 	/*  4  5  6  7 */
+   0.333333333, 0.315464877, 0.301029996, 0.289064826, 	/*  8  9 10 11 */
+   0.278942946, 0.270238154, 0.262649535, 0.255958025, 	/* 12 13 14 15 */
+   0.250000000, 0.244650542, 0.239812467, 0.235408913, 	/* 16 17 18 19 */
+   0.231378213, 0.227670249, 0.224243824, 0.221064729, 	/* 20 21 22 23 */
+   0.218104292, 0.215338279, 0.212746054, 0.210309918, 	/* 24 25 26 27 */
+   0.208014598, 0.205846832, 0.203795047, 0.201849087, 	/* 28 29 30 31 */
+   0.200000000, 0.198239863, 0.196561632, 0.194959022, 	/* 32 33 34 35 */
+   0.193426404, 0.191958720, 0.190551412, 0.189200360, 	/* 36 37 38 39 */
+   0.187901825, 0.186652411, 0.185449023, 0.184288833, 	/* 40 41 42 43 */
+   0.183169251, 0.182087900, 0.181042597, 0.180031327, 	/* 44 45 46 47 */
+   0.179052232, 0.178103594, 0.177183820, 0.176291434, 	/* 48 49 50 51 */
+   0.175425064, 0.174583430, 0.173765343, 0.172969690, 	/* 52 53 54 55 */
+   0.172195434, 0.171441601, 0.170707280, 0.169991616, 	/* 56 57 58 59 */
+   0.169293808, 0.168613099, 0.167948779, 0.167300179, 	/* 60 61 62 63 */
+   0.166666667
+};
+
--- a/mtest/mpi-config.h
+++ b/mtest/mpi-config.h
@ -1,86 +1,86 @@
-/* Default configuration for MPI library */
-/* $ID$ */
-
-#ifndef MPI_CONFIG_H_
-#define MPI_CONFIG_H_
-
-/*
-  For boolean options, 
-  0 = no
-  1 = yes
-
-  Other options are documented individually.
-
- */
-
-#ifndef MP_IOFUNC
-#define MP_IOFUNC     0  /* include mp_print() ?                */
-#endif
-
-#ifndef MP_MODARITH
-#define MP_MODARITH   1  /* include modular arithmetic ?        */
-#endif
-
-#ifndef MP_NUMTH
-#define MP_NUMTH      1  /* include number theoretic functions? */
-#endif
-
-#ifndef MP_LOGTAB
-#define MP_LOGTAB     1  /* use table of logs instead of log()? */
-#endif
-
-#ifndef MP_MEMSET
-#define MP_MEMSET     1  /* use memset() to zero buffers?       */
-#endif
-
-#ifndef MP_MEMCPY
-#define MP_MEMCPY     1  /* use memcpy() to copy buffers?       */
-#endif
-
-#ifndef MP_CRYPTO
-#define MP_CRYPTO     1  /* erase memory on free?               */
-#endif
-
-#ifndef MP_ARGCHK
-/*
-  0 = no parameter checks
-  1 = runtime checks, continue execution and return an error to caller
-  2 = assertions; dump core on parameter errors
- */
-#define MP_ARGCHK     2  /* how to check input arguments        */
-#endif
-
-#ifndef MP_DEBUG
-#define MP_DEBUG      0  /* print diagnostic output?            */
-#endif
-
-#ifndef MP_DEFPREC
-#define MP_DEFPREC    64 /* default precision, in digits        */
-#endif
-
-#ifndef MP_MACRO
-#define MP_MACRO      1  /* use macros for frequent calls?      */
-#endif
-
-#ifndef MP_SQUARE
-#define MP_SQUARE     1  /* use separate squaring code?         */
-#endif
-
-#ifndef MP_PTAB_SIZE
-/*
-  When building mpprime.c, we build in a table of small prime
-  values to use for primality testing.  The more you include,
-  the more space they take up.  See primes.c for the possible
-  values (currently 16, 32, 64, 128, 256, and 6542)
- */
-#define MP_PTAB_SIZE  128  /* how many built-in primes?         */
-#endif
-
-#ifndef MP_COMPAT_MACROS
-#define MP_COMPAT_MACROS 1   /* define compatibility macros?    */
-#endif
-
-#endif /* ifndef MPI_CONFIG_H_ */
-
-
-/* crc==3287762869, version==2, Sat Feb 02 06:43:53 2002 */
+/* Default configuration for MPI library */
+/* $ID$ */
+
+#ifndef MPI_CONFIG_H_
+#define MPI_CONFIG_H_
+
+/*
+  For boolean options, 
+  0 = no
+  1 = yes
+
+  Other options are documented individually.
+
+ */
+
+#ifndef MP_IOFUNC
+#define MP_IOFUNC     0  /* include mp_print() ?                */
+#endif
+
+#ifndef MP_MODARITH
+#define MP_MODARITH   1  /* include modular arithmetic ?        */
+#endif
+
+#ifndef MP_NUMTH
+#define MP_NUMTH      1  /* include number theoretic functions? */
+#endif
+
+#ifndef MP_LOGTAB
+#define MP_LOGTAB     1  /* use table of logs instead of log()? */
+#endif
+
+#ifndef MP_MEMSET
+#define MP_MEMSET     1  /* use memset() to zero buffers?       */
+#endif
+
+#ifndef MP_MEMCPY
+#define MP_MEMCPY     1  /* use memcpy() to copy buffers?       */
+#endif
+
+#ifndef MP_CRYPTO
+#define MP_CRYPTO     1  /* erase memory on free?               */
+#endif
+
+#ifndef MP_ARGCHK
+/*
+  0 = no parameter checks
+  1 = runtime checks, continue execution and return an error to caller
+  2 = assertions; dump core on parameter errors
+ */
+#define MP_ARGCHK     2  /* how to check input arguments        */
+#endif
+
+#ifndef MP_DEBUG
+#define MP_DEBUG      0  /* print diagnostic output?            */
+#endif
+
+#ifndef MP_DEFPREC
+#define MP_DEFPREC    64 /* default precision, in digits        */
+#endif
+
+#ifndef MP_MACRO
+#define MP_MACRO      1  /* use macros for frequent calls?      */
+#endif
+
+#ifndef MP_SQUARE
+#define MP_SQUARE     1  /* use separate squaring code?         */
+#endif
+
+#ifndef MP_PTAB_SIZE
+/*
+  When building mpprime.c, we build in a table of small prime
+  values to use for primality testing.  The more you include,
+  the more space they take up.  See primes.c for the possible
+  values (currently 16, 32, 64, 128, 256, and 6542)
+ */
+#define MP_PTAB_SIZE  128  /* how many built-in primes?         */
+#endif
+
+#ifndef MP_COMPAT_MACROS
+#define MP_COMPAT_MACROS 1   /* define compatibility macros?    */
+#endif
+
+#endif /* ifndef MPI_CONFIG_H_ */
+
+
+/* crc==3287762869, version==2, Sat Feb 02 06:43:53 2002 */
--- a/mtest/mpi.c
+++ b/mtest/mpi.c
--- a/mtest/mtest.c
+++ b/mtest/mtest.c
@ -110,7 +110,7 @@ int main(void)
   t1 = clock();
   for (;;) {
      if (clock() - t1 > CLOCKS_PER_SEC) {
-         sleep(1);
+         sleep(2);
         t1 = clock();
      }

--- a/pics/makefile~
+++ b/pics/makefile~
@ -1,35 +0,0 @@
-# makes the images... yeah
-
-default:  pses
-
-design_process.ps: design_process.tif
-	tiff2ps -s -e design_process.tif > design_process.ps
-
-sliding_window.ps: sliding_window.tif
-	tiff2ps -e sliding_window.tif > sliding_window.ps
-	
-expt_state.ps: expt_state.tif
-	tiff2ps -e expt_state.tif > expt_state.ps
-
-primality.ps: primality.tif
-	tiff2ps -e primality.tif > primality.ps
-
-design_process.pdf: design_process.ps
-	epstopdf design_process.ps
-
-sliding_window.pdf: sliding_window.ps
-	epstopdf sliding_window.ps
-	
-expt_state.pdf: expt_state.ps
-	epstopdf expt_state.ps
-
-primality.pdf: primality.ps
-	epstopdf primality.ps
-
-
-pses: sliding_window.ps expt_state.ps primality.ps design_process.ps
-pdfes: sliding_window.pdf expt_state.pdf primality.pdf design_process.pdf
-
-clean:
-	rm -rf *.ps *.pdf .xvpics
-   
--- a/poster.pdf
+++ b/poster.pdf
--- a/pre_gen/mpi.c
+++ b/pre_gen/mpi.c
@ -631,8 +631,7 @@ fast_s_mp_mul_high_digs (mp_int * a, mp_int * b, mp_int * c, int digs)
 * Based on Algorithm 14.16 on pp.597 of HAC.
 *
 */
-int
-fast_s_mp_sqr (mp_int * a, mp_int * b)
+int fast_s_mp_sqr (mp_int * a, mp_int * b)
 {
  int     olduse, newused, res, ix, pa;
  mp_word W2[MP_WARRAY], W[MP_WARRAY];
@ -1345,11 +1344,15 @@ int mp_cmp_mag (mp_int * a, mp_int * b)
 */
 #include <tommath.h>

+static const int lnz[16] = { 
+   4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0
+};
+
 /* Counts the number of lsbs which are zero before the first zero bit */
 int mp_cnt_lsb(mp_int *a)
 {
   int x;
-   mp_digit q;
+   mp_digit q, qq;

   /* easy out */
   if (mp_iszero(a) == 1) {
@ -1362,11 +1365,13 @@ int mp_cnt_lsb(mp_int *a)
   x *= DIGIT_BIT;

   /* now scan this digit until a 1 is found */
-   while ((q & 1) == 0) {
-      q >>= 1;
-      x  += 1;
+   if ((q & 1) == 0) {
+      do {
+         qq  = q & 15;
+         x  += lnz[qq];
+         q >>= 4;
+      } while (qq == 0);
   }
-
   return x;
 }

@ -2665,75 +2670,75 @@ __M:
 /* End: bn_mp_exptmod_fast.c */

 /* Start: bn_mp_exteuclid.c */
-/* LibTomMath, multiple-precision integer library -- Tom St Denis
- *
- * LibTomMath is a library that provides multiple-precision
- * integer arithmetic as well as number theoretic functionality.
- *
- * The library was designed directly after the MPI library by
- * Michael Fromberger but has been written from scratch with
- * additional optimizations in place.
- *
- * The library is free for all purposes without any express
- * guarantee it works.
- *
- * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
- */
-#include <tommath.h>
-
-/* Extended euclidean algorithm of (a, b) produces 
-   a*u1 + b*u2 = u3
- */
-int mp_exteuclid(mp_int *a, mp_int *b, mp_int *U1, mp_int *U2, mp_int *U3)
-{
-   mp_int u1,u2,u3,v1,v2,v3,t1,t2,t3,q,tmp;
-   int err;
-
-   if ((err = mp_init_multi(&u1, &u2, &u3, &v1, &v2, &v3, &t1, &t2, &t3, &q, &tmp, NULL)) != MP_OKAY) {
-      return err;
-   }
-
-   /* initialize, (u1,u2,u3) = (1,0,a) */
-   mp_set(&u1, 1);
-   if ((err = mp_copy(a, &u3)) != MP_OKAY)                                        { goto _ERR; }
-
-   /* initialize, (v1,v2,v3) = (0,1,b) */
-   mp_set(&v2, 1);
-   if ((err = mp_copy(b, &v3)) != MP_OKAY)                                        { goto _ERR; }
-
-   /* loop while v3 != 0 */
-   while (mp_iszero(&v3) == MP_NO) {
-       /* q = u3/v3 */
-       if ((err = mp_div(&u3, &v3, &q, NULL)) != MP_OKAY)                         { goto _ERR; }
-
-       /* (t1,t2,t3) = (u1,u2,u3) - (v1,v2,v3)q */
-       if ((err = mp_mul(&v1, &q, &tmp)) != MP_OKAY)                              { goto _ERR; }
-       if ((err = mp_sub(&u1, &tmp, &t1)) != MP_OKAY)                             { goto _ERR; }
-       if ((err = mp_mul(&v2, &q, &tmp)) != MP_OKAY)                              { goto _ERR; }
-       if ((err = mp_sub(&u2, &tmp, &t2)) != MP_OKAY)                             { goto _ERR; }
-       if ((err = mp_mul(&v3, &q, &tmp)) != MP_OKAY)                              { goto _ERR; }
-       if ((err = mp_sub(&u3, &tmp, &t3)) != MP_OKAY)                             { goto _ERR; }
-
-       /* (u1,u2,u3) = (v1,v2,v3) */
-       if ((err = mp_copy(&v1, &u1)) != MP_OKAY)                                  { goto _ERR; }
-       if ((err = mp_copy(&v2, &u2)) != MP_OKAY)                                  { goto _ERR; }
-       if ((err = mp_copy(&v3, &u3)) != MP_OKAY)                                  { goto _ERR; }
-
-       /* (v1,v2,v3) = (t1,t2,t3) */
-       if ((err = mp_copy(&t1, &v1)) != MP_OKAY)                                  { goto _ERR; }
-       if ((err = mp_copy(&t2, &v2)) != MP_OKAY)                                  { goto _ERR; }
-       if ((err = mp_copy(&t3, &v3)) != MP_OKAY)                                  { goto _ERR; }
-   }
-
-   /* copy result out */
-   if (U1 != NULL) { mp_exch(U1, &u1); }
-   if (U2 != NULL) { mp_exch(U2, &u2); }
-   if (U3 != NULL) { mp_exch(U3, &u3); }
-
-   err = MP_OKAY;
-_ERR: mp_clear_multi(&u1, &u2, &u3, &v1, &v2, &v3, &t1, &t2, &t3, &q, &tmp, NULL);
-   return err;
-}
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is a library that provides multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library was designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+#include <tommath.h>
+
+/* Extended euclidean algorithm of (a, b) produces 
+   a*u1 + b*u2 = u3
+ */
+int mp_exteuclid(mp_int *a, mp_int *b, mp_int *U1, mp_int *U2, mp_int *U3)
+{
+   mp_int u1,u2,u3,v1,v2,v3,t1,t2,t3,q,tmp;
+   int err;
+
+   if ((err = mp_init_multi(&u1, &u2, &u3, &v1, &v2, &v3, &t1, &t2, &t3, &q, &tmp, NULL)) != MP_OKAY) {
+      return err;
+   }
+
+   /* initialize, (u1,u2,u3) = (1,0,a) */
+   mp_set(&u1, 1);
+   if ((err = mp_copy(a, &u3)) != MP_OKAY)                                        { goto _ERR; }
+
+   /* initialize, (v1,v2,v3) = (0,1,b) */
+   mp_set(&v2, 1);
+   if ((err = mp_copy(b, &v3)) != MP_OKAY)                                        { goto _ERR; }
+
+   /* loop while v3 != 0 */
+   while (mp_iszero(&v3) == MP_NO) {
+       /* q = u3/v3 */
+       if ((err = mp_div(&u3, &v3, &q, NULL)) != MP_OKAY)                         { goto _ERR; }
+
+       /* (t1,t2,t3) = (u1,u2,u3) - (v1,v2,v3)q */
+       if ((err = mp_mul(&v1, &q, &tmp)) != MP_OKAY)                              { goto _ERR; }
+       if ((err = mp_sub(&u1, &tmp, &t1)) != MP_OKAY)                             { goto _ERR; }
+       if ((err = mp_mul(&v2, &q, &tmp)) != MP_OKAY)                              { goto _ERR; }
+       if ((err = mp_sub(&u2, &tmp, &t2)) != MP_OKAY)                             { goto _ERR; }
+       if ((err = mp_mul(&v3, &q, &tmp)) != MP_OKAY)                              { goto _ERR; }
+       if ((err = mp_sub(&u3, &tmp, &t3)) != MP_OKAY)                             { goto _ERR; }
+
+       /* (u1,u2,u3) = (v1,v2,v3) */
+       if ((err = mp_copy(&v1, &u1)) != MP_OKAY)                                  { goto _ERR; }
+       if ((err = mp_copy(&v2, &u2)) != MP_OKAY)                                  { goto _ERR; }
+       if ((err = mp_copy(&v3, &u3)) != MP_OKAY)                                  { goto _ERR; }
+
+       /* (v1,v2,v3) = (t1,t2,t3) */
+       if ((err = mp_copy(&t1, &v1)) != MP_OKAY)                                  { goto _ERR; }
+       if ((err = mp_copy(&t2, &v2)) != MP_OKAY)                                  { goto _ERR; }
+       if ((err = mp_copy(&t3, &v3)) != MP_OKAY)                                  { goto _ERR; }
+   }
+
+   /* copy result out */
+   if (U1 != NULL) { mp_exch(U1, &u1); }
+   if (U2 != NULL) { mp_exch(U2, &u2); }
+   if (U3 != NULL) { mp_exch(U3, &u3); }
+
+   err = MP_OKAY;
+_ERR: mp_clear_multi(&u1, &u2, &u3, &v1, &v2, &v3, &t1, &t2, &t3, &q, &tmp, NULL);
+   return err;
+}

 /* End: bn_mp_exteuclid.c */

@ -2828,7 +2833,7 @@ int mp_fwrite(mp_int *a, int radix, FILE *stream)
      return err;
   }

-   buf = XMALLOC (len);
+   buf = OPT_CAST(char) XMALLOC (len);
   if (buf == NULL) {
      return MP_MEM;
   }
@ -2963,6 +2968,49 @@ __U:mp_clear (&v);

 /* End: bn_mp_gcd.c */

+/* Start: bn_mp_get_int.c */
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is a library that provides multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library was designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+#include <tommath.h>
+
+/* get the lower 32-bits of an mp_int */
+unsigned long mp_get_int(mp_int * a) 
+{
+  int i;
+  unsigned long res;
+
+  if (a->used == 0) {
+     return 0;
+  }
+
+  /* get number of digits of the lsb we have to read */
+  i = MIN(a->used,(int)((sizeof(unsigned long)*CHAR_BIT+DIGIT_BIT-1)/DIGIT_BIT))-1;
+
+  /* get most significant digit of result */
+  res = DIGIT(a,i);
+   
+  while (--i >= 0) {
+    res = (res << DIGIT_BIT) | DIGIT(a,i);
+  }
+
+  /* force result to 32-bits always so it is consistent on non 32-bit platforms */
+  return res & 0xFFFFFFFFUL;
+}
+
+/* End: bn_mp_get_int.c */
+
 /* Start: bn_mp_grow.c */
 /* LibTomMath, multiple-precision integer library -- Tom St Denis
 *
@ -2997,7 +3045,7 @@ int mp_grow (mp_int * a, int size)
     * in case the operation failed we don't want
     * to overwrite the dp member of a.
     */
-    tmp = OPT_CAST XREALLOC (a->dp, sizeof (mp_digit) * size);
+    tmp = OPT_CAST(mp_digit) XREALLOC (a->dp, sizeof (mp_digit) * size);
    if (tmp == NULL) {
      /* reallocation failed but "a" is still valid [can be freed] */
      return MP_MEM;
@ -3039,7 +3087,7 @@ int mp_grow (mp_int * a, int size)
 int mp_init (mp_int * a)
 {
  /* allocate memory required and clear it */
-  a->dp = OPT_CAST XCALLOC (sizeof (mp_digit), MP_PREC);
+  a->dp = OPT_CAST(mp_digit) XCALLOC (sizeof (mp_digit), MP_PREC);
  if (a->dp == NULL) {
    return MP_MEM;
  }
@ -3142,6 +3190,65 @@ int mp_init_multi(mp_int *mp, ...)

 /* End: bn_mp_init_multi.c */

+/* Start: bn_mp_init_set.c */
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is a library that provides multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library was designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+#include <tommath.h>
+
+/* initialize and set a digit */
+int mp_init_set (mp_int * a, mp_digit b)
+{
+  int err;
+  if ((err = mp_init(a)) != MP_OKAY) {
+     return err;
+  }
+  mp_set(a, b);
+  return err;
+}
+
+/* End: bn_mp_init_set.c */
+
+/* Start: bn_mp_init_set_int.c */
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is a library that provides multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library was designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+#include <tommath.h>
+
+/* initialize and set a digit */
+int mp_init_set_int (mp_int * a, unsigned long b)
+{
+  int err;
+  if ((err = mp_init(a)) != MP_OKAY) {
+     return err;
+  }
+  return mp_set_int(a, b);
+}
+
+/* End: bn_mp_init_set_int.c */
+
 /* Start: bn_mp_init_size.c */
 /* LibTomMath, multiple-precision integer library -- Tom St Denis
 *
@ -3166,7 +3273,7 @@ int mp_init_size (mp_int * a, int size)
  size += (MP_PREC * 2) - (size % MP_PREC);	
  
  /* alloc mem */
-  a->dp = OPT_CAST XCALLOC (sizeof (mp_digit), size);
+  a->dp = OPT_CAST(mp_digit) XCALLOC (sizeof (mp_digit), size);
  if (a->dp == NULL) {
    return MP_MEM;
  }
@ -3357,6 +3464,113 @@ __ERR:mp_clear_multi (&x, &y, &u, &v, &A, &B, &C, &D, NULL);

 /* End: bn_mp_invmod.c */

+/* Start: bn_mp_is_square.c */
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is a library that provides multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library was designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+#include <tommath.h>
+
+/* Check if remainders are possible squares - fast exclude non-squares */
+static const char rem_128[128] = {
+ 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1,
+ 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1,
+ 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1,
+ 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1,
+ 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1,
+ 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1,
+ 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1,
+ 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1
+};
+
+static const char rem_105[105] = {
+ 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1,
+ 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1,
+ 0, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1,
+ 1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1,
+ 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1,
+ 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1,
+ 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1
+};
+
+/* Store non-zero to ret if arg is square, and zero if not */
+int mp_is_square(mp_int *arg,int *ret) 
+{
+  int           res;
+  mp_digit      c;
+  mp_int        t;
+  unsigned long r;
+
+  /* Default to Non-square :) */
+  *ret = MP_NO; 
+
+  if (arg->sign == MP_NEG) {
+    return MP_VAL;
+  }
+
+  /* digits used?  (TSD) */
+  if (arg->used == 0) {
+     return MP_OKAY;
+  }
+
+  /* First check mod 128 (suppose that DIGIT_BIT is at least 7) */
+  if (rem_128[127 & DIGIT(arg,0)] == 1) {
+     return MP_OKAY;
+  }
+
+  /* Next check mod 105 (3*5*7) */
+  if ((res = mp_mod_d(arg,105,&c)) != MP_OKAY) {
+     return res;
+  }
+  if (rem_105[c] == 1) {
+     return MP_OKAY;
+  }
+
+  /* product of primes less than 2^31 */
+  if ((res = mp_init_set_int(&t,11L*13L*17L*19L*23L*29L*31L)) != MP_OKAY) {
+     return res;
+  }
+  if ((res = mp_mod(arg,&t,&t)) != MP_OKAY) {
+     goto ERR;
+  }
+  r = mp_get_int(&t);
+  /* Check for other prime modules, note it's not an ERROR but we must
+   * free "t" so the easiest way is to goto ERR.  We know that res
+   * is already equal to MP_OKAY from the mp_mod call 
+   */ 
+  if ( (1L<<(r%11)) & 0x5C4L )             goto ERR;
+  if ( (1L<<(r%13)) & 0x9E4L )             goto ERR;
+  if ( (1L<<(r%17)) & 0x5CE8L )            goto ERR;
+  if ( (1L<<(r%19)) & 0x4F50CL )           goto ERR;
+  if ( (1L<<(r%23)) & 0x7ACCA0L )          goto ERR;
+  if ( (1L<<(r%29)) & 0xC2EDD0CL )         goto ERR;
+  if ( (1L<<(r%31)) & 0x6DE2B848L )        goto ERR;
+
+  /* Final check - is sqr(sqrt(arg)) == arg ? */
+  if ((res = mp_sqrt(arg,&t)) != MP_OKAY) {
+     goto ERR;
+  }
+  if ((res = mp_sqr(&t,&t)) != MP_OKAY) {
+     goto ERR;
+  }
+
+  *ret = (mp_cmp_mag(&t,arg) == MP_EQ) ? MP_YES : MP_NO;
+ERR:mp_clear(&t);
+  return res;
+}
+
+/* End: bn_mp_is_square.c */
+
 /* Start: bn_mp_jacobi.c */
 /* LibTomMath, multiple-precision integer library -- Tom St Denis
 *
@ -3506,8 +3720,7 @@ __A1:mp_clear (&a1);
 * Generally though the overhead of this method doesn't pay off 
 * until a certain size (N ~ 80) is reached.
 */
-int
-mp_karatsuba_mul (mp_int * a, mp_int * b, mp_int * c)
+int mp_karatsuba_mul (mp_int * a, mp_int * b, mp_int * c)
 {
  mp_int  x0, x1, y0, y1, t1, x0y0, x1y1;
  int     B, err;
@ -3519,7 +3732,7 @@ mp_karatsuba_mul (mp_int * a, mp_int * b, mp_int * c)
  B = MIN (a->used, b->used);

  /* now divide in two */
-  B = B / 2;
+  B = B >> 1;

  /* init copy all the temps */
  if (mp_init_size (&x0, B) != MP_OKAY)
@ -3653,8 +3866,7 @@ ERR:
 * is essentially the same algorithm but merely 
 * tuned to perform recursive squarings.
 */
-int
-mp_karatsuba_sqr (mp_int * a, mp_int * b)
+int mp_karatsuba_sqr (mp_int * a, mp_int * b)
 {
  mp_int  x0, x1, t1, t2, x0x0, x1x1;
  int     B, err;
@ -3665,7 +3877,7 @@ mp_karatsuba_sqr (mp_int * a, mp_int * b)
  B = a->used;

  /* now divide in two */
-  B = B / 2;
+  B = B >> 1;

  /* init copy all the temps */
  if (mp_init_size (&x0, B) != MP_OKAY)
@ -3896,7 +4108,6 @@ mp_mod (mp_int * a, mp_int * b, mp_int * c)
  mp_int  t;
  int     res;

-
  if ((res = mp_init (&t)) != MP_OKAY) {
    return res;
  }
@ -3906,7 +4117,7 @@ mp_mod (mp_int * a, mp_int * b, mp_int * c)
    return res;
  }

-  if (t.sign == MP_NEG) {
+  if (t.sign != b->sign) {
    res = mp_add (b, &t, c);
  } else {
    res = MP_OKAY;
@ -4661,7 +4872,7 @@ int mp_n_root (mp_int * a, mp_digit b, mp_int * c)

    if (mp_cmp (&t2, a) == MP_GT) {
      if ((res = mp_sub_d (&t1, 1, &t1)) != MP_OKAY) {
-    goto __T3;
+         goto __T3;
      }
    } else {
      break;
@ -4711,7 +4922,7 @@ int mp_neg (mp_int * a, mp_int * b)
  if ((res = mp_copy (a, b)) != MP_OKAY) {
    return res;
  }
-  if (mp_iszero(b) != 1) {
+  if (mp_iszero(b) != MP_YES) {
     b->sign = (a->sign == MP_ZPOS) ? MP_NEG : MP_ZPOS;
  }
  return MP_OKAY;
@ -5225,7 +5436,7 @@ __ERR:

 /* End: bn_mp_prime_next_prime.c */

-/* Start: bn_mp_prime_random.c */
+/* Start: bn_mp_prime_random_ex.c */
 /* LibTomMath, multiple-precision integer library -- Tom St Denis
 *
 * LibTomMath is a library that provides multiple-precision
@ -5242,57 +5453,101 @@ __ERR:
 */
 #include <tommath.h>

-/* makes a truly random prime of a given size (bytes),
- * call with bbs = 1 if you want it to be congruent to 3 mod 4 
+/* makes a truly random prime of a given size (bits),
+ *
+ * Flags are as follows:
+ * 
+ *   LTM_PRIME_BBS      - make prime congruent to 3 mod 4
+ *   LTM_PRIME_SAFE     - make sure (p-1)/2 is prime as well (implies LTM_PRIME_BBS)
+ *   LTM_PRIME_2MSB_OFF - make the 2nd highest bit zero
+ *   LTM_PRIME_2MSB_ON  - make the 2nd highest bit one
 *
 * You have to supply a callback which fills in a buffer with random bytes.  "dat" is a parameter you can
 * have passed to the callback (e.g. a state or something).  This function doesn't use "dat" itself
 * so it can be NULL
 *
- * The prime generated will be larger than 2^(8*size).
 */

-/* this sole function may hold the key to enslaving all mankind! */
-int mp_prime_random(mp_int *a, int t, int size, int bbs, ltm_prime_callback cb, void *dat)
+/* This is possibly the mother of all prime generation functions, muahahahahaha! */
+int mp_prime_random_ex(mp_int *a, int t, int size, int flags, ltm_prime_callback cb, void *dat)
 {
-   unsigned char *tmp;
-   int res, err;
+   unsigned char *tmp, maskAND, maskOR_msb, maskOR_lsb;
+   int res, err, bsize, maskOR_msb_offset;

   /* sanity check the input */
-   if (size <= 0) {
+   if (size <= 1 || t <= 0) {
      return MP_VAL;
   }

-   /* we need a buffer of size+1 bytes */
-   tmp = XMALLOC(size+1);
+   /* LTM_PRIME_SAFE implies LTM_PRIME_BBS */
+   if (flags & LTM_PRIME_SAFE) {
+      flags |= LTM_PRIME_BBS;
+   }
+
+   /* calc the byte size */
+   bsize = (size>>3)+(size&7?1:0);
+
+   /* we need a buffer of bsize bytes */
+   tmp = OPT_CAST(unsigned char) XMALLOC(bsize);
   if (tmp == NULL) {
      return MP_MEM;
   }

-   /* fix MSB */
-   tmp[0] = 1;
+   /* calc the maskAND value for the MSbyte*/
+   maskAND = 0xFF >> (8 - (size & 7));
+
+   /* calc the maskOR_msb */
+   maskOR_msb        = 0;
+   maskOR_msb_offset = (size - 2) >> 3;
+   if (flags & LTM_PRIME_2MSB_ON) {
+      maskOR_msb     |= 1 << ((size - 2) & 7);
+   } else if (flags & LTM_PRIME_2MSB_OFF) {
+      maskAND        &= ~(1 << ((size - 2) & 7));
+   }
+
+   /* get the maskOR_lsb */
+   maskOR_lsb         = 0;
+   if (flags & LTM_PRIME_BBS) {
+      maskOR_lsb     |= 3;
+   }

   do {
      /* read the bytes */
-      if (cb(tmp+1, size, dat) != size) {
+      if (cb(tmp, bsize, dat) != bsize) {
         err = MP_VAL;
         goto error;
      }
 
-      /* fix the LSB */
-      tmp[size] |= (bbs ? 3 : 1);
+      /* work over the MSbyte */
+      tmp[0]    &= maskAND;
+      tmp[0]    |= 1 << ((size - 1) & 7);
+
+      /* mix in the maskORs */
+      tmp[maskOR_msb_offset]   |= maskOR_msb;
+      tmp[bsize-1]             |= maskOR_lsb;

      /* read it in */
-      if ((err = mp_read_unsigned_bin(a, tmp, size+1)) != MP_OKAY) {
-         goto error;
-      }
+      if ((err = mp_read_unsigned_bin(a, tmp, bsize)) != MP_OKAY)     { goto error; }

      /* is it prime? */
-      if ((err = mp_prime_is_prime(a, t, &res)) != MP_OKAY) {
-         goto error;
+      if ((err = mp_prime_is_prime(a, t, &res)) != MP_OKAY)           { goto error; }
+
+      if (flags & LTM_PRIME_SAFE) {
+         /* see if (a-1)/2 is prime */
+         if ((err = mp_sub_d(a, 1, a)) != MP_OKAY)                    { goto error; }
+         if ((err = mp_div_2(a, a)) != MP_OKAY)                       { goto error; }
+ 
+         /* is it prime? */
+         if ((err = mp_prime_is_prime(a, t, &res)) != MP_OKAY)        { goto error; }
      }
   } while (res == MP_NO);

+   if (flags & LTM_PRIME_SAFE) {
+      /* restore a to the original value */
+      if ((err = mp_mul_2(a, a)) != MP_OKAY)                          { goto error; }
+      if ((err = mp_add_d(a, 1, a)) != MP_OKAY)                       { goto error; }
+   }
+
   err = MP_OKAY;
 error:
   XFREE(tmp);
@ -5301,7 +5556,7 @@ error:



-/* End: bn_mp_prime_random.c */
+/* End: bn_mp_prime_random_ex.c */

 /* Start: bn_mp_radix_size.c */
 /* LibTomMath, multiple-precision integer library -- Tom St Denis
@ -5726,9 +5981,9 @@ CLEANUP:
 */
 #include <tommath.h>

-/* reduces a modulo n where n is of the form 2**p - k */
+/* reduces a modulo n where n is of the form 2**p - d */
 int
-mp_reduce_2k(mp_int *a, mp_int *n, mp_digit k)
+mp_reduce_2k(mp_int *a, mp_int *n, mp_digit d)
 {
   mp_int q;
   int    p, res;
@ -5744,9 +5999,9 @@ top:
      goto ERR;
   }
   
-   if (k != 1) {
-      /* q = q * k */
-      if ((res = mp_mul_d(&q, k, &q)) != MP_OKAY) { 
+   if (d != 1) {
+      /* q = q * d */
+      if ((res = mp_mul_d(&q, d, &q)) != MP_OKAY) { 
         goto ERR;
      }
   }
@ -6062,7 +6317,7 @@ int mp_shrink (mp_int * a)
 {
  mp_digit *tmp;
  if (a->alloc != a->used && a->used > 0) {
-    if ((tmp = OPT_CAST XREALLOC (a->dp, sizeof (mp_digit) * a->used)) == NULL) {
+    if ((tmp = OPT_CAST(mp_digit) XREALLOC (a->dp, sizeof (mp_digit) * a->used)) == NULL) {
      return MP_MEM;
    }
    a->dp    = tmp;
@ -6182,6 +6437,85 @@ mp_sqrmod (mp_int * a, mp_int * b, mp_int * c)

 /* End: bn_mp_sqrmod.c */

+/* Start: bn_mp_sqrt.c */
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is a library that provides multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library was designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+#include <tommath.h>
+
+/* this function is less generic than mp_n_root, simpler and faster */
+int mp_sqrt(mp_int *arg, mp_int *ret) 
+{
+  int res;
+  mp_int t1,t2;
+
+  /* must be positive */
+  if (arg->sign == MP_NEG) {
+    return MP_VAL;
+  }
+
+  /* easy out */
+  if (mp_iszero(arg) == MP_YES) {
+    mp_zero(ret);
+    return MP_OKAY;
+  }
+
+  if ((res = mp_init_copy(&t1, arg)) != MP_OKAY) {
+    return res;
+  }
+
+  if ((res = mp_init(&t2)) != MP_OKAY) {
+    goto E2;
+  }
+
+  /* First approx. (not very bad for large arg) */
+  mp_rshd (&t1,t1.used/2);
+
+  /* t1 > 0  */ 
+  if ((res = mp_div(arg,&t1,&t2,NULL)) != MP_OKAY) {
+    goto E1;
+  }
+  if ((res = mp_add(&t1,&t2,&t1)) != MP_OKAY) {
+    goto E1;
+  }
+  if ((res = mp_div_2(&t1,&t1)) != MP_OKAY) {
+    goto E1;
+  }
+  /* And now t1 > sqrt(arg) */
+  do { 
+    if ((res = mp_div(arg,&t1,&t2,NULL)) != MP_OKAY) {
+      goto E1;
+    }
+    if ((res = mp_add(&t1,&t2,&t1)) != MP_OKAY) {
+      goto E1;
+    }
+    if ((res = mp_div_2(&t1,&t1)) != MP_OKAY) {
+      goto E1;
+    }
+    /* t1 >= sqrt(arg) >= t2 at this point */
+  } while (mp_cmp_mag(&t1,&t2) == MP_GT);
+
+  mp_exch(&t1,ret);
+
+E1: mp_clear(&t2);
+E2: mp_clear(&t1);
+  return res;
+}
+
+
+/* End: bn_mp_sqrt.c */
+
 /* Start: bn_mp_sub.c */
 /* LibTomMath, multiple-precision integer library -- Tom St Denis
 *
@ -6463,8 +6797,7 @@ mp_to_unsigned_bin (mp_int * a, unsigned char *b)
 #include <tommath.h>

 /* multiplication using the Toom-Cook 3-way algorithm */
-int 
-mp_toom_mul(mp_int *a, mp_int *b, mp_int *c)
+int mp_toom_mul(mp_int *a, mp_int *b, mp_int *c)
 {
    mp_int w0, w1, w2, w3, w4, tmp1, tmp2, a0, a1, a2, b0, b1, b2;
    int res, B;
@ -7019,6 +7352,93 @@ int mp_toradix (mp_int * a, char *str, int radix)

 /* End: bn_mp_toradix.c */

+/* Start: bn_mp_toradix_n.c */
+/* LibTomMath, multiple-precision integer library -- Tom St Denis
+ *
+ * LibTomMath is a library that provides multiple-precision
+ * integer arithmetic as well as number theoretic functionality.
+ *
+ * The library was designed directly after the MPI library by
+ * Michael Fromberger but has been written from scratch with
+ * additional optimizations in place.
+ *
+ * The library is free for all purposes without any express
+ * guarantee it works.
+ *
+ * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
+ */
+#include <tommath.h>
+
+/* stores a bignum as a ASCII string in a given radix (2..64) 
+ *
+ * Stores upto maxlen-1 chars and always a NULL byte 
+ */
+int mp_toradix_n(mp_int * a, char *str, int radix, int maxlen)
+{
+  int     res, digs;
+  mp_int  t;
+  mp_digit d;
+  char   *_s = str;
+
+  /* check range of the maxlen, radix */
+  if (maxlen < 3 || radix < 2 || radix > 64) {
+    return MP_VAL;
+  }
+
+  /* quick out if its zero */
+  if (mp_iszero(a) == 1) {
+     *str++ = '0';
+     *str = '\0';
+     return MP_OKAY;
+  }
+
+  if ((res = mp_init_copy (&t, a)) != MP_OKAY) {
+    return res;
+  }
+
+  /* if it is negative output a - */
+  if (t.sign == MP_NEG) {
+    /* we have to reverse our digits later... but not the - sign!! */
+    ++_s;
+
+    /* store the flag and mark the number as positive */
+    *str++ = '-';
+    t.sign = MP_ZPOS;
+ 
+    /* subtract a char */
+    --maxlen;
+  }
+
+  digs = 0;
+  while (mp_iszero (&t) == 0) {
+    if ((res = mp_div_d (&t, (mp_digit) radix, &t, &d)) != MP_OKAY) {
+      mp_clear (&t);
+      return res;
+    }
+    *str++ = mp_s_rmap[d];
+    ++digs;
+
+    if (--maxlen == 1) {
+       /* no more room */
+       break;
+    }
+  }
+
+  /* reverse the digits of the string.  In this case _s points
+   * to the first digit [exluding the sign] of the number]
+   */
+  bn_reverse ((unsigned char *)_s, digs);
+
+  /* append a NULL so the string is properly terminated */
+  *str = '\0';
+
+  mp_clear (&t);
+  return MP_OKAY;
+}
+
+
+/* End: bn_mp_toradix_n.c */
+
 /* Start: bn_mp_unsigned_bin_size.c */
 /* LibTomMath, multiple-precision integer library -- Tom St Denis
 *
@ -7814,8 +8234,8 @@ s_mp_sqr (mp_int * a, mp_int * b)
  pa = a->used;
  if ((res = mp_init_size (&t, 2*pa + 1)) != MP_OKAY) {
    return res;
-  }
-
+  }
+
  /* default used is maximum possible size */
  t.used = 2*pa + 1;

--- a/tommath.h
+++ b/tommath.h
@ -30,12 +30,12 @@
 extern "C" {

 /* C++ compilers don't like assigning void * to mp_digit * */
-#define  OPT_CAST  (mp_digit *)
+#define  OPT_CAST(x)  (x *)

 #else

 /* C on the other hand doesn't care */
-#define  OPT_CAST
+#define  OPT_CAST(x)

 #endif

@ -99,13 +99,13 @@ extern "C" {
       #define XFREE    free
       #define XREALLOC realloc
       #define XCALLOC  calloc
+   #else
+      /* prototypes for our heap functions */
+      extern void *XMALLOC(size_t n);
+      extern void *REALLOC(void *p, size_t n);
+      extern void *XCALLOC(size_t n, size_t s);
+      extern void XFREE(void *p);
   #endif
-
-   /* prototypes for our heap functions */
-   extern void *XMALLOC(size_t n);
-   extern void *REALLOC(void *p, size_t n);
-   extern void *XCALLOC(size_t n, size_t s);
-   extern void XFREE(void *p);
 #endif


@ -134,6 +134,12 @@ extern "C" {
 #define MP_YES        1   /* yes response */
 #define MP_NO         0   /* no response */

+/* Primality generation flags */
+#define LTM_PRIME_BBS      0x0001 /* BBS style prime */
+#define LTM_PRIME_SAFE     0x0002 /* Safe prime (p-1)/2 == prime */
+#define LTM_PRIME_2MSB_OFF 0x0004 /* force 2nd MSB to 0 */
+#define LTM_PRIME_2MSB_ON  0x0008 /* force 2nd MSB to 1 */
+
 typedef int           mp_err;

 /* you'll have to tune these... */
@ -142,12 +148,18 @@ extern int KARATSUBA_MUL_CUTOFF,
           TOOM_MUL_CUTOFF,
           TOOM_SQR_CUTOFF;

-/* various build options */
-#define MP_PREC                 64     /* default digits of precision */
-
 /* define this to use lower memory usage routines (exptmods mostly) */
 /* #define MP_LOW_MEM */

+/* default precision */
+#ifndef MP_PREC
+   #ifdef MP_LOW_MEM
+      #define MP_PREC                 64     /* default digits of precision */
+   #else
+      #define MP_PREC                 8      /* default digits of precision */
+   #endif   
+#endif
+
 /* size of comba arrays, should be at least 2 * 2**(BITS_PER_WORD - BITS_PER_DIGIT*2) */
 #define MP_WARRAY               (1 << (sizeof(mp_word) * CHAR_BIT - 2 * DIGIT_BIT + 1))

@ -207,6 +219,15 @@ void mp_set(mp_int *a, mp_digit b);
 /* set a 32-bit const */
 int mp_set_int(mp_int *a, unsigned long b);

+/* get a 32-bit value */
+unsigned long mp_get_int(mp_int * a);
+
+/* initialize and set a digit */
+int mp_init_set (mp_int * a, mp_digit b);
+
+/* initialize and set 32-bit value */
+int mp_init_set_int (mp_int * a, unsigned long b);
+
 /* copy, b = a */
 int mp_copy(mp_int *a, mp_int *b);

@ -350,8 +371,11 @@ int mp_lcm(mp_int *a, mp_int *b, mp_int *c);
 */
 int mp_n_root(mp_int *a, mp_digit b, mp_int *c);

-/* shortcut for square root */
-#define mp_sqrt(a, b) mp_n_root(a, 2, b)
+/* special sqrt algo */
+int mp_sqrt(mp_int *arg, mp_int *ret);
+
+/* is number a square? */
+int mp_is_square(mp_int *arg, int *ret);

 /* computes the jacobi c = (a | n) (or Legendre if b is prime)  */
 int mp_jacobi(mp_int *a, mp_int *n, int *c);
@ -393,7 +417,7 @@ int mp_reduce_is_2k(mp_int *a);
 int mp_reduce_2k_setup(mp_int *a, mp_digit *d);

 /* reduces a modulo b where b is of the form 2**p - k [0 <= a] */
-int mp_reduce_2k(mp_int *a, mp_int *n, mp_digit k);
+int mp_reduce_2k(mp_int *a, mp_int *n, mp_digit d);

 /* d = a**b (mod c) */
 int mp_exptmod(mp_int *a, mp_int *b, mp_int *c, mp_int *d);
@ -453,8 +477,23 @@ int mp_prime_next_prime(mp_int *a, int t, int bbs_style);
 *
 * The prime generated will be larger than 2^(8*size).
 */
-int mp_prime_random(mp_int *a, int t, int size, int bbs, ltm_prime_callback cb, void *dat);
+#define mp_prime_random(a, t, size, bbs, cb, dat) mp_prime_random_ex(a, t, ((size) * 8) + 1, (bbs==1)?LTM_PRIME_BBS:0, cb, dat)

+/* makes a truly random prime of a given size (bits),
+ *
+ * Flags are as follows:
+ * 
+ *   LTM_PRIME_BBS      - make prime congruent to 3 mod 4
+ *   LTM_PRIME_SAFE     - make sure (p-1)/2 is prime as well (implies LTM_PRIME_BBS)
+ *   LTM_PRIME_2MSB_OFF - make the 2nd highest bit zero
+ *   LTM_PRIME_2MSB_ON  - make the 2nd highest bit one
+ *
+ * You have to supply a callback which fills in a buffer with random bytes.  "dat" is a parameter you can
+ * have passed to the callback (e.g. a state or something).  This function doesn't use "dat" itself
+ * so it can be NULL
+ *
+ */
+int mp_prime_random_ex(mp_int *a, int t, int size, int flags, ltm_prime_callback cb, void *dat);

 /* ---> radix conversion <--- */
 int mp_count_bits(mp_int *a);
@ -469,6 +508,7 @@ int mp_to_signed_bin(mp_int *a, unsigned char *b);

 int mp_read_radix(mp_int *a, char *str, int radix);
 int mp_toradix(mp_int *a, char *str, int radix);
+int mp_toradix_n(mp_int * a, char *str, int radix, int maxlen);
 int mp_radix_size(mp_int *a, int radix, int *size);

 int mp_fread(mp_int *a, int radix, FILE *stream);
--- a/tommath.pdf
+++ b/tommath.pdf
--- a/tommath.src
+++ b/tommath.src
@ -49,7 +49,7 @@
 \begin{document}
 \frontmatter
 \pagestyle{empty}
-\title{Implementing Multiple Precision Arithmetic \\ ~ \\ Holiday Draft Edition }
+\title{Implementing Multiple Precision Arithmetic \\ ~ \\ Draft Edition }
 \author{\mbox{
 %\begin{small}
 \begin{tabular}{c}
@ -66,7 +66,7 @@ QUALCOMM Australia \\
 }
 }
 \maketitle
-This text has been placed in the public domain.  This text corresponds to the v0.28 release of the 
+This text has been placed in the public domain.  This text corresponds to the v0.30 release of the 
 LibTomMath project.

 \begin{alltt}
@ -85,7 +85,7 @@ This text is formatted to the international B5 paper size of 176mm wide by 250mm

 \tableofcontents
 \listoffigures
-\chapter*{Prefaces to the Holiday Draft Edition}
+\chapter*{Prefaces to the Draft Edition}
 I started this text in April 2003 to complement my LibTomMath library.  That is, explain how to implement the functions
 contained in LibTomMath.  The goal is to have a textbook that any Computer Science student can use when implementing their
 own multiple precision arithmetic.  The plan I wanted to follow was flesh out all the
@ -100,7 +100,7 @@ to read it.  I had Jean-Luc Cooke print copies for me and I brought them to Cryp
 managed to grab a certain level of attention having people from around the world ask me for copies of the text was certain
 rewarding.

-Now we are in December 2003.  By this time I had pictured that I would have at least finished my second draft of the text.  
+Now we are past December 2003.  By this time I had pictured that I would have at least finished my second draft of the text.  
 Currently I am far off from this goal.  I've done partial re-writes of chapters one, two and three but they are not even
 finished yet.  I haven't given up on the project, only had some setbacks.  First O'Reilly declined to publish the text then
 Addison-Wesley and Greg is tried another which I don't know the name of.  However, at this point I want to focus my energy
@ -146,9 +146,6 @@ plan is to edit one chapter every two weeks starting January 4th.  It seems insa
 should provide ample time.  By Crypto'04 I plan to have a 2nd draft of the text polished and ready to hand out to as many
 people who will take it.

-Finally, again, I'd like to thank my parents Vern and Katie St Denis for giving me a place to stay, food, clothes and 
-word of encouragement whenever I seemed to need it.  Thanks!
-
 \begin{flushright} Tom St Denis \end{flushright}

 \newpage
@ -485,7 +482,7 @@ exponentiation and Montgomery reduction have been provided to make the library m
 Even with the nearly optimal and specialized algorithms that have been included the Application Programing Interface 
 (\textit{API}) has been kept as simple as possible.  Often generic place holder routines will make use of specialized 
 algorithms automatically without the developer's specific attention.  One such example is the generic multiplication 
-algorithm \textbf{mp\_mul()} which will automatically use Karatsuba, Toom-Cook, Comba or baseline multiplication 
+algorithm \textbf{mp\_mul()} which will automatically use Toom--Cook, Karatsuba, Comba or baseline multiplication 
 based on the magnitude of the inputs and the configuration of the library.  

 Making LibTomMath as efficient as possible is not the only goal of the LibTomMath project.  Ideally the library should 
@ -510,12 +507,12 @@ segments of code littered throughout the source.  This clean and uncluttered app
 developer can more readily discern the true intent of a given section of source code without trying to keep track of
 what conditional code will be used.

-The code base of LibTomMath is also well organized.  Each function is in its own separate source code file 
-which allows the reader to find a given function very quickly.  On average there are about $76$ lines of code per source
+The code base of LibTomMath is well organized.  Each function is in its own separate source code file 
+which allows the reader to find a given function very quickly.  On average there are $76$ lines of code per source
 file which makes the source very easily to follow.  By comparison MPI and LIP are single file projects making code tracing
 very hard.  GMP has many conditional code segments which also hinder tracing.  

-When compiled with GCC for the x86 processor and optimized for speed the entire library is approximately $66$KiB\footnote{The notation ``KiB'' means $2^{10}$ octets, similarly ``MiB'' means $2^{20}$ octets.}
+When compiled with GCC for the x86 processor and optimized for speed the entire library is approximately $100$KiB\footnote{The notation ``KiB'' means $2^{10}$ octets, similarly ``MiB'' means $2^{20}$ octets.}
 which is fairly small compared to GMP (over $250$KiB).  LibTomMath is slightly larger than MPI (which compiles to about 
 $50$KiB) but LibTomMath is also much faster and more complete than MPI.

@ -2736,7 +2733,7 @@ general purpose multiplication.  Given two polynomial basis representations $f(x
 light algebra \cite{KARAP} that the following polynomial is equivalent to multiplication of the two integers the polynomials represent.

 \begin{equation}
-f(x) \cdot g(x) = acx^2 + ((a - b)(c - d) + ac + bd)x + bd
+f(x) \cdot g(x) = acx^2 + ((a - b)(c - d) - (ac + bd))x + bd
 \end{equation}

 Using the observation that $ac$ and $bd$ could be re-used only three half sized multiplications would be required to produce the product.  Applying
@ -3196,7 +3193,7 @@ Upon closer inspection this equation only requires the calculation of three half
 Karatsuba multiplication, this algorithm can be applied recursively on the input and will achieve an asymptotic running time of 
 $O \left ( n^{lg(3)} \right )$.

-You might ask yourself, if the asymptotic time of Karatsuba squaring and multiplication is the same, why not simply use the multiplication algorithm 
+If the asymptotic times of Karatsuba squaring and multiplication are the same, why not simply use the multiplication algorithm 
 instead?  The answer to this arises from the cutoff point for squaring.  As in multiplication there exists a cutoff point, at which the 
 time required for a Comba based squaring and a Karatsuba based squaring meet.  Due to the overhead inherent in the Karatsuba method, the cutoff 
 point is fairly high.  For example, on an AMD Athlon XP processor with $\beta = 2^{28}$, the cutoff point is around 127 digits.  
--- a/tommath.tex
+++ b/tommath.tex