diff --git a/bn.pdf b/bn.pdf
index 37866c8..bdbc819 100644
Binary files a/bn.pdf and b/bn.pdf differ
diff --git a/bn.tex b/bn.tex
index a69c13c..3c07991 100644
--- a/bn.tex
+++ b/bn.tex
@@ -1,7 +1,7 @@
 \documentclass[]{article}
 \begin{document}
 
-\title{LibTomMath v0.19 \\ A Free Multiple Precision Integer Library \\ http://math.libtomcrypt.org }
+\title{LibTomMath v0.20 \\ A Free Multiple Precision Integer Library \\ http://math.libtomcrypt.org }
 \author{Tom St Denis \\ tomstdenis@iahu.ca}
 \maketitle
 \newpage
diff --git a/bn_mp_exptmod_fast.c b/bn_mp_exptmod_fast.c
index de42ff8..54de53d 100644
--- a/bn_mp_exptmod_fast.c
+++ b/bn_mp_exptmod_fast.c
@@ -80,7 +80,6 @@ mp_exptmod_fast (mp_int * G, mp_int * X, mp_int * P, mp_int * Y, int redmode)
      if (((P->used * 2 + 1) < MP_WARRAY) &&
           P->used < (1 << ((CHAR_BIT * sizeof (mp_word)) - (2 * DIGIT_BIT)))) {
         redux = fast_mp_montgomery_reduce;
-
      } else {
         /* use slower baselien method */
         redux = mp_montgomery_reduce;
diff --git a/changes.txt b/changes.txt
index f874a2b..5756d6a 100644
--- a/changes.txt
+++ b/changes.txt
@@ -1,3 +1,7 @@
+June 8th, 2003
+v0.20  -- Removed the book from the package.  Added the TDCAL license document.  
+       -- This release is officially pure-bred TDCAL again [last officially TDCAL based release was v0.16]
+
 June 6th, 2003
 v0.19  -- Fixed a bug in mp_montgomery_reduce() which was introduced when I tweaked mp_rshd() in the previous release.
           Essentially the digits were not trimmed before the compare which cause a subtraction to occur all the time.
diff --git a/etc/2kprime.1 b/etc/2kprime.1
index eb12565..e1384db 100644
--- a/etc/2kprime.1
+++ b/etc/2kprime.1
@@ -1,2 +1 @@
-256-bits (k = 36113) = 115792089237316195423570985008687907853269984665640564039457584007913129603823
-512-bits (k = 38117) = 13407807929942597099574024998205846127479365820592393377723561443721764030073546976801874298166903427690031858186486050853753882811946569946433649006045979
+259-bits (k = 17745) = 926336713898529563388567880069503262826159877325124512315660672063305037101743
diff --git a/etc/2kprime.c b/etc/2kprime.c
index 47b0e1d..4f1d4bb 100644
--- a/etc/2kprime.c
+++ b/etc/2kprime.c
@@ -7,7 +7,7 @@ int sizes[] = {256, 512, 768, 1024, 1536, 2048, 3072, 4096};
 int main(void)
 {
    char buf[2000];
-   int x, y, t;
+   int x, y;
    mp_int q, p;
    FILE *out;
    clock_t t1;
diff --git a/makefile b/makefile
index 15972af..a835a2e 100644
--- a/makefile
+++ b/makefile
@@ -1,6 +1,6 @@
 CFLAGS  +=  -I./ -Wall -W -Wshadow -O3 -fomit-frame-pointer -funroll-loops
 
-VERSION=0.19
+VERSION=0.20
 
 default: libtommath.a
 
@@ -103,5 +103,6 @@ clean:
 zipup: clean manual poster
 	perl gen.pl ; mv mpi.c pre_gen/ ; \
 	cd .. ; rm -rf ltm* libtommath-$(VERSION) ; mkdir libtommath-$(VERSION) ; \
-	cp -R ./libtommath/* ./libtommath-$(VERSION)/ ; tar -c libtommath-$(VERSION)/* > ltm-$(VERSION).tar ; \
+	cp -R ./libtommath/* ./libtommath-$(VERSION)/ ; cp tdcal.pdf ./libtommath-$(VERSION)/ ; cd ./libtommath-$(VERSION) ; rm -f tommath.src tommath.tex tommath.out ; cd pics ; rm -f * ; cd .. ; cd .. ; ls ; \
+	tar -c libtommath-$(VERSION)/* > ltm-$(VERSION).tar ; \
 	bzip2 -9vv ltm-$(VERSION).tar ; zip -9 -r ltm-$(VERSION).zip libtommath-$(VERSION)/*
diff --git a/pics/expt_state.sxd b/pics/expt_state.sxd
deleted file mode 100644
index 6518404..0000000
Binary files a/pics/expt_state.sxd and /dev/null differ
diff --git a/pics/expt_state.tif b/pics/expt_state.tif
deleted file mode 100644
index cb06e8e..0000000
Binary files a/pics/expt_state.tif and /dev/null differ
diff --git a/pics/makefile b/pics/makefile
deleted file mode 100644
index 302adec..0000000
--- a/pics/makefile
+++ /dev/null
@@ -1,23 +0,0 @@
-# makes the images... yeah
-
-default:  pses
-
-
-sliding_window.ps: sliding_window.tif
-	tiff2ps -c -e sliding_window.tif > sliding_window.ps
-	
-expt_state.ps: expt_state.tif
-	tiff2ps -c -e expt_state.tif > expt_state.ps
-
-sliding_window.pdf: sliding_window.ps
-	epstopdf sliding_window.ps
-	
-expt_state.pdf: expt_state.ps
-	epstopdf expt_state.ps
-
-pses: sliding_window.ps expt_state.ps
-pdfes: sliding_window.pdf expt_state.pdf
-
-clean:
-	rm -rf *.ps *.pdf .xvpics
-   
\ No newline at end of file
diff --git a/pics/sliding_window.TIF b/pics/sliding_window.TIF
deleted file mode 100644
index bb4cb96..0000000
Binary files a/pics/sliding_window.TIF and /dev/null differ
diff --git a/pics/sliding_window.sxd b/pics/sliding_window.sxd
deleted file mode 100644
index 91e7c0d..0000000
Binary files a/pics/sliding_window.sxd and /dev/null differ
diff --git a/poster.pdf b/poster.pdf
index a1ecd06..629336c 100644
Binary files a/poster.pdf and b/poster.pdf differ
diff --git a/pre_gen/mpi.c b/pre_gen/mpi.c
index e5b9347..c728269 100644
--- a/pre_gen/mpi.c
+++ b/pre_gen/mpi.c
@@ -2155,7 +2155,6 @@ mp_exptmod_fast (mp_int * G, mp_int * X, mp_int * P, mp_int * Y, int redmode)
      if (((P->used * 2 + 1) < MP_WARRAY) &&
           P->used < (1 << ((CHAR_BIT * sizeof (mp_word)) - (2 * DIGIT_BIT)))) {
         redux = fast_mp_montgomery_reduce;
-
      } else {
         /* use slower baselien method */
         redux = mp_montgomery_reduce;
diff --git a/tdcal.pdf b/tdcal.pdf
new file mode 100644
index 0000000..1566a9d
Binary files /dev/null and b/tdcal.pdf differ
diff --git a/tommath.out b/tommath.out
deleted file mode 100644
index fb54c12..0000000
--- a/tommath.out
+++ /dev/null
@@ -1,143 +0,0 @@
-\BOOKMARK [0][-]{chapter.1}{Introduction}{}
-\BOOKMARK [1][-]{section.1.1}{Multiple Precision Arithmetic}{chapter.1}
-\BOOKMARK [2][-]{subsection.1.1.1}{The Need for Multiple Precision Arithmetic}{section.1.1}
-\BOOKMARK [2][-]{subsection.1.1.2}{Multiple Precision Arithmetic}{section.1.1}
-\BOOKMARK [2][-]{subsection.1.1.3}{Benefits of Multiple Precision Arithmetic}{section.1.1}
-\BOOKMARK [2][-]{subsection.1.1.4}{Basis of Operations}{section.1.1}
-\BOOKMARK [1][-]{section.1.2}{Purpose of This Text}{chapter.1}
-\BOOKMARK [1][-]{section.1.3}{Discussion and Notation}{chapter.1}
-\BOOKMARK [2][-]{subsection.1.3.1}{Notation}{section.1.3}
-\BOOKMARK [2][-]{subsection.1.3.2}{Work Effort}{section.1.3}
-\BOOKMARK [1][-]{section.1.4}{Exercises}{chapter.1}
-\BOOKMARK [0][-]{chapter.2}{Introduction to LibTomMath}{}
-\BOOKMARK [1][-]{section.2.1}{What is LibTomMath?}{chapter.2}
-\BOOKMARK [1][-]{section.2.2}{Goals of LibTomMath}{chapter.2}
-\BOOKMARK [1][-]{section.2.3}{Choice of LibTomMath}{chapter.2}
-\BOOKMARK [2][-]{subsection.2.3.1}{Code Base}{section.2.3}
-\BOOKMARK [2][-]{subsection.2.3.2}{API Simplicity}{section.2.3}
-\BOOKMARK [2][-]{subsection.2.3.3}{Optimizations}{section.2.3}
-\BOOKMARK [2][-]{subsection.2.3.4}{Portability and Stability}{section.2.3}
-\BOOKMARK [2][-]{subsection.2.3.5}{Choice}{section.2.3}
-\BOOKMARK [0][-]{chapter.3}{Getting Started}{}
-\BOOKMARK [1][-]{section.3.1}{Library Basics}{chapter.3}
-\BOOKMARK [1][-]{section.3.2}{What is a Multiple Precision Integer?}{chapter.3}
-\BOOKMARK [2][-]{subsection.3.2.1}{The mp\137int structure}{section.3.2}
-\BOOKMARK [1][-]{section.3.3}{Argument Passing}{chapter.3}
-\BOOKMARK [1][-]{section.3.4}{Return Values}{chapter.3}
-\BOOKMARK [1][-]{section.3.5}{Initialization and Clearing}{chapter.3}
-\BOOKMARK [2][-]{subsection.3.5.1}{Initializing an mp\137int}{section.3.5}
-\BOOKMARK [2][-]{subsection.3.5.2}{Clearing an mp\137int}{section.3.5}
-\BOOKMARK [1][-]{section.3.6}{Other Initialization Routines}{chapter.3}
-\BOOKMARK [2][-]{subsection.3.6.1}{Initializing Variable Sized mp\137int Structures}{section.3.6}
-\BOOKMARK [2][-]{subsection.3.6.2}{Creating a Clone}{section.3.6}
-\BOOKMARK [2][-]{subsection.3.6.3}{Multiple Integer Initializations And Clearings}{section.3.6}
-\BOOKMARK [1][-]{section.3.7}{Maintenance}{chapter.3}
-\BOOKMARK [2][-]{subsection.3.7.1}{Augmenting Integer Precision}{section.3.7}
-\BOOKMARK [2][-]{subsection.3.7.2}{Clamping Excess Digits}{section.3.7}
-\BOOKMARK [0][-]{chapter.4}{Basic Operations}{}
-\BOOKMARK [1][-]{section.4.1}{Copying an Integer}{chapter.4}
-\BOOKMARK [1][-]{section.4.2}{Zeroing an Integer}{chapter.4}
-\BOOKMARK [1][-]{section.4.3}{Sign Manipulation}{chapter.4}
-\BOOKMARK [2][-]{subsection.4.3.1}{Absolute Value}{section.4.3}
-\BOOKMARK [2][-]{subsection.4.3.2}{Integer Negation}{section.4.3}
-\BOOKMARK [1][-]{section.4.4}{Small Constants}{chapter.4}
-\BOOKMARK [2][-]{subsection.4.4.1}{Setting Small Constants}{section.4.4}
-\BOOKMARK [2][-]{subsection.4.4.2}{Setting Large Constants}{section.4.4}
-\BOOKMARK [1][-]{section.4.5}{Comparisons}{chapter.4}
-\BOOKMARK [2][-]{subsection.4.5.1}{Unsigned Comparisions}{section.4.5}
-\BOOKMARK [2][-]{subsection.4.5.2}{Signed Comparisons}{section.4.5}
-\BOOKMARK [0][-]{chapter.5}{Basic Arithmetic}{}
-\BOOKMARK [1][-]{section.5.1}{Building Blocks}{chapter.5}
-\BOOKMARK [1][-]{section.5.2}{Addition and Subtraction}{chapter.5}
-\BOOKMARK [2][-]{subsection.5.2.1}{Low Level Addition}{section.5.2}
-\BOOKMARK [2][-]{subsection.5.2.2}{Low Level Subtraction}{section.5.2}
-\BOOKMARK [2][-]{subsection.5.2.3}{High Level Addition}{section.5.2}
-\BOOKMARK [2][-]{subsection.5.2.4}{High Level Subtraction}{section.5.2}
-\BOOKMARK [1][-]{section.5.3}{Bit and Digit Shifting}{chapter.5}
-\BOOKMARK [2][-]{subsection.5.3.1}{Multiplication by Two}{section.5.3}
-\BOOKMARK [2][-]{subsection.5.3.2}{Division by Two}{section.5.3}
-\BOOKMARK [1][-]{section.5.4}{Polynomial Basis Operations}{chapter.5}
-\BOOKMARK [2][-]{subsection.5.4.1}{Multiplication by x}{section.5.4}
-\BOOKMARK [2][-]{subsection.5.4.2}{Division by x}{section.5.4}
-\BOOKMARK [1][-]{section.5.5}{Powers of Two}{chapter.5}
-\BOOKMARK [2][-]{subsection.5.5.1}{Multiplication by Power of Two}{section.5.5}
-\BOOKMARK [2][-]{subsection.5.5.2}{Division by Power of Two}{section.5.5}
-\BOOKMARK [2][-]{subsection.5.5.3}{Remainder of Division by Power of Two}{section.5.5}
-\BOOKMARK [0][-]{chapter.6}{Multiplication and Squaring}{}
-\BOOKMARK [1][-]{section.6.1}{The Multipliers}{chapter.6}
-\BOOKMARK [1][-]{section.6.2}{Multiplication}{chapter.6}
-\BOOKMARK [2][-]{subsection.6.2.1}{The Baseline Multiplication}{section.6.2}
-\BOOKMARK [2][-]{subsection.6.2.2}{Faster Multiplication by the ``Comba'' Method}{section.6.2}
-\BOOKMARK [2][-]{subsection.6.2.3}{Polynomial Basis Multiplication}{section.6.2}
-\BOOKMARK [2][-]{subsection.6.2.4}{Karatsuba Multiplication}{section.6.2}
-\BOOKMARK [2][-]{subsection.6.2.5}{Toom-Cook 3-Way Multiplication}{section.6.2}
-\BOOKMARK [2][-]{subsection.6.2.6}{Signed Multiplication}{section.6.2}
-\BOOKMARK [1][-]{section.6.3}{Squaring}{chapter.6}
-\BOOKMARK [2][-]{subsection.6.3.1}{The Baseline Squaring Algorithm}{section.6.3}
-\BOOKMARK [2][-]{subsection.6.3.2}{Faster Squaring by the ``Comba'' Method}{section.6.3}
-\BOOKMARK [2][-]{subsection.6.3.3}{Polynomial Basis Squaring}{section.6.3}
-\BOOKMARK [2][-]{subsection.6.3.4}{Karatsuba Squaring}{section.6.3}
-\BOOKMARK [2][-]{subsection.6.3.5}{Toom-Cook Squaring}{section.6.3}
-\BOOKMARK [2][-]{subsection.6.3.6}{High Level Squaring}{section.6.3}
-\BOOKMARK [0][-]{chapter.7}{Modular Reduction}{}
-\BOOKMARK [1][-]{section.7.1}{Basics of Modular Reduction}{chapter.7}
-\BOOKMARK [1][-]{section.7.2}{The Barrett Reduction}{chapter.7}
-\BOOKMARK [2][-]{subsection.7.2.1}{Fixed Point Arithmetic}{section.7.2}
-\BOOKMARK [2][-]{subsection.7.2.2}{Choosing a Radix Point}{section.7.2}
-\BOOKMARK [2][-]{subsection.7.2.3}{Trimming the Quotient}{section.7.2}
-\BOOKMARK [2][-]{subsection.7.2.4}{Trimming the Residue}{section.7.2}
-\BOOKMARK [2][-]{subsection.7.2.5}{The Barrett Algorithm}{section.7.2}
-\BOOKMARK [2][-]{subsection.7.2.6}{The Barrett Setup Algorithm}{section.7.2}
-\BOOKMARK [1][-]{section.7.3}{The Montgomery Reduction}{chapter.7}
-\BOOKMARK [2][-]{subsection.7.3.1}{Digit Based Montgomery Reduction}{section.7.3}
-\BOOKMARK [2][-]{subsection.7.3.2}{Baseline Montgomery Reduction}{section.7.3}
-\BOOKMARK [2][-]{subsection.7.3.3}{Faster ``Comba'' Montgomery Reduction}{section.7.3}
-\BOOKMARK [2][-]{subsection.7.3.4}{Montgomery Setup}{section.7.3}
-\BOOKMARK [1][-]{section.7.4}{The Diminished Radix Algorithm}{chapter.7}
-\BOOKMARK [2][-]{subsection.7.4.1}{Choice of Moduli}{section.7.4}
-\BOOKMARK [2][-]{subsection.7.4.2}{Choice of k}{section.7.4}
-\BOOKMARK [2][-]{subsection.7.4.3}{Restricted Diminished Radix Reduction}{section.7.4}
-\BOOKMARK [2][-]{subsection.7.4.4}{Unrestricted Diminished Radix Reduction}{section.7.4}
-\BOOKMARK [1][-]{section.7.5}{Algorithm Comparison}{chapter.7}
-\BOOKMARK [0][-]{chapter.8}{Exponentiation}{}
-\BOOKMARK [1][-]{section.8.1}{Exponentiation Basics}{chapter.8}
-\BOOKMARK [2][-]{subsection.8.1.1}{Single Digit Exponentiation}{section.8.1}
-\BOOKMARK [1][-]{section.8.2}{k-ary Exponentiation}{chapter.8}
-\BOOKMARK [2][-]{subsection.8.2.1}{Optimal Values of k}{section.8.2}
-\BOOKMARK [2][-]{subsection.8.2.2}{Sliding-Window Exponentiation}{section.8.2}
-\BOOKMARK [1][-]{section.8.3}{Modular Exponentiation}{chapter.8}
-\BOOKMARK [2][-]{subsection.8.3.1}{Barrett Modular Exponentiation}{section.8.3}
-\BOOKMARK [1][-]{section.8.4}{Quick Power of Two}{chapter.8}
-\BOOKMARK [0][-]{chapter.9}{Higher Level Algorithms}{}
-\BOOKMARK [1][-]{section.9.1}{Integer Division with Remainder}{chapter.9}
-\BOOKMARK [1][-]{section.9.2}{Single Digit Helpers}{chapter.9}
-\BOOKMARK [2][-]{subsection.9.2.1}{Single Digit Addition}{section.9.2}
-\BOOKMARK [2][-]{subsection.9.2.2}{Single Digit Subtraction}{section.9.2}
-\BOOKMARK [2][-]{subsection.9.2.3}{Single Digit Multiplication}{section.9.2}
-\BOOKMARK [2][-]{subsection.9.2.4}{Single Digit Division}{section.9.2}
-\BOOKMARK [2][-]{subsection.9.2.5}{Single Digit Modulo}{section.9.2}
-\BOOKMARK [2][-]{subsection.9.2.6}{Single Digit Root Extraction}{section.9.2}
-\BOOKMARK [1][-]{section.9.3}{Random Number Generation}{chapter.9}
-\BOOKMARK [1][-]{section.9.4}{Formatted Output}{chapter.9}
-\BOOKMARK [2][-]{subsection.9.4.1}{Getting The Output Size}{section.9.4}
-\BOOKMARK [2][-]{subsection.9.4.2}{Generating Radix-n Output}{section.9.4}
-\BOOKMARK [2][-]{subsection.9.4.3}{Reading Radix-n Input}{section.9.4}
-\BOOKMARK [1][-]{section.9.5}{Unformatted Output}{chapter.9}
-\BOOKMARK [2][-]{subsection.9.5.1}{Getting The Output Size}{section.9.5}
-\BOOKMARK [2][-]{subsection.9.5.2}{Generating Output}{section.9.5}
-\BOOKMARK [2][-]{subsection.9.5.3}{Reading Input}{section.9.5}
-\BOOKMARK [0][-]{chapter.10}{Number Theoretic Algorithms}{}
-\BOOKMARK [1][-]{section.10.1}{Greatest Common Divisor}{chapter.10}
-\BOOKMARK [1][-]{section.10.2}{Least Common Multiple}{chapter.10}
-\BOOKMARK [1][-]{section.10.3}{Jacobi Symbol Computation}{chapter.10}
-\BOOKMARK [1][-]{section.10.4}{Modular Inverse}{chapter.10}
-\BOOKMARK [2][-]{subsection.10.4.1}{General Case}{section.10.4}
-\BOOKMARK [2][-]{subsection.10.4.2}{Odd Moduli}{section.10.4}
-\BOOKMARK [1][-]{section.10.5}{Primality Tests}{chapter.10}
-\BOOKMARK [2][-]{subsection.10.5.1}{Trial Division}{section.10.5}
-\BOOKMARK [2][-]{subsection.10.5.2}{The Fermat Test}{section.10.5}
-\BOOKMARK [2][-]{subsection.10.5.3}{The Miller-Rabin Test}{section.10.5}
-\BOOKMARK [2][-]{subsection.10.5.4}{Primality Test in a Bottle}{section.10.5}
-\BOOKMARK [2][-]{subsection.10.5.5}{The Next Prime}{section.10.5}
-\BOOKMARK [1][-]{section.10.6}{Root Extraction}{chapter.10}
-\BOOKMARK [0][-]{appendix*.16}{Appendix}{}
diff --git a/tommath.src b/tommath.src
deleted file mode 100644
index fd11871..0000000
--- a/tommath.src
+++ /dev/null
@@ -1,4675 +0,0 @@
-\documentclass[b5paper]{book}
-\usepackage{hyperref}
-\usepackage{makeidx}
-\usepackage{amssymb}
-\usepackage{color}
-\usepackage{alltt}
-\usepackage{graphicx}
-\usepackage{layout}
-\def\union{\cup}
-\def\intersect{\cap}
-\def\getsrandom{\stackrel{\rm R}{\gets}}
-\def\cross{\times}
-\def\cat{\hspace{0.5em} \| \hspace{0.5em}}
-\def\catn{$\|$}
-\def\divides{\hspace{0.3em} | \hspace{0.3em}}
-\def\nequiv{\not\equiv}
-\def\approx{\raisebox{0.2ex}{\mbox{\small $\sim$}}}
-\def\lcm{{\rm lcm}}
-\def\gcd{{\rm gcd}}
-\def\log{{\rm log}}
-\def\ord{{\rm ord}}
-\def\abs{{\mathit abs}}
-\def\rep{{\mathit rep}}
-\def\mod{{\mathit\ mod\ }}
-\renewcommand{\pmod}[1]{\ ({\rm mod\ }{#1})}
-\newcommand{\floor}[1]{\left\lfloor{#1}\right\rfloor}
-\newcommand{\ceil}[1]{\left\lceil{#1}\right\rceil}
-\def\Or{{\rm\ or\ }}
-\def\And{{\rm\ and\ }}
-\def\iff{\hspace{1em}\Longleftrightarrow\hspace{1em}}
-\def\implies{\Rightarrow}
-\def\undefined{{\rm ``undefined"}}
-\def\Proof{\vspace{1ex}\noindent {\bf Proof:}\hspace{1em}}
-\let\oldphi\phi
-\def\phi{\varphi}
-\def\Pr{{\rm Pr}}
-\newcommand{\str}[1]{{\mathbf{#1}}}
-\def\F{{\mathbb F}}
-\def\N{{\mathbb N}}
-\def\Z{{\mathbb Z}}
-\def\R{{\mathbb R}}
-\def\C{{\mathbb C}}
-\def\Q{{\mathbb Q}}
-\definecolor{DGray}{gray}{0.5}
-\newcommand{\emailaddr}[1]{\mbox{$<${#1}$>$}}
-\def\twiddle{\raisebox{0.3ex}{\mbox{\tiny $\sim$}}}
-\def\gap{\vspace{0.5ex}}
-\makeindex
-\begin{document}
-\frontmatter
-\pagestyle{empty}
-\title{Multiple-Precision Integer Arithmetic, \\ A Case Study Involving the LibTomMath Project \\ - DRAFT - }
-\author{\mbox{
-%\begin{small}
-\begin{tabular}{c}
-Tom St Denis \\
-Algonquin College \\
-\\
-Mads Rasmussen \\
-Open Communications Security \\
-\\
-Greg Rose \\
-QUALCOMM Australia \\
-\end{tabular}
-%\end{small}
-}
-}
-\maketitle
-This text in its entirety is copyright \copyright{}2003 by Tom St Denis.  It may not be redistributed 
-electronically or otherwise without the sole permission of the author.  The text is freely redistributable as long as
-it is packaged along with the LibTomMath library in a non-commercial project.  Contact the
-author for other redistribution rights.
-
-This text corresponds to the v0.17 release of the LibTomMath project.
-
-\begin{alltt}
-Tom St Denis
-111 Banning Rd
-Ottawa, Ontario
-K2L 1C3
-Canada
-
-Phone: 1-613-836-3160
-Email: tomstdenis@iahu.ca
-\end{alltt}
-
-This text is formatted to the international B5 paper size of 176mm wide by 250mm tall using the \LaTeX{} 
-{\em book} macro package and the Perl {\em booker} package.
-
-\tableofcontents
-\listoffigures
-\chapter*{Preface}
-Blah.
-
-\mainmatter
-\pagestyle{headings}
-\chapter{Introduction}
-\section{Multiple Precision Arithmetic}
-\subsection{The Need for Multiple Precision Arithmetic}
-The most prevalent use for multiple precision arithmetic (\textit{often referred to as bignum math}) is within public
-key cryptography.   Algorithms such as RSA, Diffie-Hellman and Elliptic Curve Cryptography require large integers in order to 
-resist known cryptanalytic attacks.  Typical modern programming languages such as C and Java only provide small 
-single-precision data types which are incapable of precisely representing integers which are often hundreds of bits long.
-
-For example, consider multiplying $1,234,567$ by $9,876,543$ in C with an ``unsigned long'' data type.  With an 
-x86 machine the result is $4,136,875,833$ while the true result is $12,193,254,061,881$.  The original inputs 
-were approximately $21$ and $24$ bits respectively.  If the C language cannot multiply two relatively small values 
-together precisely how does anyone expect it to multiply two values that are considerably larger?
-
-Most advancements in fast multiple precision arithmetic stem from the desire for faster cryptographic primitives.  However, cryptography
-is not the only field of study that can benefit from fast large integer routines.  Another auxiliary use for multiple precision integers is 
-high precision floating point data types.  The basic IEEE standard floating point type is made up of an integer mantissa $q$ and an exponent $e$.  
-Numbers are given in the form $n = q \cdot b^e$ where $b = 2$ is specified.  Since IEEE is meant to be implemented in 
-hardware the precision of the mantissa is often fairly small (\textit{23, 48 and 64 bits}).  Since the mantissa is merely an 
-integer a large multiple precision integer could be used.  In effect very high precision floating point arithmetic 
-could be performed.  This would be useful where scientific applications must minimize the total output error over long simulations.  
-
-\subsection{Multiple Precision Arithmetic}
-\index{multiple precision}
-Multiple precision arithmetic attempts to solve the shortcomings of single precision data types such as those from
-the C and Java programming languages.  In essence multiple precision arithmetic is a set of operations that can be 
-performed on members of an algebraic group whose precision is not fixed.  The algorithms when implemented to be multiple
-precision can allow a developer to work with any practical precision required.
-
-Typically the arithmetic over the ring of integers denoted by $\Z$ is performed by routines that are collectively and 
-casually referred to as ``bignum'' routines.  However, it is possible to have rings of polynomials as well typically 
-denoted by $\Z/p\Z \left [ X \right ]$ which could have variable precision (\textit{or degree}).  This text will 
-discuss implementation of the former, however implementing polynomial basis routines should be relatively easy after reading this text.
-
-\subsection{Benefits of Multiple Precision Arithmetic}
-\index{precision} \index{accuracy}
-Precision is defined loosely as the proximity of the real value to a given representation.  
-Accuracy is defined as the reproducibility of the result.  For example, the calculation $1/3 = 0.25$ is imprecise but can be accurate provided 
-it is reproducible.
-
-The benefit of multiple precision representations over single precision representations is that 
-often no precision is lost while representing the result of an operation which requires excess precision.  For example, 
-the multiplication of two $n$-bit integers requires at least $2n$ bits to represent the result.  A multiple precision 
-system would augment the precision of the destination to accommodate the result while a single precision system would
-truncate excess bits to maintain a fixed level of precision.
-
-Multiple precision representations allow for the precision to be very high (\textit{if not exacting}) but at a cost of
-modest computer resources.  The only reasonable case where a multiple precision system will lose precision is when
-emulating a floating point data type.  However, with multiple precision integer arithmetic no precision is lost.
-
-\subsection{Basis of Operations}
-At the heart of all multiple precision integer operations are the ``long-hand'' algorithms we all learned as children 
-in grade school.  For example, to multiply $1,234$ by $981$ the student is not taught to memorize the times table for 
-$1,234$, instead they are taught how to long-multiply.  That is to multiply each column using simple single digit 
-multiplications, line up the partial results, and add the resulting products by column.  The representation that most 
-are familiar with is known as decimal or formally as radix-10. A radix-$n$ representation simply means there are 
-$n$ possible values per digit.  For example, binary would be a radix-2 representation.
-
-In essence computer based multiple precision arithmetic is very much the same.  The most notable difference is the usage
-of a binary friendly radix.  That is to use a radix of the form $2^k$ where $k$ is typically the size of a machine 
-register.  Also occasionally more optimal algorithms are used to perform certain operations such as multiplication and 
-squaring instead of traditional long-hand algorithms.
-
-\section{Purpose of This Text}
-The purpose of this text is to instruct the reader regarding how to implement multiple precision algorithms.  That is 
-to not only explain the core theoretical algorithms but also the various ``house keeping'' tasks that are neglected by
-authors of other texts on the subject.  Texts such as \cite[HAC]{HAC} and \cite{TAOCPV2} give considerably detailed 
-explanations of the theoretical aspects of the algorithms and very little regarding the practical aspects.  
-
-How an algorithm is explained and how it is actually implemented are two very different 
-realities.  For example, algorithm 14.7 on page 594 of HAC lists a relatively simple algorithm for performing multiple 
-precision integer addition.  However, what the description lacks is any discussion concerning the fact that the two 
-integer inputs may be of differing magnitudes.  Similarly the division routine (\textit{Algorithm 14.20, pp. 598}) 
-does not discuss how to handle sign or handle the dividend's decreasing magnitude in the main loop (\textit{Step \#3}).
-
-As well as the numerous practical oversights both of the texts do not discuss several key optimal algorithms required 
-such as ``Comba'' and Karatsuba multipliers and fast modular inversion.  These optimal algorithms are vital to achieve 
-any form of useful performance in non-trivial applications.  
-
-To solve this problem the focus of this text is on the practical aspects of implementing the algorithms that 
-constitute a multiple precision integer package with light discussions on the theoretical aspects.  As a case 
-study the ``LibTomMath''\footnote{Available freely at http://math.libtomcrypt.org} package is used to demonstrate 
-algorithms with implementations that have been field tested and work very well.
-
-\section{Discussion and Notation}
-\subsection{Notation}
-A multiple precision integer of $n$-digits shall be denoted as $x = (x_{n-1} ... x_1 x_0)_{ \beta }$ to be the 
-multiple precision notation for the integer $x \equiv \sum_{i=0}^{n-1} x_i\beta^i$.  The elements of the array $x$ are
-said to be the radix $\beta$ digits of the integer.  For example, $x = (1,2,3)_{10}$ would represent the 
-integer $1\cdot 10^2 + 2\cdot10^1 + 3\cdot10^0 = 123$.  
-
-A ``mp\_int'' shall refer to a composite structure which contains the digits of the integer as well as auxiliary data
-required to manipulate the data.  These additional members are discussed in ~BASICOP~.  For the purposes of this text
-a ``multiple precision integer'' and a ``mp\_int'' are synonymous.
-
-\index{single-precision} \index{double-precision} \index{mp\_digit} \index{mp\_word}
-For the purposes of this text a single-precision variable must be able to represent integers in the range $0 \le x < 2 \beta$ while
-a double-precision variable must be able to represent integers in the range $0 \le x < 2 \beta^2$.  Within the source code that will be
-presented the data type \textbf{mp\_digit} will represent a single-precision type while \textbf{mp\_word} will represent a 
-double-precision type.  In several algorithms (\textit{notably the Comba routines}) temporary results 
-will be stored in double-precision arrays.  For the purposes of this text $x_j$ will refer to the 
-$j$'th digit of a single-precision array and $\hat x_j$ will refer to the $j$'th digit of a double-precision
-array.
-
-The $\lfloor \mbox{ } \rfloor$ brackets represent a value truncated and rounded down to the nearest integer.  The $\lceil \mbox{ } \rceil$ brackets 
-represent a value truncated and rounded up to the nearest integer.  Typically when the $/$ division symbol is used the intention is to perform an integer
-division.  For example, $5/2 = 2$ which will often be written as $\lfloor 5/2 \rfloor = 2$ for clarity.  When a value is presented as a fraction
-such as $5 \over 2$ a real value division is implied.
-
-\subsection{Work Effort}
-\index{big-O}
-To measure the efficiency of various algorithms a modified big-O notation is used.  In this system all 
-single precision operations are considered to have the same cost\footnote{Except where explicitly noted.}.  
-That is a single precision addition, multiplication and division are assumed to take the same time to 
-complete.  While this is generally not true in practice it will simplify the discussions considerably.
-
-Some algorithms have slight advantages over others which is why some constants will not be removed in 
-the notation.  For example, a normal multiplication requires $O(n^2)$ work while a squaring requires 
-$O({{n^2 + n}\over 2})$ work.  In standard big-O notation these would be said to be equivalent.  However, in the 
-context of this text the magnitude of the inputs will not approach an infinite size.  This means the conventional limit 
-notation wisdom does not apply to the cancellation of constants.
-
-Throughout the discussions various ``work levels'' will be discussed.  These levels are the $O(1)$,
-$O(n)$, $O(n^2)$, ..., $O(n^k)$ work efforts.  For example, operations at the $O(n^k)$ ``level'' are said to be
-executed more frequently than operations at the $O(n^m)$ ``level'' when $k > m$.  Obviously most optimizations will pay
-off the most at the higher levels since they represent the bulk of the effort required.  
-
-\section{Exercises}
-Within the more advanced chapters a section will be set aside to give the reader some challenging exercises.  These exercises are not 
-designed to be prize winning problems, but to be thought provoking.  Wherever possible the problems are forward minded stating 
-problems that will be answered in subsequent chapters.  The reader is encouraged to finish the exercises as they appear to get a 
-better understanding of the subject material.  
-
-Similar to the exercises of \cite{TAOCPV2} as explained on pp.\textit{ix} these exercises are given a scoring system.  However, unlike 
-\cite{TAOCPV2} the problems do not get nearly as hard as often.  The scoring of these exercises ranges from one (\textit{the easiest}) to
-five (\textit{the hardest}).  The following table summarizes the scoring.
-
-\vspace{5mm}
-\begin{tabular}{cl}
-$\left [ 1 \right ]$ & An easy problem that should only take the reader a matter of \\
-                     & minutes to solve.  Usually does not involve much computer time. \\
-                     & \\
-$\left [ 2 \right ]$ & An easy problem that involves a marginal amount of computer \\
-                     & time usage.  Usually requires a program to be written to \\
-                     & solve the problem. \\
-                     & \\
-$\left [ 3 \right ]$ & A moderately hard problem that requires a non-trivial amount \\
-                     & of work.  Usually involves trivial research and development of \\
-                     & new theory from the perspective of a student. \\
-                     & \\
-$\left [ 4 \right ]$ & A moderately hard problem that involves a non-trivial amount \\
-                     & of work and research.  The solution to which will demonstrate \\
-                     & a higher mastery of the subject matter. \\
-                     & \\
-$\left [ 5 \right ]$ & A hard problem that involves concepts that are non-trivial.  \\
-                     & Solutions to these problems will demonstrate a complete mastery \\
-                     & of the given subject. \\
-                     & \\
-\end{tabular}
-
-Essentially problems at the first level are meant to be simple questions that the reader can answer quickly without programming a solution or
-devising new theory.  These problems are quick tests to see if the material is understood.  Problems at the second level are also
-designed to be easy but will require a program or algorithm to be implemented to arrive at the answer.  
-
-Problems at the third level are meant to be a bit more difficult.  Often the answer is fairly obvious but arriving at an exacting solution
-requires some thought and skill.  These problems will almost always involve devising a new algorithm or implementing a variation of
-another algorithm.
-
-Problems at the fourth level are meant to be even more difficult as well as involve some research.  The reader will most likely not know
-the answer right away nor will this text provide the exact details of the answer (\textit{or at least not until a subsequent chapter}).  Problems
-at the fifth level are meant to be the hardest problems relative to all the other problems in the chapter.  People who can correctly 
-answer fifth level problems have a mastery of the subject matter at hand.
-
-Often problems will be tied together.  The purpose of this is to start a chain of thought that will be discussed in future chapters.  The reader
-is encouraged to answer the follow-up problems and try to draw the relevance of problems.
-
-\chapter{Introduction to LibTomMath}
-
-\section{What is LibTomMath?}
-LibTomMath is a free and open source multiple precision library written in portable ISO C source code.  By portable it is 
-meant that the library does not contain any code that is computer platform dependent or otherwise problematic to use on any 
-given platform.  The library has been successfully tested under numerous operating systems including Solaris, MacOS, Windows, 
-Linux, PalmOS and on standalone hardware such as the Gameboy Advance.  The library is designed to contain enough 
-functionality to be able to develop applications such as public key cryptosystems.
-
-\section{Goals of LibTomMath}
-
-Even though the library is written entirely in portable ISO C considerable care has been taken to 
-optimize the algorithm implementations within the library.  Specifically the code has been written to work well with
-the GNU C Compiler (\textit{GCC}) on both x86 and ARMv4 processors.  Wherever possible highly efficient 
-algorithms (\textit{such as Karatsuba multiplication, sliding window exponentiation and Montgomery reduction}) have 
-been provided to make the library as efficient as possible.  Even with the optimal and sometimes specialized 
-algorithms that have been included the Application Programming Interface (\textit{API}) has been kept as simple as possible.  
-Often generic place holder routines will make use of specialized algorithms automatically without the developer's
-attention.  One such example is the generic multiplication algorithm \textbf{mp\_mul()} which will automatically use 
-Karatsuba multiplication if the inputs are of a specific size.
-
-Making LibTomMath as efficient as possible is not the only goal of the LibTomMath project.  Ideally the library should 
-be source compatible with another popular library which makes it more attractive for developers to use.  In this case the
-MPI library was used as an API template for all the basic functions.
-
-The project is also meant to act as a learning tool for students.  The logic being that no easy-to-follow ``bignum'' 
-library exists which can be used to teach computer science students how to perform fast and reliable multiple precision 
-arithmetic.  To this end the source code has been given quite a few comments and algorithm discussion points.  Often routines have 
-more comments than lines of code.
-
-\section{Choice of LibTomMath}
-LibTomMath was chosen as the case study of this text not only because the author of both projects is one and the same but
-for more worthy reasons.  Other libraries such as GMP, MPI, LIP and OpenSSL have multiple precision 
-integer arithmetic routines but would not be ideal for this text for reasons as will be explained in the 
-following sub-sections.
-
-\subsection{Code Base}
-The LibTomMath code base is all portable ISO C source code.  This means that there are no platform dependent conditional
-segments of code littered throughout the source.  This clean and uncluttered approach to the library means that a
-developer can more readily ascertain the true intent of a given section of source code without trying to keep track of
-what conditional code will be used.
-
-The code base of LibTomMath is also well organized.  Each function is in its own separate source code file 
-which allows the reader to find a given function very fast.  When compiled with GCC for the x86 processor the entire 
-library is a mere 87,760 bytes (\textit{$116,182$ bytes for ARMv4 processors}).  This includes every single function 
-LibTomMath provides from basic arithmetic to various number theoretic functions such as modular exponentiation, various 
-reduction algorithms and Jacobi symbol computation.  
-
-By comparison MPI which has fewer functions than LibTomMath compiled with the same conditions is 45,429 bytes 
-(\textit{$54,536$ for ARMv4}).  GMP which has a rather large collection of functions with the default configuration on an 
-x86 Athlon is 2,950,688 bytes.  Note that while LibTomMath has fewer functions than GMP it has been used as the sole basis 
-for several public key cryptosystems without having to seek additional outside functions to supplement the library.
-
-\subsection{API Simplicity}
-LibTomMath is designed after the MPI library and shares the API design.  Quite often programs that use MPI will build 
-with LibTomMath without change. The function names are relatively straight forward as to what they perform.  Almost all of the 
-functions except for a few minor exceptions which as will be discussed are for good reasons share the same parameter passing 
-convention.  The learning curve is fairly shallow with the API provided which is an extremely valuable benefit for the 
-student and developer alike.  
-
-The LIP library is an example of a library with an API that is awkward to work with.  LIP uses function names that are often ``compressed'' to 
-illegible short hand.  LibTomMath does not share this fault.
-
-\subsection{Optimizations}
-While LibTomMath is certainly not the fastest library (\textit{GMP often beats LibTomMath by a factor of two}) it does
-feature a set of optimal algorithms for tasks ranging from modular reduction to squaring.  GMP and LIP also feature
-such optimizations while MPI only uses baseline algorithms with no optimizations.
-
-LibTomMath is almost always an order of magnitude faster than the MPI library at computationally expensive tasks such as modular
-exponentiation.  In the grand scheme of ``bignum'' libraries LibTomMath is faster than the average library and usually  
-slower than the best libraries such as GMP and OpenSSL by a small factor.
-
-\subsection{Portability and Stability}
-LibTomMath will build ``out of the box'' on any platform equipped with a modern version of the GNU C Compiler 
-(\textit{GCC}).  This means that without changes the library will build without configuration or setting up any 
-variables.  LIP and MPI will build ``out of the box'' as well but have numerous known bugs.  Most notably the author of 
-MPI is not working on his library anymore.  
-
-GMP requires a configuration script to run and will not build out of the box.   GMP and LibTomMath are still in active
-development and are very stable across a variety of platforms.
-
-\subsection{Choice}
-LibTomMath is a relatively compact, well documented, highly optimized and portable library which seems only natural for
-the case study of this text.  Various source files from the LibTomMath project will be included within the text.  However, the 
-reader is encouraged to download their own copy of the library to actually be able to work with the library.  
-
-\chapter{Getting Started}
-MARK,BASICOP
-\section{Library Basics}
-To begin the design of a multiple precision integer library a primitive data type and a series of primitive algorithms must be established.  A data
-type that will hold the information required to maintain a multiple precision integer must be designed.  With this basic data type a series
-of low level algorithms for initializing, clearing, growing and optimizing multiple precision integers can be developed to form the basis of 
-the entire library of algorithms.
-
-\section{What is a Multiple Precision Integer?}
-Recall that most programming languages (\textit{in particular C}) only have fixed precision data types that on their own cannot be used
-to represent values larger than their precision alone will allow. The purpose of multiple precision algorithms is to use these fixed precision
-data types to create multiple precision integers which may represent values that are much larger.  
-
-As a well known analogy, school children are taught how to form numbers larger than nine by prepending more radix ten digits.  In the decimal system
-the largest value is only $9$ since the digits may only have values from $0$ to $9$.  However, by concatenating digits together larger numbers 
-may be represented.  Computer based multiple precision arithmetic is essentially the same concept except with a different radix.
-
-What most people probably do not think about explicitly are the various other attributes that describe a multiple precision integer.  For example,
-the integer $154_{10}$ has two immediately obvious properties.  First, the integer is positive, that is the sign of this particular integer 
-is positive as opposed to negative.  Second, the integer has three digits in its representation.  There is an additional property that the integer 
-possesses that does not concern pencil-and-paper arithmetic.  The third property is how many digits are allowed for the integer.  
-
-The human analogy of this third property is ensuring there is enough space on the paper to write the integer.  Computers must maintain a
-strict control on memory usage with respect to the digits of a multiple precision integer.  These three properties make up what is known
-as a multiple precision integer or mp\_int for short.  
-
-\subsection{The mp\_int structure}
-The mp\_int structure is the ISO C based manifestation of what represents a multiple precision integer.  The ISO C standard does not provide for 
-any such data type but it does provide for making composite data types known as structures.  The following is the structure definition 
-used within LibTomMath.
-
-\index{mp\_int}
-\begin{verbatim}
-typedef struct  {
-    int used, alloc, sign;
-    mp_digit *dp;
-} mp_int;
-\end{verbatim}
-
-The mp\_int structure can be broken down as follows.
-
-\begin{enumerate}
-\item The \textbf{used} parameter denotes how many digits of the array \textbf{dp} contain the digits used to represent
-a given integer.  The \textbf{used} count must not exceed the \textbf{alloc} count.  
-
-\item The array \textbf{dp} holds the digits that represent the given integer.  It is padded with $\textbf{alloc} - \textbf{used}$ zero
-digits.
-
-\item The \textbf{alloc} parameter denotes how 
-many digits are available in the array to use by functions before it has to increase in size.  When the \textbf{used} count 
-of a result would exceed the \textbf{alloc} count all of the algorithms will automatically increase the size of the 
-array to accommodate the precision of the result.  
-
-\item The \textbf{sign} parameter denotes the sign as either zero/positive (\textbf{MP\_ZPOS}) or negative (\textbf{MP\_NEG}).  
-\end{enumerate}
-
-\section{Argument Passing}
-A convention of argument passing must be adopted early on in the development of any library.  Making the function prototypes
-consistent will help eliminate many headaches in the future as the library grows to significant complexity.  In LibTomMath the multiple precision 
-integer functions accept parameters from left to right as pointers to mp\_int structures.  That means that the source operands are 
-placed on the left and the destination on the right.   Consider the following examples.
-
-\begin{verbatim}
-   mp_mul(&a, &b, &c);   /* c = a * b */
-   mp_add(&a, &b, &a);   /* a = a + b */
-   mp_sqr(&a, &b);       /* b = a * a */
-\end{verbatim}
-
-The left to right order is a fairly natural way to implement the functions since it lets the developer read aloud the
-functions and make sense of them.  For example, the first function would read ``multiply a and b and store in c''.
-
-Certain libraries (\textit{LIP by Lenstra for instance}) accept parameters the other way around.  That is the destination
-on the left and arguments on the right.  In truth it is entirely a matter of preference.  In the case of LibTomMath the 
-convention from the MPI library has been adopted.  
-
-Another very useful design consideration is whether to allow argument sources to also be a destination.  For example, the
-second example (\textit{mp\_add}) adds $a$ to $b$ and stores in $a$.  This is an important feature to implement since it
-allows the higher up functions to cut down on the number of variables.  However, to implement this feature specific
-care has to be given to ensure the destination is not modified before the source is fully read.
-
-\section{Return Values}
-A well implemented library, no matter what its purpose, should trap as many runtime errors as possible and return them to the 
-caller.  By catching runtime errors a library can be guaranteed to prevent undefined behaviour.  In a multiple precision 
-library the only errors that can occur are related to inappropriate inputs (\textit{division by zero for instance}) or 
-memory allocation errors.
-
-In LibTomMath any function that can cause a runtime error will return an error as an \textbf{int} data type with one of the 
-following values.
-
-\index{MP\_OKAY} \index{MP\_VAL} \index{MP\_MEM}
-\begin{center}
-\begin{tabular}{|l|l|}
-\hline \textbf{Value} & \textbf{Meaning} \\
-\hline \textbf{MP\_OKAY} & The function was successful \\
-\hline \textbf{MP\_VAL}  & One of the input value(s) was invalid \\
-\hline \textbf{MP\_MEM}  & The function ran out of heap memory \\
-\hline
-\end{tabular}
-\end{center}
-
-When an error is detected within a function it should free any memory it allocated and return as soon as possible.  The goal
-is to leave the system in the same state the system was when the function was called.  Error checking with this style of API is fairly simple.
-
-\begin{verbatim}
-   int err;
-   if ((err = mp_add(&a, &b, &c)) != MP_OKAY) {
-      printf("Error: %d\n", err);
-      exit(EXIT_FAILURE);
-   }
-\end{verbatim}
-
-The GMP library uses C style \textit{signals} to flag errors which is of questionable use.  Not all errors are fatal 
-and it was not deemed ideal by the author of LibTomMath to force developers to have signal handlers for such cases.
-
-\section{Initialization and Clearing}
-The logical starting point when actually writing multiple precision integer functions is the initialization and 
-clearing of the integers.  These two functions will be used by far the most throughout the algorithms whenever 
-temporary integers are required.
-
-Given the basic mp\_int structure an initialization routine must first allocate memory to hold the digits of
-the integer.  Often it is optimal to allocate a sufficiently large pre-set number of digits even considering
-the initial integer will represent zero.  If only a single digit were allocated quite a few re-allocations
-would occur for the majority of inputs.  There is a tradeoff between how many default digits to allocate
-and how many re-allocations are tolerable.  
-
-If the memory for the digits has been successfully allocated then the rest of the members of the structure must
-be initialized.  Since the initial state is to represent a zero integer the digits allocated must all be zeroed.  The
-\textbf{used} count is set to zero and \textbf{sign} is set to \textbf{MP\_ZPOS}.
-
-\subsection{Initializing an mp\_int}
-To initialize an mp\_int the mp\_init algorithm shall be used.  The purpose of this algorithm is to allocate 
-the memory required and initialize the integer to a default representation of zero.
-
-\begin{figure}[here]
-\begin{center}
-\begin{tabular}{l}
-\hline Algorithm \textbf{mp\_init}. \\
-\textbf{Input}.   An mp\_int $a$ \\
-\textbf{Output}.  Allocate memory for the digits and set to a zero state. \\
-\hline \\
-1.  Allocate memory for \textbf{MP\_PREC} digits. \\
-2.  If the allocation failed then return(\textit{MP\_MEM}) \\
-3.  for $n$ from $0$ to $MP\_PREC - 1$ do  \\
-\hspace{3mm}3.1  $a_n \leftarrow 0$\\
-4.  $a.sign \leftarrow MP\_ZPOS$\\
-5.  $a.used \leftarrow 0$\\
-6.  $a.alloc \leftarrow MP\_PREC$\\
-7.  Return(\textit{MP\_OKAY})\\
-\hline
-\end{tabular}
-\end{center}
-\caption{Algorithm mp\_init}
-\end{figure}
-
-\textbf{Algorithm mp\_init.}
-The \textbf{MP\_PREC} variable is a simple constant used to dictate minimal precision of allocated integers.  It is ideally at least equal to $32$ but 
-can be any reasonable power of two.  Steps one and two allocate the memory and account for it.  If the allocation fails the algorithm returns
-immediately to signal the failure.  Step three will ensure that all the digits are in the default state of zero.  Finally steps 
-four through six set the default settings of the \textbf{sign}, \textbf{used} and \textbf{alloc} members of the mp\_int structure.
-
-EXAM,bn_mp_init.c
-
-The \textbf{OPT\_CAST} type cast on line @22,OPT_CAST@ is designed to allow C++ compilers to build the code out of
-the box.  Microsoft C V5.00 is known to cause problems without the cast.  Also note that if the memory
-allocation fails the other members of the mp\_int will be in an undefined state.  The code from 
-line @29,a->used@ to line @31,a->sign@ sets the default state for a mp\_int which is zero, positive and no used digits.
-
-\subsection{Clearing an mp\_int}
-When an mp\_int is no longer required the memory allocated for it can be cleared from the heap with 
-the mp\_clear algorithm.
-
-\begin{figure}[here]
-\begin{center}
-\begin{tabular}{l}
-\hline Algorithm \textbf{mp\_clear}. \\
-\textbf{Input}.   An mp\_int $a$ \\
-\textbf{Output}.  The memory for $a$ is cleared. \\
-\hline \\
-1.  If $a$ has been previously freed then return(\textit{MP\_OKAY}). \\
-2.  Free the digits of $a$ and mark $a$ as freed. \\
-3.  $a.used \leftarrow 0$ \\
-4.  $a.alloc \leftarrow 0$ \\
-5.  Return(\textit{MP\_OKAY}). \\
-\hline
-\end{tabular}
-\end{center}
-\caption{Algorithm mp\_clear}
-\end{figure}
-
-\textbf{Algorithm mp\_clear.}
-In steps one and two the memory for the digits is only freed if it has not been previously released.  
-This is more of a concern for the implementation since it is used to prevent ``double-free'' errors.  It also helps catch
-code errors where mp\_ints are used after being cleared.  Similarly steps three and four set the 
-\textbf{used} and \textbf{alloc} to known values which would be easy to spot during debugging.  For example, if an mp\_int is expected
-to be non-zero and its \textbf{used} member is observed to be zero (\textit{due to being cleared}) then an obvious bug in the code has been
-spotted.
-
-EXAM,bn_mp_clear.c
-
-The \textbf{if} statement on line @21,a->dp != NULL@ prevents the heap from being corrupted if a user double-frees an 
-mp\_int.  For example, a trivial case of this bug would be as follows.
-
-\begin{verbatim}
-mp_int a;
-mp_init(&a);
-mp_clear(&a);
-mp_clear(&a);
-\end{verbatim}
-
-Without that check the code would try to free the memory allocated for the digits twice which will cause most standard C
-libraries to cause a fault.  Also by setting the pointer to \textbf{NULL} it helps debug code that may inadvertently 
-free the mp\_int before it is truly not needed.  The allocated digits are set to zero before being freed on line @24,memset@.  
-This is ideal for cryptographic situations where the mp\_int is a secret parameter.
-
-The following snippet is an example of using both the init and clear functions.  
-
-\begin{small}
-\begin{verbatim}
-#include <tommath.h>
-#include <stdio.h>
-#include <stdlib.h>
-int main(void)
-{
-   mp_int num;
-   int err;
-   
-   /* init the bignum */
-   if ((err = mp_init(&num)) != MP_OKAY) {
-      printf("Error: %d\n", err);
-      return EXIT_FAILURE;
-   }
-   
-   /* do work with it ... */
-   
-   /* clear up */
-   mp_clear(&num);
-   
-   return EXIT_SUCCESS;
-}
-\end{verbatim}
-\end{small}
-
-\section{Other Initialization Routines}
-
-It is often helpful to have specialized initialization algorithms to simplify the design of other algorithms.  For example, an 
-initialization followed by a copy is a common operation when temporary copies of integers are required.  It is quite
-beneficial to have a series of simple helper functions available.
-
-\subsection{Initializing Variable Sized mp\_int Structures}
-Occasionally the number of digits required will be known in advance of an initialization.  In these
-cases the mp\_init\_size algorithm can be of use.  The purpose of this algorithm is similar to mp\_init except that 
-it will allocate \textit{at least} a specified number of digits.  This is ideal to prevent re-allocations when the 
-input size is known.
-
-\newpage\begin{figure}[here]
-\begin{center}
-\begin{tabular}{l}
-\hline Algorithm \textbf{mp\_init\_size}. \\
-\textbf{Input}.   An mp\_int $a$ and the requested number of digits $b$\\
-\textbf{Output}.  $a$ is initialized to hold at least $b$ digits. \\
-\hline \\
-1.  $u \leftarrow b\mbox{ (mod }MP\_PREC\mbox{)}$ \\
-2.  $v \leftarrow b + 2 \cdot MP\_PREC - u$ \\
-3.  Allocate $v$ digits. \\
-4.  If the allocation failed then return(\textit{MP\_MEM}). \\
-5.  for $n$ from $0$ to $v - 1$ do \\
-\hspace{3mm}5.1  $a_n \leftarrow 0$ \\
-6.  $a.sign \leftarrow MP\_ZPOS$\\
-7.  $a.used \leftarrow 0$\\
-8.  $a.alloc \leftarrow v$\\
-9.  Return(\textit{MP\_OKAY})\\
-\hline
-\end{tabular}
-\end{center}
-\caption{Algorithm mp\_init\_size}
-\end{figure}
-
-\textbf{Algorithm mp\_init\_size.}
-The value of $v$ is calculated to be at least the requested amount of digits $b$ plus additional padding.  The padding is calculated
-to be at least \textbf{MP\_PREC} digits plus enough digits to make the digit count a multiple of \textbf{MP\_PREC}.  This padding is used to 
-prevent trivial allocations from becoming a bottleneck in the rest of the algorithms that depend on this.
-
-EXAM,bn_mp_init_size.c
-
-Line @23,MP_PREC@ will ensure that the number of digits actually allocated is padded up to the next multiple of 
-\textbf{MP\_PREC} plus an additional \textbf{MP\_PREC}.  This ensures that the number of allocated digits is 
-always greater than the amount requested.  As a result it prevents many trivial memory allocations.  The value of 
-\textbf{MP\_PREC} is defined in ``tommath.h'' and must be a power of two.
-
-\subsection{Creating a Clone}
-Another common sequence of operations is to make a local temporary copy of an argument.  To initialize then copy a mp\_int will be known as 
-creating a clone.  This is useful within functions that need to modify an integer argument but do not wish to actually modify the original copy.  
-The mp\_init\_copy algorithm will perform this very task.
-
-\begin{figure}[here]
-\begin{center}
-\begin{tabular}{l}
-\hline Algorithm \textbf{mp\_init\_copy}. \\
-\textbf{Input}.   An mp\_int $a$ and $b$\\
-\textbf{Output}.  $a$ is initialized to be a copy of $b$. \\
-\hline \\
-1.  Init $a$.  (\textit{mp\_init}) \\
-2.  If the init of $a$ was unsuccessful return(\textit{MP\_MEM}) \\
-3.  Copy $b$ to $a$.  (\textit{mp\_copy}) \\
-4.  Return the status of the copy operation. \\
-\hline
-\end{tabular}
-\end{center}
-\caption{Algorithm mp\_init\_copy}
-\end{figure}
-
-\textbf{Algorithm mp\_init\_copy.}
-This algorithm will initialize a mp\_int variable and copy another previously initialized mp\_int variable into it.  The algorithm will
-detect when the initialization fails and returns the error to the calling algorithm.  As such this algorithm will perform two operations
-in one step.  
-
-EXAM,bn_mp_init_copy.c
-
-This will initialize \textbf{a} and make it a verbatim copy of the contents of \textbf{b}.  Note that 
-\textbf{a} will have its own memory allocated which means that \textbf{b} may be cleared after the call
-and \textbf{a} will be left intact.  
-
-\subsection{Multiple Integer Initializations And Clearings}
-Occasionally a function will require a series of mp\_int data types to be made available.  The mp\_init\_multi algorithm
-is provided to simplify such cases.  The purpose of this algorithm is to initialize a variable length array of mp\_int 
-structures at once.  As a result algorithms that require multiple integers only have to use 
-one algorithm to initialize all the mp\_int variables.
-
-\begin{figure}[here]
-\begin{center}
-\begin{tabular}{l}
-\hline Algorithm \textbf{mp\_init\_multi}. \\
-\textbf{Input}.   Variable length array of mp\_int variables of length $k$. \\
-\textbf{Output}.  The array is initialized such that each mp\_int is ready to use. \\
-\hline \\
-1.  for $n$ from 0 to $k - 1$ do \\
-\hspace{+3mm}1.1.  Initialize the $n$'th mp\_int (\textit{mp\_init}) \\
-\hspace{+3mm}1.2.  If initialization failed then do \\
-\hspace{+6mm}1.2.1.  for $j$ from $0$ to $n$ do \\
-\hspace{+9mm}1.2.1.1.  Free the $j$'th mp\_int (\textit{mp\_clear}) \\
-\hspace{+6mm}1.2.2.   Return(\textit{MP\_MEM}) \\
-2.  Return(\textit{MP\_OKAY}) \\
-\hline
-\end{tabular}
-\end{center}
-\caption{Algorithm mp\_init\_multi}
-\end{figure}
-
-\textbf{Algorithm mp\_init\_multi.}
-The algorithm will initialize the array of mp\_int variables one at a time.  As soon as a runtime error is detected (\textit{step 1.2}) all of
-the previously initialized variables are cleared.  The goal is an ``all or nothing'' initialization which allows for quick recovery from runtime 
-errors.
-
-Similarly to clear a variable length array of mp\_int structures the mp\_clear\_multi algorithm will be used.
-
-Consider the following snippet which demonstrates how to use both routines.
-\begin{small}
-\begin{verbatim}
-#include <tommath.h>
-#include <stdio.h>
-#include <stdlib.h>
-int main(void)
-{
-   mp_int num1, num2, num3;
-   int err;
-   
-   if ((err = mp_init_multi(&num1, &num2, &num3, NULL)) != MP_OKAY) {
-      printf("Error: %d\n", err);
-      return EXIT_FAILURE;
-   }
-   
-   /* at this point num1/num2/num3 are ready */
-   
-   /* free them */
-   mp_clear_multi(&num1, &num2, &num3, NULL);
-   
-   return EXIT_SUCCESS;
-}
-\end{verbatim}
-\end{small}
-
-Note how both lists are terminated with the \textbf{NULL} variable.  This indicates to the algorithms to stop fetching parameters off
-of the stack.  If it is not present the functions will most likely cause a segmentation fault.  
-
-EXAM,bn_mp_multi.c
-
-Both routines are implemented in the same source file since they are typically used in conjunction with each other.  
-
-\section{Maintenance}
-A small useful collection of mp\_int maintenance functions will also prove useful.  
-
-\subsection{Augmenting Integer Precision}
-When storing a value in an mp\_int sufficient digits must be available to accommodate the entire value without
-loss of precision.  Quite often the size of the array given by the \textbf{alloc} member is large enough to simply
-increase the \textbf{used} digit count.  However, when the size of the array is too small it must be re-sized 
-appropriately to accommodate the result.  The mp\_grow algorithm will provide this functionality.
-
-\begin{figure}[here]
-\begin{center}
-\begin{tabular}{l}
-\hline Algorithm \textbf{mp\_grow}. \\
-\textbf{Input}.   An mp\_int $a$ and an integer $b$. \\
-\textbf{Output}.  $a$ is expanded to accommodate $b$ digits. \\
-\hline \\
-1.  if $a.alloc \ge b$ then return(\textit{MP\_OKAY}) \\
-2.  $u \leftarrow b\mbox{ (mod }MP\_PREC\mbox{)}$ \\
-3.  $v \leftarrow b + 2 \cdot MP\_PREC - u$ \\
-4.  Re-Allocate the array of digits $a$ to size $v$ \\
-5.  If the allocation failed then return(\textit{MP\_MEM}). \\
-6.  for $n$ from $a.alloc$ to $v - 1$ do  \\
-\hspace{+3mm}6.1  $a_n \leftarrow 0$ \\
-7.  $a.alloc \leftarrow v$ \\
-8.  Return(\textit{MP\_OKAY}) \\
-\hline
-\end{tabular}
-\end{center}
-\caption{Algorithm mp\_grow}
-\end{figure}
-
-\textbf{Algorithm mp\_grow.}
-Step one will prevent a re-allocation from being performed if it was not required.  This is useful to prevent mp\_ints
-from growing excessively in code that erroneously calls mp\_grow.  Similar to mp\_init\_size the requested digit count
-is padded to provide more digits than requested.  
-
-In step four it is assumed that the reallocation leaves the lower $a.alloc$ digits intact.  This is much akin to how the 
-\textit{realloc} function from the standard C library works.  Since the newly allocated digits are assumed to contain
-undefined values they are also initially zeroed.
-
-EXAM,bn_mp_grow.c
-
-The first step is to see if we actually need to perform a re-allocation at all.  This is tested for on line 
-@24,a->alloc < size@.  Similar to mp\_init\_size the same code on line @26,MP_PREC - 1@ was used to resize the 
-digits requested.  A simple for loop from line @34,a->alloc@ to line @38,}@ will zero all digits that were above the 
-old \textbf{alloc} limit to make sure the integer is in a known state.
-
-\subsection{Clamping Excess Digits}
-When a function anticipates a result will be $n$ digits it is simpler to assume this is true within the body of 
-the function.  For example, a multiplication of an $i$ digit number by a $j$ digit number produces a result of at most 
-$i + j$ digits.  It is entirely possible that the result is $i + j - 1$ though, with no final carry into the last 
-position.  However, suppose the destination had to be first expanded (\textit{via mp\_grow}) to accommodate $i + j - 1$
-digits then further expanded to accommodate the final carry.  That would be a considerable waste of time since heap
-operations are relatively slow.
-
-The ideal solution is to always assume the result is $i + j$ and fix up the \textbf{used} count after the function
-terminates.  This way a single heap operation (\textit{at most}) is required.  However, if the result was not checked
-there would be an excess high order zero digit.  
-
-For example, suppose the product of two integers was $x_n = (0x_{n-1}x_{n-2}...x_0)_{\beta}$.  The leading zero digit 
-will not contribute to the precision of the result.  In fact, through subsequent operations more leading zero digits would
-accumulate to the point the size of the integer would be prohibitive.  As a result even though the precision is very 
-low the representation is excessively large.  
-
-The mp\_clamp algorithm is designed to solve this very problem.  It will trim leading zeros by decrementing the 
-\textbf{used} count until a non-zero leading digit is found.  Also in this system, zero is considered to be a positive 
-number which means that if the \textbf{used} count is decremented to zero the sign must be set to \textbf{MP\_ZPOS}.
-
-\begin{figure}[here]
-\begin{center}
-\begin{tabular}{l}
-\hline Algorithm \textbf{mp\_clamp}. \\
-\textbf{Input}.   An mp\_int $a$ \\
-\textbf{Output}.  Any excess leading zero digits of $a$ are removed \\
-\hline \\
-1.  while $a.used > 0$ and $a_{a.used - 1} = 0$ do \\
-\hspace{+3mm}1.1  $a.used \leftarrow a.used - 1$ \\
-2.  if $a.used = 0$ then do \\
-\hspace{+3mm}2.1  $a.sign \leftarrow MP\_ZPOS$ \\
-\hline \\
-\end{tabular}
-\end{center}
-\caption{Algorithm mp\_clamp}
-\end{figure}
-
-\textbf{Algorithm mp\_clamp.}
-As can be expected this algorithm is very simple.  The loop on step one is expected to iterate only once or twice at
-the most.  For example, this will happen in cases where there is not a carry to fill the last position.  Step two fixes the sign for 
-when all of the digits are zero to ensure that the mp\_int is valid at all times.
-
-EXAM,bn_mp_clamp.c
-
-Note on line @27,while@ how the test for the \textbf{used} count is made on the left of the \&\& operator.  In the C programming
-language the terms to \&\& are evaluated left to right with a boolean short-circuit if any condition fails.  This is 
-important since if the \textbf{used} is zero the test on the right would fetch below the array.  That is obviously 
-undesirable.  The parenthesis on line @28,a->used@ is used to make sure the \textbf{used} count is decremented and not
-the pointer ``a''.  
-
-\section*{Exercises}
-\begin{tabular}{cl}
-$\left [ 1 \right ]$ & Discuss the relevance of the \textbf{used} member of the mp\_int structure. \\
-                     & \\
-$\left [ 1 \right ]$ & Discuss the consequences of not using padding when performing allocations.  \\
-                     & \\
-$\left [ 2 \right ]$ & Estimate an ideal value for \textbf{MP\_PREC} when performing 1024-bit RSA \\
-                     & encryption when $\beta = 2^{28}$.  \\
-                     & \\
-$\left [ 1 \right ]$ & Discuss the relevance of the algorithm mp\_clamp.  What does it prevent? \\
-                     & \\
-$\left [ 1 \right ]$ & Give an example of when the algorithm  mp\_init\_copy might be useful. \\
-                     & \\
-\end{tabular}
-
-
-\chapter{Basic Operations}
-\section{Copying an Integer}
-After the various house-keeping routines are in place, simple algorithms can be designed to take advantage of them.  Being able
-to make a verbatim copy of an integer is a very useful function to have.  To copy an integer the mp\_copy algorithm will be used.
-
-\newpage\begin{figure}[here]
-\begin{center}
-\begin{tabular}{l}
-\hline Algorithm \textbf{mp\_copy}. \\
-\textbf{Input}.  An mp\_int $a$ and $b$. \\
-\textbf{Output}.  Store a copy of $a$ in $b$. \\
-\hline \\
-1.  Check if $a$ and $b$ point to the same location in memory. \\
-2.  If true then return(\textit{MP\_OKAY}). \\
-3.  If $b.alloc < a.used$ then grow $b$ to $a.used$ digits.  (\textit{mp\_grow}) \\
-4.  If failed to grow then return(\textit{MP\_MEM}). \\
-5.  for $n$ from 0 to $a.used - 1$ do \\
-\hspace{3mm}5.1  $b_{n} \leftarrow a_{n}$ \\
-6.  if $a.used < b.used$ then \\ 
-\hspace{3mm}6.1.  for $n$ from $a.used$ to $b.used - 1$ do \\
-\hspace{6mm}6.1.1  $b_{n} \leftarrow 0$ \\
-7.  $b.used \leftarrow a.used$ \\
-8.  $b.sign \leftarrow a.sign$ \\
-9.  return(\textit{MP\_OKAY}) \\
-\hline
-\end{tabular}
-\end{center}
-\caption{Algorithm mp\_copy}
-\end{figure}
-
-\textbf{Algorithm mp\_copy.}
-Step 1 and 2 make sure that the two mp\_ints are unique.  This allows the user to call the copy function with
-potentially the same input and not waste time.  Step 3 and 4 ensure that the destination is large enough to
-hold a copy of the input $a$.  Note that the \textbf{used} member of $b$ may be smaller than the \textbf{used}
-member of $a$ but a memory re-allocation is only required if the \textbf{alloc} member of $b$ is smaller.  This
-prevents trivial memory reallocations.
-
-Step 5 copies the digits from $a$ to $b$ while step 6 ensures that if initially $\vert b \vert > \vert a \vert$,
-the more significant digits of $b$ will be zeroed.  Finally steps 7 and 8 copy the \textbf{used} and \textbf{sign} members over 
-which completes the copy operation.
-
-EXAM,bn_mp_copy.c
-
-Source lines @23,if dst ==@-@31,}@ do the initial house keeping.  That is to see if the input is unique and if so to 
-make sure there is enough room.  If not enough space is available it returns the error and leaves the destination variable
-intact.
-
-The inner loop of the copy operation is contained between lines @34,{@ and @50,}@.  Many LibTomMath routines are designed with this source code style
-in mind, making aliases to shorten lengthy pointers (\textit{see line @38,->@ and @39,->@}) for rapid use.  Also the
-use of nested braces creates a simple way to denote various portions of code that reside on various work levels.  Here, the copy loop is at the 
-$O(n)$ level.  
-
-\section{Zeroing an Integer}
-Resetting an mp\_int to the default state is a common step in many algorithms.  The mp\_zero algorithm will be the algorithm used to
-perform this task.
-
-\begin{figure}[here]
-\begin{center}
-\begin{tabular}{l}
-\hline Algorithm \textbf{mp\_zero}. \\
-\textbf{Input}.   An mp\_int $a$ \\
-\textbf{Output}.  Zero the contents of $a$ \\
-\hline \\
-1.  $a.used \leftarrow 0$ \\
-2.  $a.sign \leftarrow$ MP\_ZPOS \\
-3.  for $n$ from 0 to $a.alloc - 1$ do \\
-\hspace{3mm}3.1  $a_n \leftarrow 0$ \\
-\hline
-\end{tabular}
-\end{center}
-\caption{Algorithm mp\_zero}
-\end{figure}
-
-\textbf{Algorithm mp\_zero.}
-This algorithm simply resets a mp\_int to the default state.  
-
-EXAM,bn_mp_zero.c
-
-After the function is completed, all of the digits are zeroed, the \textbf{used} count is zeroed and the 
-\textbf{sign} variable is set to \textbf{MP\_ZPOS}.
-
-\section{Sign Manipulation}
-\subsection{Absolute Value}
-With the mp\_int representation of an integer, calculating the absolute value is trivial.  The mp\_abs algorithm will compute
-the absolute value of an mp\_int.
-
-\begin{figure}[here]
-\begin{center}
-\begin{tabular}{l}
-\hline Algorithm \textbf{mp\_abs}. \\
-\textbf{Input}.   An mp\_int $a$ \\
-\textbf{Output}.  Computes $b = \vert a \vert$ \\
-\hline \\
-1.  Copy $a$ to $b$.  (\textit{mp\_copy}) \\
-2.  If the copy failed return(\textit{MP\_MEM}). \\
-3.  $b.sign \leftarrow MP\_ZPOS$ \\
-4.  Return(\textit{MP\_OKAY}) \\
-\hline
-\end{tabular}
-\end{center}
-\caption{Algorithm mp\_abs}
-\end{figure}
-
-\textbf{Algorithm mp\_abs.}
-This algorithm computes the absolute value of an mp\_int input.  As can be expected the algorithm is very trivial.
-
-EXAM,bn_mp_abs.c
-
-\subsection{Integer Negation}
-With the mp\_int representation of an integer, calculating the negation is also trivial.  The mp\_neg algorithm will compute
-the negative of an mp\_int input.
-
-\newpage\begin{figure}[here]
-\begin{center}
-\begin{tabular}{l}
-\hline Algorithm \textbf{mp\_neg}. \\
-\textbf{Input}.   An mp\_int $a$ \\
-\textbf{Output}.  Computes $b = -a$ \\
-\hline \\
-1.  Copy $a$ to $b$.  (\textit{mp\_copy}) \\
-2.  If the copy failed return(\textit{MP\_MEM}). \\
-3.  If $a.sign = MP\_ZPOS$ then do \\
-\hspace{3mm}3.1  $b.sign = MP\_NEG$. \\
-4.  else do \\
-\hspace{3mm}4.1  $b.sign = MP\_ZPOS$. \\
-5.  Return(\textit{MP\_OKAY}) \\
-\hline
-\end{tabular}
-\end{center}
-\caption{Algorithm mp\_neg}
-\end{figure}
-
-\textbf{Algorithm mp\_neg.}
-This algorithm computes the negation of an input.  
-
-EXAM,bn_mp_neg.c
-
-\section{Small Constants}
-\subsection{Setting Small Constants}
-Often a mp\_int must be set to a relatively small value such as $1$ or $2$.  For these cases the mp\_set algorithm is useful.
-
-\newpage\begin{figure}
-\begin{center}
-\begin{tabular}{l}
-\hline Algorithm \textbf{mp\_set}. \\
-\textbf{Input}.   An mp\_int $a$ and a digit $b$ \\
-\textbf{Output}.  Make $a$ equivalent to $b$ \\
-\hline \\
-1.  Zero $a$ (\textit{mp\_zero}). \\
-2.  $a_0 \leftarrow b \mbox{ (mod }\beta\mbox{)}$ \\
-3.  $a.used \leftarrow  \left \lbrace \begin{array}{ll}
-                              1 &  \mbox{if }a_0 > 0 \\
-                              0 &  \mbox{if }a_0 = 0 
-                              \end{array} \right .$ \\
-\hline                              
-\end{tabular}
-\end{center}
-\caption{Algorithm mp\_set}
-\end{figure}
-
-\textbf{Algorithm mp\_set.}
-This algorithm sets a mp\_int to a small single digit value.  Step number 1 ensures that the integer is reset to the default state.  The
-single digit is set (\textit{modulo $\beta$}) and the \textbf{used} count is adjusted accordingly.
-
-EXAM,bn_mp_set.c
-
-Line @21,mp_zero@ calls mp\_zero() to clear the mp\_int and reset the sign.  Line @22,MP_MASK@ copies the digit 
-into the least significant location.  Note the usage of a new constant \textbf{MP\_MASK}.  This constant is used to quickly
-reduce an integer modulo $\beta$.  Since $\beta$ is of the form $2^k$ for any suitable $k$ it suffices to perform a binary AND with 
-$MP\_MASK = 2^k - 1$ to perform the reduction.  Finally line @23,a->used@ will set the \textbf{used} member with respect to the 
-digit actually set. This function will always make the integer positive.
-
-One important limitation of this function is that it will only set one digit.  The size of a digit is not fixed, meaning source that uses 
-this function should take that into account.  As a result, only trivially small constants can be set using this function.
-
-\subsection{Setting Large Constants}
-To overcome the limitations of the mp\_set algorithm the mp\_set\_int algorithm is provided.  It accepts a ``long''
-data type as input and will always treat it as a 32-bit integer.
-
-\begin{figure}[here]
-\begin{center}
-\begin{tabular}{l}
-\hline Algorithm \textbf{mp\_set\_int}. \\
-\textbf{Input}.   An mp\_int $a$ and a ``long'' integer $b$ \\
-\textbf{Output}.  Make $a$ equivalent to $b$ \\
-\hline \\
-1.  Zero $a$ (\textit{mp\_zero}) \\
-2.  for $n$ from 0 to 7 do \\
-\hspace{3mm}2.1  $a \leftarrow a \cdot 16$ (\textit{mp\_mul2d}) \\
-\hspace{3mm}2.2  $u \leftarrow \lfloor b / 2^{4(7 - n)} \rfloor \mbox{ (mod }16\mbox{)}$\\
-\hspace{3mm}2.3  $a_0 \leftarrow a_0 + u$ \\
-\hspace{3mm}2.4  $a.used \leftarrow a.used + 1$ \\
-3.  Clamp excess used digits (\textit{mp\_clamp}) \\
-\hline
-\end{tabular}
-\end{center}
-\caption{Algorithm mp\_set\_int}
-\end{figure}
-
-\textbf{Algorithm mp\_set\_int.}
-The algorithm performs eight iterations of a simple loop where in each iteration four bits from the source are added to the 
-mp\_int.  Step 2.1 will multiply the current result by sixteen making room for four more bits in the less significant positions.  In step 2.2 the
-next four bits from the source are extracted and are added to the mp\_int. The \textbf{used} digit count is 
-incremented to reflect the addition.  The \textbf{used} digit counter is incremented since if any of the leading digits were zero the mp\_int would have
-zero digits used and the newly added four bits would be ignored.
-
-Excess zero digits are trimmed in steps 2.1 and 3 by using higher level algorithms mp\_mul2d and mp\_clamp.
-
-EXAM,bn_mp_set_int.c
-
-This function sets four bits of the number at a time to handle all practical \textbf{DIGIT\_BIT} sizes.  The weird
-addition on line @38,a->used@ ensures that the newly added in bits are added to the number of digits.  While it may not 
-seem obvious as to why the digit counter does not grow exceedingly large it is because of the shift on line @27,mp_mul_2d@ 
-as well as the  call to mp\_clamp() on line @40,mp_clamp@.  Both functions will clamp excess leading digits which keeps 
-the number of used digits low.
-
-\section{Comparisons}
-\subsection{Unsigned Comparisons}
-Comparing a multiple precision integer is performed with the exact same algorithm used to compare two decimal numbers.  For example,
-to compare $1,234$ to $1,264$ the digits are extracted by their positions.  That is we compare $1 \cdot 10^3 + 2 \cdot 10^2 + 3 \cdot 10^1 + 4 \cdot 10^0$
-to $1 \cdot 10^3 + 2 \cdot 10^2 + 6 \cdot 10^1 + 4 \cdot 10^0$ by comparing single digits at a time starting with the highest magnitude 
-positions.  If any leading digit of one integer is greater than a digit in the same position of another integer then obviously it must be greater.  
-
-The first comparison routine that will be developed is the unsigned magnitude compare which will perform a comparison based on the digits of two
-mp\_int variables alone.  It will ignore the sign of the two inputs.  Such a function is useful when an absolute comparison is required or if the 
-signs are known to agree in advance.
-
-To facilitate working with the results of the comparison functions three constants are required.  
-
-\begin{figure}[here]
-\begin{center}
-\begin{tabular}{|r|l|}
-\hline \textbf{Constant} & \textbf{Meaning} \\
-\hline \textbf{MP\_GT} & Greater Than \\
-\hline \textbf{MP\_EQ} & Equal To \\
-\hline \textbf{MP\_LT} & Less Than \\
-\hline
-\end{tabular}
-\end{center}
-\caption{Comparison Return Codes}
-\end{figure}
-
-\begin{figure}[here]
-\begin{center}
-\begin{tabular}{l}
-\hline Algorithm \textbf{mp\_cmp\_mag}. \\
-\textbf{Input}.   Two mp\_ints $a$ and $b$.  \\
-\textbf{Output}.  Unsigned comparison results ($a$ to the left of $b$). \\
-\hline \\
-1.  If $a.used > b.used$ then return(\textit{MP\_GT}) \\
-2.  If $a.used < b.used$ then return(\textit{MP\_LT}) \\
-3.  for n from $a.used - 1$ to 0 do \\
-\hspace{+3mm}3.1  if $a_n > b_n$ then return(\textit{MP\_GT}) \\
-\hspace{+3mm}3.2  if $a_n < b_n$ then return(\textit{MP\_LT}) \\
-4.  Return(\textit{MP\_EQ}) \\
-\hline
-\end{tabular}
-\end{center}
-\caption{Algorithm mp\_cmp\_mag}
-\end{figure}
-
-\textbf{Algorithm mp\_cmp\_mag.}
-By saying ``$a$ to the left of $b$'' it is meant that the comparison is with respect to $a$, that is if $a$ is greater than $b$ it will return
-\textbf{MP\_GT} and similar with respect to when $a = b$ and $a < b$.  The first two steps compare the number of digits used in both $a$ and $b$.  
-Obviously if the digit counts differ there would be an imaginary zero digit in the smaller number where the leading digit of the larger number is.  
-If both have the same number of digits then the actual digits themselves must be compared starting at the leading digit.  
-
-By step three both inputs must have the same number of digits so it is safe to start from either $a.used - 1$ or $b.used - 1$ and count down to
-the zero'th digit.  If after all of the digits have been compared, no difference is found, the algorithm returns \textbf{MP\_EQ}.
-
-EXAM,bn_mp_cmp_mag.c
-
-The two if statements on lines @24,if@ and @28,if@ compare the number of digits in the two inputs.  These two are performed before all of the digits
-are compared since it is a very cheap test to perform and can potentially save considerable time.  The implementation given is also not valid 
-without those two statements.  $b.alloc$ may be smaller than $a.used$, meaning that undefined values will be read from $b$ past the end of the 
-array of digits.
-
-\subsection{Signed Comparisons}
-Comparing with sign considerations is also fairly critical in several routines (\textit{division for example}).  Based on an unsigned magnitude 
-comparison a trivial signed comparison algorithm can be written.
-
-\begin{figure}[here]
-\begin{center}
-\begin{tabular}{l}
-\hline Algorithm \textbf{mp\_cmp}. \\
-\textbf{Input}.   Two mp\_ints $a$ and $b$ \\
-\textbf{Output}.  Signed Comparison Results ($a$ to the left of $b$) \\
-\hline \\
-1.  if $a.sign = MP\_NEG$ and $b.sign = MP\_ZPOS$ then return(\textit{MP\_LT}) \\
-2.  if $a.sign = MP\_ZPOS$ and $b.sign = MP\_NEG$ then return(\textit{MP\_GT}) \\
-3.  if $a.sign = MP\_NEG$ then \\
-\hspace{+3mm}3.1  Return the unsigned comparison of $b$ and $a$ (\textit{mp\_cmp\_mag}) \\
-4.  Otherwise \\
-\hspace{+3mm}4.1  Return the unsigned comparison of $a$ and $b$ \\
-\hline
-\end{tabular}
-\end{center}
-\caption{Algorithm mp\_cmp}
-\end{figure}
-
-\textbf{Algorithm mp\_cmp.}
-The first two steps compare the signs of the two inputs.  If the signs do not agree then it can return right away with the appropriate 
-comparison code.  When the signs are equal the digits of the inputs must be compared to determine the correct result.  In step 
-three the unsigned comparison flips the order of the arguments since they are both negative.  For instance, if $-a > -b$ then 
-$\vert a \vert < \vert b \vert$.  Step number four will compare the two when they are both positive.
-
-EXAM,bn_mp_cmp.c
-
-The two if statements on lines @22,if@ and @26,if@ perform the initial sign comparison.  If the signs are not equal then whichever
-has the positive sign is larger.   At line @30,if@, the inputs are compared based on magnitudes.  If the signs were both negative then 
-the unsigned comparison is performed in the opposite direction (\textit{line @31,mp_cmp_mag@}).  Otherwise, the signs are assumed to 
-be both positive and a forward direction unsigned comparison is performed.
-
-\section*{Exercises}
-\begin{tabular}{cl}
-$\left [ 2 \right ]$ & Modify algorithm mp\_set\_int to accept as input a variable length array of bits. \\
-                     & \\
-$\left [ 3 \right ]$ & Give the probability that algorithm mp\_cmp\_mag will have to compare $k$ digits  \\
-                     & of two random integers (of equal magnitude) before a difference is found. \\
-                     & \\
-$\left [ 1 \right ]$ & Suggest a simple method to speed up the implementation of mp\_cmp\_mag based  \\
-                     & on the observations made in the previous problem. \\
-                     &
-\end{tabular}
-
-\chapter{Basic Arithmetic}
-\section{Building Blocks}
-At this point algorithms for initialization, clearing, zeroing, copying, comparing and setting small constants have been 
-established.  The next logical set of algorithms to develop are addition, subtraction and digit shifting algorithms.  These 
-algorithms make use of the lower level algorithms and are the crucial building blocks for the multiplication algorithms.  It is very important 
-that these algorithms are highly optimized.  On their own they are simple $O(n)$ algorithms but they can be called from higher level algorithms 
-which easily places them at $O(n^2)$ or even $O(n^3)$ work levels.  
-
-MARK,SHIFTS
-All nine algorithms within this chapter make use of the logical bit shift operations denoted by $<<$ and $>>$ for left and right 
-logical shifts respectively.  A logical shift is analogous to sliding the decimal point of radix-10 representations.  For example, the real 
-number $0.9345$ is equivalent to $93.45\%$ which is found by sliding the decimal point two places to the right (\textit{multiplying by $10^2$}).  
-Mathematically a logical shift is equivalent to a division or multiplication by a power of two.  
-For example, $a << k = a \cdot 2^k$ while $a >> k = \lfloor a/2^k \rfloor$.
-
-One significant difference between a logical shift and the way decimals are shifted is that digits below the zero'th position are removed
-from the number.  For example, consider $1101_2 >> 1$; using decimal notation this would produce $110.1_2$.  However, with a logical shift the 
-result is $110_2$.  
-
-\section{Addition and Subtraction}
-In normal fixed precision arithmetic negative numbers are easily represented by subtraction from the modulus.  For example, with 32-bit integers
-$a - b\mbox{ (mod }2^{32}\mbox{)}$ is the same as $a + (2^{32} - b) \mbox{ (mod }2^{32}\mbox{)}$  since $2^{32} \equiv 0 \mbox{ (mod }2^{32}\mbox{)}$.  
-As a result subtraction can be performed with a trivial series of logical operations and an addition.
-
-However, in multiple precision arithmetic negative numbers are not represented in the same way.  Instead a sign flag is used to keep track of the
-sign of the integer.  As a result signed addition and subtraction are actually implemented as conditional usage of lower level addition or 
-subtraction algorithms with the sign fixed up appropriately.
-
-The lower level algorithms will add or subtract integers without regard to the sign flag.  That is they will add or subtract the magnitude of
-the integers respectively.
-
-\subsection{Low Level Addition}
-An unsigned addition of multiple precision integers is performed with the same long-hand algorithm used to add decimal numbers.  That is to add the 
-trailing digits first and propagate the resulting carry upwards.  Since this is a lower level algorithm the name will have a ``s\_'' prefix.  
-Historically that convention stems from the MPI library where ``s\_'' stood for static functions that were hidden from the developer entirely.
-
-\newpage
-\begin{figure}[!here]
-\begin{center}
-\begin{small}
-\begin{tabular}{l}
-\hline Algorithm \textbf{s\_mp\_add}. \\
-\textbf{Input}.   Two mp\_ints $a$ and $b$ \\
-\textbf{Output}.  The unsigned addition $c = \vert a \vert + \vert b \vert$. \\
-\hline \\
-1.  if $a.used > b.used$ then \\
-\hspace{+3mm}1.1  $min \leftarrow b.used$ \\
-\hspace{+3mm}1.2  $max \leftarrow a.used$ \\
-\hspace{+3mm}1.3  $x   \leftarrow a$ \\
-2.  else  \\
-\hspace{+3mm}2.1  $min \leftarrow a.used$ \\
-\hspace{+3mm}2.2  $max \leftarrow b.used$ \\
-\hspace{+3mm}2.3  $x   \leftarrow b$ \\
-3.  If $c.alloc < max + 1$ then grow $c$ to hold at least $max + 1$ digits (\textit{mp\_grow}) \\
-4.  If failed to grow $c$ return(\textit{MP\_MEM}) \\
-5.  $oldused \leftarrow c.used$ \\
-6.  $c.used \leftarrow max + 1$ \\
-7.  $u \leftarrow 0$ \\
-8.  for $n$ from $0$ to $min - 1$ do \\
-\hspace{+3mm}8.1  $c_n \leftarrow a_n + b_n + u$ \\
-\hspace{+3mm}8.2  $u \leftarrow c_n >> lg(\beta)$ \\
-\hspace{+3mm}8.3  $c_n \leftarrow c_n \mbox{ (mod }\beta\mbox{)}$ \\
-9.  if $min \ne max$ then do \\
-\hspace{+3mm}9.1  for $n$ from $min$ to $max - 1$ do \\
-\hspace{+6mm}9.1.1  $c_n \leftarrow x_n + u$ \\
-\hspace{+6mm}9.1.2  $u \leftarrow c_n >> lg(\beta)$ \\
-\hspace{+6mm}9.1.3  $c_n \leftarrow c_n \mbox{ (mod }\beta\mbox{)}$ \\
-10.  $c_{max} \leftarrow u$ \\
-11.  if $oldused > max$ then \\
-\hspace{+3mm}11.1  for $n$ from $max + 1$ to $oldused - 1$ do \\
-\hspace{+6mm}11.1.1  $c_n \leftarrow 0$ \\
-12.  Clamp excess digits in $c$.  (\textit{mp\_clamp}) \\
-13.  Return(\textit{MP\_OKAY}) \\
-\hline
-\end{tabular}
-\end{small}
-\end{center}
-\caption{Algorithm s\_mp\_add}
-\end{figure}
-
-\textbf{Algorithm s\_mp\_add.}
-This algorithm is loosely based on algorithm 14.7 of HAC \cite[pp. 594]{HAC} but has been extended to allow the inputs to have different magnitudes.  
-Coincidentally the description of algorithm A in Knuth \cite[pp. 266]{TAOCPV2} shares the same deficiency as the algorithm from \cite{HAC}.  Even the 
-MIX pseudo  machine code presented by Knuth \cite[pp. 266-267]{TAOCPV2} is incapable of handling inputs which are of different magnitudes.
-
-Steps 1 and 2 will sort the two inputs based on their \textbf{used} digit count.  This allows the inputs to have varying magnitudes which not 
-only makes it more efficient than the trivial algorithm presented in the references but more flexible.  The variable $min$ is given the lowest 
-digit count while $max$ is given the highest digit count.  If both inputs have the same \textbf{used} digit count both $min$ and $max$ are 
-set to the same value.  The variable $x$ is an \textit{alias} for the largest input and not meant to be a copy of it.  After the inputs are sorted, 
-steps 3 and 4 will ensure that the destination $c$ can accommodate the result.  The old \textbf{used} count from $c$ is copied to 
-$oldused$ so that excess digits can be cleared later, and the new \textbf{used} count is set to $max+1$, so that a carry from the most significant 
-word can be handled.
-
-At step 7 the carry variable $u$ is set to zero and the first part of the addition loop can begin.  The first step of the loop (\textit{8.1}) adds
-digits from the two inputs together along with the carry variable $u$.  The following step extracts the carry bit by shifting the result of the
-preceding step right by $lg(\beta)$ positions.  The shift to extract the carry is similar to how carry extraction works with decimal addition.
-
-Consider adding $77$ to $65$, the first addition of the first column is $7 + 5$ which produces the result $12$.  The trailing digit of the result
-is $2 \equiv 12 \mbox{ (mod }10\mbox{)}$ and the carry is found by dividing (\textit{and ignoring the remainder}) $12$ by the radix or in this case $10$.  The
-division and multiplication of $10$ is simply a logical right or left shift, respectively, of the digits.  In other words the carry can be extracted
-by shifting one digit to the right.
-
-Note that $lg()$ is simply the base two logarithm such that $lg(2^k) = k$.  This implies that $lg(\beta)$ is the number of bits in a radix-$\beta$ 
-digit.  Therefore, a logical shift right of the summand by $lg(\beta)$ will extract the carry.  The final step of the loop reduces the digit 
-modulo the radix $\beta$ to ensure it is in range.
-
-After step 8 the smallest input (\textit{or both if they are the same magnitude}) has been exhausted.  Step 9 decides whether
-the inputs were of equal magnitude.  If not then another loop, similar to that in step 8, must be executed.  The loop at step
-number 9.1 differs from the previous loop since it only adds the mp\_int $x$ along with the carry.  
-
-Step 10 finishes the addition phase by copying the final carry to the highest location in the result $c_{max}$.  Step 11 ensures that 
-leading digits that were originally present in $c$ are cleared.  Finally excess leading digits are clamped and the algorithm returns success.
-
-EXAM,bn_s_mp_add.c
-
-Lines @27,if@ to @35,}@ perform the initial sorting of the inputs and determine the $min$ and $max$ variables.  Note that $x$ is a pointer to a 
-mp\_int assigned to the largest input, in effect it is a local alias.  Lines @37,init@ to @42,}@ ensure that the destination is grown to 
-accommodate the result of the addition. 
-
-Similar to the implementation of mp\_copy this function uses the braced code and local aliases coding style.  The three aliases that are on 
-lines @56,tmpa@, @59,tmpb@ and @62,tmpc@ represent the two inputs and destination variables respectively.  These aliases are used to ensure the
-compiler does not have to dereference $a$, $b$ or $c$ (respectively) to access the digits of the respective mp\_int.
-
-The initial carry $u$ is cleared on line @65,u = 0@, note that $u$ is of type mp\_digit which ensures type compatibility within the 
-implementation.  The initial addition loop begins on line @66,for@ and ends on line @75,}@.  Similarly the conditional addition loop
-begins on line @81,for@ and ends on line @90,}@.  The addition is finished with the final carry being stored in $tmpc$ on line @94,tmpc++@.  
-Note the ``++'' operator on the same line.  After line @94,tmpc++@ $tmpc$ will point to the $c.used$'th digit of the mp\_int $c$.  This is useful
-for the next loop on lines @97,for@ to @99,}@ which set any old upper digits to zero.
-
-\subsection{Low Level Subtraction}
-The low level unsigned subtraction algorithm is very similar to the low level unsigned addition algorithm.  The principal difference is that the
-unsigned subtraction algorithm requires the result to be positive.  That is when computing $a - b$ the condition $\vert a \vert \ge \vert b\vert$ must 
-be met for this algorithm to function properly.  Keep in mind this low level algorithm is not meant to be used in higher level algorithms directly.  
-This algorithm as will be shown can be used to create functional signed addition and subtraction algorithms.
-
-MARK,GAMMA
-
-For this algorithm a new variable is required to make the description simpler.  Recall from section 1.3.1 that a mp\_digit must be able to represent
-the range $0 \le x < 2\beta$ for the algorithms to work correctly.  However, it is allowable that a mp\_digit represent a larger range of values.  For 
-this algorithm we will assume that the variable $\gamma$ represents the number of bits available in a 
-mp\_digit (\textit{this implies $2^{\gamma} > \beta$}).  
-
-For example, the default for LibTomMath is to use a ``unsigned long'' for the mp\_digit ``type'' while $\beta = 2^{28}$.  In ISO C an ``unsigned long''
-data type must be able to represent $0 \le x < 2^{32}$ meaning that in this case $\gamma = 32$.
-
-\newpage\begin{figure}[!here]
-\begin{center}
-\begin{small}
-\begin{tabular}{l}
-\hline Algorithm \textbf{s\_mp\_sub}. \\
-\textbf{Input}.   Two mp\_ints $a$ and $b$ ($\vert a \vert \ge \vert b \vert$) \\
-\textbf{Output}.  The unsigned subtraction $c = \vert a \vert - \vert b \vert$. \\
-\hline \\
-1.  $min \leftarrow b.used$ \\
-2.  $max \leftarrow a.used$ \\
-3.  If $c.alloc < max$ then grow $c$ to hold at least $max$ digits.  (\textit{mp\_grow}) \\
-4.  If the reallocation failed return(\textit{MP\_MEM}). \\
-5.  $oldused \leftarrow c.used$ \\ 
-6.  $c.used \leftarrow max$ \\
-7.  $u \leftarrow 0$ \\
-8.  for $n$ from $0$ to $min - 1$ do \\
-\hspace{3mm}8.1  $c_n \leftarrow a_n - b_n - u$ \\
-\hspace{3mm}8.2  $u   \leftarrow c_n >> (\gamma - 1)$ \\
-\hspace{3mm}8.3  $c_n \leftarrow c_n \mbox{ (mod }\beta\mbox{)}$ \\
-9.  if $min < max$ then do \\
-\hspace{3mm}9.1  for $n$ from $min$ to $max - 1$ do \\
-\hspace{6mm}9.1.1  $c_n \leftarrow a_n - u$ \\
-\hspace{6mm}9.1.2  $u   \leftarrow c_n >> (\gamma - 1)$ \\
-\hspace{6mm}9.1.3  $c_n \leftarrow c_n \mbox{ (mod }\beta\mbox{)}$ \\
-10. if $oldused > max$ then do \\
-\hspace{3mm}10.1  for $n$ from $max$ to $oldused - 1$ do \\
-\hspace{6mm}10.1.1  $c_n \leftarrow 0$ \\
-11. Clamp excess digits of $c$.  (\textit{mp\_clamp}). \\
-12. Return(\textit{MP\_OKAY}). \\
-\hline
-\end{tabular}
-\end{small}
-\end{center}
-\caption{Algorithm s\_mp\_sub}
-\end{figure}
-
-\textbf{Algorithm s\_mp\_sub.}
-This algorithm performs the unsigned subtraction of two mp\_int variables under the restriction that the result must be positive.  That is when
-passing variables $a$ and $b$ the condition that $\vert a \vert \ge \vert b \vert$ must be met for the algorithm to function correctly.  This
-algorithm is loosely based on algorithm 14.9 \cite[pp. 595]{HAC} and is similar to algorithm S in \cite[pp. 267]{TAOCPV2} as well.  As was the case
-of the algorithm s\_mp\_add both other references lack discussion concerning various practical details such as when the inputs differ in magnitude.
-
-The initial sorting of the inputs is trivial in this algorithm since $a$ is guaranteed to have at least the same magnitude of $b$.  Steps 1 and 2 
-set the $min$ and $max$ variables.  Unlike the addition routine there is guaranteed to be no carry which means that the final result can be at 
-most $max$ digits in length as opposed to $max + 1$.  Similar to the addition algorithm the \textbf{used} count of $c$ is copied locally and 
-set to the maximal count for the operation.
-
-The subtraction loop that begins on step 8 is essentially the same as the addition loop of algorithm s\_mp\_add except single precision 
-subtraction is used instead.  Note the use of the $\gamma$ variable to extract the carry (\textit{also known as the borrow}) within the subtraction 
-loops.  Under the assumption that two's complement single precision arithmetic is used this will successfully extract the desired carry.  
-
-For example, consider subtracting $0101_2$ from $0100_2$ where $\gamma = 4$ and $\beta = 2$.  The least significant bit will force a carry upwards to 
-the third bit which will be set to zero after the borrow.  After the very first bit has been subtracted $4 - 1 \equiv 0011_2$ will remain.  When the 
-third bit of $0101_2$ is subtracted from the result it will cause another carry.  In this case though the carry will be forced to propagate all the 
-way to the most significant bit.  
-
-Recall that $\beta < 2^{\gamma}$.  This means that if a carry does occur just before the $lg(\beta)$'th bit it will propagate all the way to the most 
-significant bit.  Thus, the high order bits of the mp\_digit that are not part of the actual digit will either be all zero, or all one. All that
-is needed is a single zero or one bit for the carry.  Therefore a single logical shift right by $\gamma - 1$ positions is sufficient to extract the 
-carry.  This method of carry extraction may seem awkward but the reason for it becomes apparent when the implementation is discussed.  
-
-If $b$ has a smaller magnitude than $a$ then step 9 will force the carry and copy operation to propagate through the larger input $a$ into $c$.  Step
-10 will ensure that any leading digits of $c$ above the $max$'th position are zeroed.
-
-EXAM,bn_s_mp_sub.c
-
-Lines @24,min@ and @25,max@ perform the initial hardcoded sorting of the inputs.  In reality the $min$ and $max$ variables are only aliases and are only 
-used to make the source code easier to read.  Again the pointer alias optimization is used within this algorithm.  Lines @42,tmpa@, @43,tmpb@ and @44,tmpc@ initialize the aliases for 
-$a$, $b$ and $c$ respectively.
-
-The first subtraction loop occurs on lines @47,u = 0@ through @61,}@.  The theory behind the subtraction loop is exactly the same as that for
-the addition loop.  As remarked earlier there is an implementation reason for using the ``awkward'' method of extracting the carry 
-(\textit{see line @57, >>@}).  The traditional method for extracting the carry would be to shift by $lg(\beta)$ positions and logically AND 
-the least significant bit.  The AND operation is required because all of the bits above the $\lg(\beta)$'th bit will be set to one after a carry
-occurs from subtraction.  This carry extraction requires two relatively cheap operations to extract the carry.  The other method is to simply 
-shift the most significant bit to the least significant bit thus extracting the carry with a single cheap operation.  This optimization only works on
-twos complement machines which is a safe assumption to make.
-
-If $a$ has a larger magnitude than $b$ an additional loop (\textit{see lines @64,for@ through @73,}@}) is required to propagate the carry through
-$a$ and copy the result to $c$.  
-
-\subsection{High Level Addition}
-Now that both lower level addition and subtraction algorithms have been established an effective high level signed addition algorithm can be
-established.  This high level addition algorithm will be what other algorithms and developers will use to perform addition of mp\_int data 
-types.  
-
-Recall from section 5.2 that an mp\_int represents an integer with an unsigned mantissa (\textit{the array of digits}) and a \textbf{sign} 
-flag.  A high level addition is actually performed as a series of eight separate cases which can be optimized down to three unique cases.
-
-\begin{figure}[!here]
-\begin{center}
-\begin{tabular}{l}
-\hline Algorithm \textbf{mp\_add}. \\
-\textbf{Input}.   Two mp\_ints $a$ and $b$  \\
-\textbf{Output}.  The signed addition $c = a + b$. \\
-\hline \\
-1.  if $a.sign = b.sign$ then do \\
-\hspace{3mm}1.1  $c.sign \leftarrow a.sign$  \\
-\hspace{3mm}1.2  $c \leftarrow \vert a \vert + \vert b \vert$ (\textit{s\_mp\_add})\\
-2.  else do \\
-\hspace{3mm}2.1  if $\vert a \vert < \vert b \vert$ then do (\textit{mp\_cmp\_mag})  \\
-\hspace{6mm}2.1.1  $c.sign \leftarrow b.sign$ \\
-\hspace{6mm}2.1.2  $c \leftarrow \vert b \vert - \vert a \vert$ (\textit{s\_mp\_sub}) \\
-\hspace{3mm}2.2  else do \\
-\hspace{6mm}2.2.1  $c.sign \leftarrow a.sign$ \\
-\hspace{6mm}2.2.2  $c \leftarrow \vert a \vert - \vert b \vert$ \\
-3.  If any of the lower level operations failed return(\textit{MP\_MEM}) \\
-4.  Return(\textit{MP\_OKAY}). \\
-\hline
-\end{tabular}
-\end{center}
-\caption{Algorithm mp\_add}
-\end{figure}
-
-\textbf{Algorithm mp\_add.}
-This algorithm performs the signed addition of two mp\_int variables.  There is no reference algorithm to draw upon from either \cite{TAOCPV2} or 
-\cite{HAC} since they both only provide unsigned operations.  The algorithm is fairly straightforward but restricted since subtraction can only 
-produce positive results.
-
-\begin{figure}[here]
-\begin{small}
-\begin{center}
-\begin{tabular}{|c|c|c|c|c|}
-\hline \textbf{Sign of $a$} & \textbf{Sign of $b$} & \textbf{$\vert a \vert > \vert b \vert $} & \textbf{Unsigned Operation} & \textbf{Result Sign Flag} \\
-\hline $+$ & $+$ & Yes & $c = a + b$ & $a.sign$ \\
-\hline $+$ & $+$ & No  & $c = a + b$ & $a.sign$ \\
-\hline $-$ & $-$ & Yes & $c = a + b$ & $a.sign$ \\
-\hline $-$ & $-$ & No  & $c = a + b$ & $a.sign$ \\
-\hline &&&&\\
-
-\hline $+$ & $-$ & No  & $c = b - a$ & $b.sign$ \\
-\hline $-$ & $+$ & No  & $c = b - a$ & $b.sign$ \\
-
-\hline &&&&\\
-
-\hline $+$ & $-$ & Yes & $c = a - b$ & $a.sign$ \\
-\hline $-$ & $+$ & Yes & $c = a - b$ & $a.sign$ \\
-
-\hline
-\end{tabular}
-\end{center}
-\end{small}
-\caption{Addition Guide Chart}
-\label{fig:AddChart}
-\end{figure}
-
-Figure~\ref{fig:AddChart} lists all of the eight possible input combinations and is sorted to show that only three specific cases need to be handled.  The 
-return code of the unsigned operations at step 1.2, 2.1.2 and 2.2.2 are forwarded to step 3 to check for errors.  This simplifies the description
-of the algorithm considerably and best follows how the implementation actually was achieved.
-
-Also note how the \textbf{sign} is set before the unsigned addition or subtraction is performed.  Recall from the descriptions of algorithms
-s\_mp\_add and s\_mp\_sub that the mp\_clamp function is used at the end to trim excess digits.  The mp\_clamp algorithm will set the \textbf{sign}
-to \textbf{MP\_ZPOS} when the \textbf{used} digit count reaches zero.  
-
-For example, consider performing $-a + a$ with algorithm mp\_add.  By the description of the algorithm the sign is set to \textbf{MP\_NEG} which would
-produce a result of $-0$.  However, since the sign is set first and the unsigned subtraction is performed afterwards, the subsequent usage of algorithm mp\_clamp 
-within algorithm s\_mp\_sub will force $-0$ to become $0$.  
-
-EXAM,bn_mp_add.c
-
-The source code follows the algorithm fairly closely.  The most notable new source code addition is the usage of the $res$ integer variable which
-is used to pass the result of the unsigned operations forward.  Unlike in the algorithm, the variable $res$ is merely returned as is without
-explicitly checking it and returning the constant \textbf{MP\_OKAY}.  The observation is that this algorithm will succeed or fail only if the lower
-level functions do so.  Returning their return code is sufficient.
-
-\subsection{High Level Subtraction}
-The high level signed subtraction algorithm is essentially the same as the high level signed addition algorithm.  
-
-\newpage\begin{figure}[!here]
-\begin{center}
-\begin{tabular}{l}
-\hline Algorithm \textbf{mp\_sub}. \\
-\textbf{Input}.   Two mp\_ints $a$ and $b$  \\
-\textbf{Output}.  The signed subtraction $c = a - b$. \\
-\hline \\
-1.  if $a.sign \ne b.sign$ then do \\
-\hspace{3mm}1.1  $c.sign \leftarrow a.sign$ \\
-\hspace{3mm}1.2  $c \leftarrow \vert a \vert + \vert b \vert$ (\textit{s\_mp\_add}) \\
-2.  else do \\
-\hspace{3mm}2.1  if $\vert a \vert \ge \vert b \vert$ then do (\textit{mp\_cmp\_mag}) \\
-\hspace{6mm}2.1.1  $c.sign \leftarrow a.sign$ \\
-\hspace{6mm}2.1.2  $c \leftarrow \vert a \vert  - \vert b \vert$ (\textit{s\_mp\_sub}) \\
-\hspace{3mm}2.2  else do \\
-\hspace{6mm}2.2.1  $c.sign \leftarrow  \left \lbrace \begin{array}{ll}
-                              MP\_ZPOS &  \mbox{if }a.sign = MP\_NEG \\
-                              MP\_NEG  &  \mbox{otherwise} \\
-                              \end{array} \right .$ \\
-\hspace{6mm}2.2.2  $c \leftarrow \vert b \vert  - \vert a \vert$ \\
-3.  If any of the lower level operations failed return(\textit{MP\_MEM}). \\
-4.  Return(\textit{MP\_OKAY}). \\
-\hline
-\end{tabular}
-\end{center}
-\caption{Algorithm mp\_sub}
-\end{figure}
-
-\textbf{Algorithm mp\_sub.}
-This algorithm performs the signed subtraction of two inputs.  Similar to algorithm mp\_add there is no reference in either \cite{TAOCPV2} or 
-\cite{HAC}.  Also this algorithm is restricted by algorithm s\_mp\_sub.  The following chart lists the eight possible inputs and
-the operations required.
-
-\begin{figure}[!here]
-\begin{small}
-\begin{center}
-\begin{tabular}{|c|c|c|c|c|}
-\hline \textbf{Sign of $a$} & \textbf{Sign of $b$} & \textbf{$\vert a \vert \ge \vert b \vert $} & \textbf{Unsigned Operation} & \textbf{Result Sign Flag} \\
-\hline $+$ & $-$ & Yes & $c = a + b$ & $a.sign$ \\
-\hline $+$ & $-$ & No  & $c = a + b$ & $a.sign$ \\
-\hline $-$ & $+$ & Yes & $c = a + b$ & $a.sign$ \\
-\hline $-$ & $+$ & No  & $c = a + b$ & $a.sign$ \\
-\hline &&&& \\
-\hline $+$ & $+$ & Yes & $c = a - b$ & $a.sign$ \\
-\hline $-$ & $-$ & Yes & $c = a - b$ & $a.sign$ \\
-\hline &&&& \\
-\hline $+$ & $+$ & No  & $c = b - a$ & $\mbox{opposite of }a.sign$ \\
-\hline $-$ & $-$ & No  & $c = b - a$ & $\mbox{opposite of }a.sign$ \\
-\hline
-\end{tabular}
-\end{center}
-\end{small}
-\caption{Subtraction Guide Chart}
-\end{figure}
-
-Similar to the case of algorithm mp\_add the \textbf{sign} is set first before the unsigned addition or subtraction.  That is to prevent the 
-algorithm from producing $-a - -a = -0$ as a result.  
-
-EXAM,bn_mp_sub.c
-
-Much like the implementation of algorithm mp\_add the variable $res$ is used to catch the return code of the unsigned addition or subtraction operations
-and forward it to the end of the function.  On line @38, != MP_LT@ the ``not equal to'' \textbf{MP\_LT} expression is used to emulate a 
-``greater than or equal to'' comparison.  
-
-\section{Bit and Digit Shifting}
-MARK,POLY
-It is quite common to think of a multiple precision integer as a polynomial in $x$, that is $y = f(\beta)$ where $f(x) = \sum_{i=0}^{n-1} a_i x^i$.  
-This notation arises within discussion of Montgomery and Diminished Radix Reduction as well as Karatsuba multiplication and squaring.  
-
-In order to facilitate operations on polynomials in $x$ as above a series of simple ``digit'' algorithms have to be established.  That is to shift
-the digits left or right as well as to shift individual bits of the digits left and right.  It is important to note that not all ``shift'' operations
-are on radix-$\beta$ digits.  
-
-\subsection{Multiplication by Two}
-
-In a binary system where the radix is a power of two, multiplication by two not only arises often in other algorithms, it is also a fairly efficient 
-operation to perform.  A single precision logical shift left is sufficient to multiply a single digit by two.  
-
-\newpage\begin{figure}[!here]
-\begin{small}
-\begin{center}
-\begin{tabular}{l}
-\hline Algorithm \textbf{mp\_mul\_2}. \\
-\textbf{Input}.   One mp\_int $a$ \\
-\textbf{Output}.  $b = 2a$. \\
-\hline \\
-1.  If $b.alloc < a.used + 1$ then grow $b$ to hold $a.used + 1$ digits.  (\textit{mp\_grow}) \\
-2.  If the reallocation failed return(\textit{MP\_MEM}). \\
-3.  $oldused \leftarrow b.used$ \\
-4.  $b.used \leftarrow a.used$ \\
-5.  $r \leftarrow 0$ \\
-6.  for $n$ from 0 to $a.used - 1$ do \\
-\hspace{3mm}6.1  $rr \leftarrow a_n >> (lg(\beta) - 1)$ \\
-\hspace{3mm}6.2  $b_n \leftarrow (a_n << 1) + r \mbox{ (mod }\beta\mbox{)}$ \\
-\hspace{3mm}6.3  $r \leftarrow rr$ \\
-7.  If $r \ne 0$ then do \\
-\hspace{3mm}7.1  $b_{n + 1} \leftarrow r$ \\
-\hspace{3mm}7.2  $b.used \leftarrow b.used + 1$ \\
-8.  If $b.used < oldused - 1$ then do \\
-\hspace{3mm}8.1  for $n$ from $b.used$ to $oldused - 1$ do \\
-\hspace{6mm}8.1.1  $b_n \leftarrow 0$ \\
-9.  $b.sign \leftarrow a.sign$ \\
-10.  Return(\textit{MP\_OKAY}).\\
-\hline
-\end{tabular}
-\end{center}
-\end{small}
-\caption{Algorithm mp\_mul\_2}
-\end{figure}
-
-\textbf{Algorithm mp\_mul\_2.}
-This algorithm will quickly multiply a mp\_int by two provided $\beta$ is a power of two.  Neither \cite{TAOCPV2} nor \cite{HAC} describe such 
-an algorithm despite the fact it arises often in other algorithms.  The algorithm is setup much like the lower level algorithm s\_mp\_add since 
-it is for all intents and purposes equivalent to the operation $b = \vert a \vert + \vert a \vert$.  
-
-Steps 1 and 2 grow the input as required to accommodate the maximum number of \textbf{used} digits in the result.  The initial \textbf{used} count
-is set to $a.used$ at step 4.  Only if there is a final carry will the \textbf{used} count require adjustment.
-
-Step 6 is an optimized implementation of the addition loop for this specific case.  That is since the two values being added together 
-are the same there is no need to perform two reads from the digits of $a$.  Step 6.1 performs a single precision shift on the current digit $a_n$ to
-obtain what will be the carry for the next iteration.  Step 6.2 calculates the $n$'th digit of the result as single precision shift of $a_n$ plus
-the previous carry.  Recall from ~SHIFTS~ that $a_n << 1$ is equivalent to $a_n \cdot 2$.  An iteration of the addition loop is finished with 
-forwarding the carry to the next iteration.
-
-Step 7 takes care of any final carry by setting the $a.used$'th digit of the result to the carry and augmenting the \textbf{used} count of $b$.  
-Step 8 clears any leading digits of $b$ in case it originally had a larger magnitude than $a$.
-
-EXAM,bn_mp_mul_2.c
-
-This implementation is essentially an optimized implementation of s\_mp\_add for the case of doubling an input.  The only noteworthy difference
-is the use of the logical shift operator on line @52,<<@ to perform a single precision doubling.  
-
-\subsection{Division by Two}
-A division by two can just as easily be accomplished with a logical shift right as multiplication by two can be with a logical shift left.
-
-\newpage\begin{figure}[!here]
-\begin{small}
-\begin{center}
-\begin{tabular}{l}
-\hline Algorithm \textbf{mp\_div\_2}. \\
-\textbf{Input}.   One mp\_int $a$ \\
-\textbf{Output}.  $b = a/2$. \\
-\hline \\
-1.  If $b.alloc < a.used$ then grow $b$ to hold $a.used$ digits.  (\textit{mp\_grow}) \\
-2.  If the reallocation failed return(\textit{MP\_MEM}). \\
-3.  $oldused \leftarrow b.used$ \\
-4.  $b.used \leftarrow a.used$ \\
-5.  $r \leftarrow 0$ \\
-6.  for $n$ from $b.used - 1$ to $0$ do \\
-\hspace{3mm}6.1  $rr \leftarrow a_n \mbox{ (mod }2\mbox{)}$\\
-\hspace{3mm}6.2  $b_n \leftarrow (a_n >> 1) + (r << (lg(\beta) - 1)) \mbox{ (mod }\beta\mbox{)}$ \\
-\hspace{3mm}6.3  $r \leftarrow rr$ \\
-7.  If $b.used < oldused - 1$ then do \\
-\hspace{3mm}7.1  for $n$ from $b.used$ to $oldused - 1$ do \\
-\hspace{6mm}7.1.1  $b_n \leftarrow 0$ \\
-8.  $b.sign \leftarrow a.sign$ \\
-9.  Clamp excess digits of $b$.  (\textit{mp\_clamp}) \\
-10.  Return(\textit{MP\_OKAY}).\\
-\hline
-\end{tabular}
-\end{center}
-\end{small}
-\caption{Algorithm mp\_div\_2}
-\end{figure}
-
-\textbf{Algorithm mp\_div\_2.}
-This algorithm will divide an mp\_int by two using logical shifts to the right.  Like mp\_mul\_2 it uses a modified low level addition
-core as the basis of the algorithm.  Unlike mp\_mul\_2 the shift operations work from the leading digit to the trailing digit.  The algorithm
-could be written to work from the trailing digit to the leading digit however, it would have to stop one short of $a.used - 1$ digits to prevent
-reading past the end of the array of digits.
-
-Essentially the loop at step 6 is similar to that of mp\_mul\_2 except the logical shifts go in the opposite direction and the carry is at the 
-least significant bit not the most significant bit.  
-
-EXAM,bn_mp_div_2.c
-
-\section{Polynomial Basis Operations}
-Recall from ~POLY~ that any integer can be represented as a polynomial in $x$ as $y = f(\beta)$.  Such a representation is also known as
-the polynomial basis \cite[pp. 48]{ROSE}. Given such a notation a multiplication or division by $x$ amounts to shifting whole digits a single 
-place.  The need for such operations arises in several other higher level algorithms such as Barrett and Montgomery reduction, integer
-division and Karatsuba multiplication.  
-
-Converting from an array of digits to polynomial basis is very simple.  Consider the integer $y \equiv (a_2, a_1, a_0)_{\beta}$ and recall that
-$y = \sum_{i=0}^{2} a_i \beta^i$.  Simply replace $\beta$ with $x$ and the expression is in polynomial basis.  For example, $f(x) = 8x + 9$ is the
-polynomial basis representation for $89$ using radix ten.  That is, $f(10) = 8(10) + 9 = 89$.  
-
-\subsection{Multiplication by $x$}
-
-Given a polynomial in $x$ such as $f(x) = a_n x^n + a_{n-1} x^{n-1} + ... + a_0$ multiplying by $x$ amounts to shifting the coefficients up one 
-degree.  In this case $f(x) \cdot x = a_n x^{n+1} + a_{n-1} x^n + ... + a_0 x$.  From a scalar basis point of view multiplying by $x$ is equivalent to
-multiplying by the integer $\beta$.  
-
-\newpage\begin{figure}[!here]
-\begin{small}
-\begin{center}
-\begin{tabular}{l}
-\hline Algorithm \textbf{mp\_lshd}. \\
-\textbf{Input}.   One mp\_int $a$ and an integer $b$ \\
-\textbf{Output}.  $a \leftarrow a \cdot \beta^b$ (equivalent to multiplication by $x^b$). \\
-\hline \\
-1.  If $b \le 0$ then return(\textit{MP\_OKAY}). \\
-2.  If $a.alloc < a.used + b$ then grow $a$ to at least $a.used + b$ digits.  (\textit{mp\_grow}). \\
-3.  If the reallocation failed return(\textit{MP\_MEM}). \\
-4.  $a.used \leftarrow a.used + b$ \\
-5.  $i \leftarrow a.used - 1$ \\
-6.  $j \leftarrow a.used - 1 - b$ \\
-7.  for $n$ from $a.used - 1$ to $b$ do \\
-\hspace{3mm}7.1  $a_{i} \leftarrow a_{j}$ \\
-\hspace{3mm}7.2  $i \leftarrow i - 1$ \\
-\hspace{3mm}7.3  $j \leftarrow j - 1$ \\
-8.  for $n$ from 0 to $b - 1$ do \\
-\hspace{3mm}8.1  $a_n \leftarrow 0$ \\
-9.  Return(\textit{MP\_OKAY}). \\
-\hline
-\end{tabular}
-\end{center}
-\end{small}
-\caption{Algorithm mp\_lshd}
-\end{figure}
-
-\textbf{Algorithm mp\_lshd.}
-This algorithm multiplies an mp\_int by the $b$'th power of $x$.  This is equivalent to multiplying by $\beta^b$.  The algorithm differs 
-from the other algorithms presented so far as it performs the operation in place instead of storing the result in a separate location.  The
-motivation behind this change is due to the way this function is typically used.  Algorithms such as mp\_add store the result in an optionally
-different third mp\_int because the original inputs are often still required.  Algorithm mp\_lshd (\textit{and similarly algorithm mp\_rshd}) is
-typically used on values where the original value is no longer required.  The algorithm will return success immediately if 
-$b \le 0$ since the rest of the algorithm is only valid when $b > 0$.  
-
-First the destination $a$ is grown as required to accommodate the result.  The counters $i$ and $j$ are used to form a \textit{sliding window} over
-the digits of $a$ of length $b$.  The head of the sliding window is at $i$ (\textit{the leading digit}) and the tail at $j$ (\textit{the trailing digit}).  
-The loop on step 7 copies the digit from the tail to the head.  In each iteration the window is moved down one digit.   The last loop on 
-step 8 sets the lower $b$ digits to zero.
-
-\newpage
-FIGU,sliding_window,Sliding Window Movement
-
-EXAM,bn_mp_lshd.c
-
-The if statement on line @24,if@ ensures that the $b$ variable is greater than zero.  The \textbf{used} count is incremented by $b$ before
-the copy loop begins.  This eliminates the need for an additional variable in the for loop.  The variable $top$ on line @42,top@ is an alias
-for the leading digit while $bottom$ on line @45,bottom@ is an alias for the trailing edge.  The aliases form a window of exactly $b$ digits
-over the input.  
-
-\subsection{Division by $x$}
-
-Division by powers of $x$ is easily achieved by shifting the digits right and removing any that will end up to the right of the zero'th digit.  
-
-\newpage\begin{figure}[!here]
-\begin{small}
-\begin{center}
-\begin{tabular}{l}
-\hline Algorithm \textbf{mp\_rshd}. \\
-\textbf{Input}.   One mp\_int $a$ and an integer $b$ \\
-\textbf{Output}.  $a \leftarrow a / \beta^b$ (Divide by $x^b$). \\
-\hline \\
-1.  If $b \le 0$ then return. \\
-2.  If $a.used \le b$ then do \\
-\hspace{3mm}2.1  Zero $a$.  (\textit{mp\_zero}). \\
-\hspace{3mm}2.2  Return. \\
-3.  $i \leftarrow 0$ \\
-4.  $j \leftarrow b$ \\
-5.  for $n$ from 0 to $a.used - b - 1$ do \\
-\hspace{3mm}5.1  $a_i \leftarrow a_j$ \\
-\hspace{3mm}5.2  $i \leftarrow i + 1$ \\
-\hspace{3mm}5.3  $j \leftarrow j + 1$ \\
-6.  for $n$ from $a.used - b$ to $a.used - 1$ do \\
-\hspace{3mm}6.1  $a_n \leftarrow 0$ \\
-7.  $a.used \leftarrow a.used - b$ \\
-8.  Return. \\
-\hline
-\end{tabular}
-\end{center}
-\end{small}
-\caption{Algorithm mp\_rshd}
-\end{figure}
-
-\textbf{Algorithm mp\_rshd.}
-This algorithm divides the input in place by the $b$'th power of $x$.  It is analogous to dividing by $\beta^b$ but much quicker since
-it does not require single precision division.  This algorithm does not actually return an error code as it cannot fail.  
-
-If the input $b$ is less than one the algorithm quickly returns without performing any work.  If the \textbf{used} count is less than or equal
-to the shift count $b$ then it will simply zero the input and return.
-
-After the trivial cases of inputs have been handled the sliding window is setup.  Much like the case of algorithm mp\_lshd a sliding window that
-is $b$ digits wide is used to copy the digits.  Unlike mp\_lshd the window slides in the opposite direction from the trailing to the leading digit.  
-Also the digits are copied from the leading to the trailing edge.
-
-Once the window copy is complete the upper digits must be zeroed and the \textbf{used} count decremented.
-
-EXAM,bn_mp_rshd.c
-
-The only noteworthy element of this routine is the lack of a return type.  
-
--- Will update later to give it a return type...Tom
-
-\section{Powers of Two}
-
-Now that algorithms for moving single bits as well as whole digits exist algorithms for moving the ``in between'' distances are required.  For 
-example, to quickly multiply by $2^k$ for any $k$ without using a full multiplier algorithm would prove useful.  Instead of performing single
-shifts $k$ times to achieve a multiplication by $2^{\pm k}$ a mixture of whole digit shifting and partial digit shifting is employed.  
-
-\subsection{Multiplication by Power of Two}
-
-\newpage\begin{figure}[!here]
-\begin{small}
-\begin{center}
-\begin{tabular}{l}
-\hline Algorithm \textbf{mp\_mul\_2d}. \\
-\textbf{Input}.   One mp\_int $a$ and an integer $b$ \\
-\textbf{Output}.  $c \leftarrow a \cdot 2^b$. \\
-\hline \\
-1.  $c \leftarrow a$.  (\textit{mp\_copy}) \\
-2.  If $c.alloc < c.used + \lfloor b / lg(\beta) \rfloor + 2$ then grow $c$ accordingly. \\
-3.  If the reallocation failed return(\textit{MP\_MEM}). \\
-4.  If $b \ge lg(\beta)$ then \\
-\hspace{3mm}4.1  $c \leftarrow c \cdot \beta^{\lfloor b / lg(\beta) \rfloor}$ (\textit{mp\_lshd}). \\
-\hspace{3mm}4.2  If step 4.1 failed return(\textit{MP\_MEM}). \\
-5.  $d \leftarrow b \mbox{ (mod }lg(\beta)\mbox{)}$ \\
-6.  If $d \ne 0$ then do \\
-\hspace{3mm}6.1  $mask \leftarrow 2^d$ \\
-\hspace{3mm}6.2  $r \leftarrow 0$ \\
-\hspace{3mm}6.3  for $n$ from $0$ to $c.used - 1$ do \\
-\hspace{6mm}6.3.1  $rr \leftarrow c_n >> (lg(\beta) - d) \mbox{ (mod }mask\mbox{)}$ \\
-\hspace{6mm}6.3.2  $c_n \leftarrow (c_n << d) + r \mbox{ (mod }\beta\mbox{)}$ \\
-\hspace{6mm}6.3.3  $r \leftarrow rr$ \\
-\hspace{3mm}6.4  If $r > 0$ then do \\
-\hspace{6mm}6.4.1  $c_{c.used} \leftarrow r$ \\
-\hspace{6mm}6.4.2  $c.used \leftarrow c.used + 1$ \\
-7.  Return(\textit{MP\_OKAY}). \\
-\hline
-\end{tabular}
-\end{center}
-\end{small}
-\caption{Algorithm mp\_mul\_2d}
-\end{figure}
-
-\textbf{Algorithm mp\_mul\_2d.}
-This algorithm multiplies $a$ by $2^b$ and stores the result in $c$.  The algorithm uses algorithm mp\_lshd and a derivative of algorithm mp\_mul\_2 to
-quickly compute the product.
-
-First the algorithm will multiply $a$ by $x^{\lfloor b / lg(\beta) \rfloor}$ which will ensure that the remainder multiplicand is less than 
-$\beta$.  For example, if $b = 37$ and $\beta = 2^{28}$ then this step will multiply by $x$ leaving a multiplication by $2^{37 - 28} = 2^{9}$ 
-left.
-
-After the digits have been shifted appropriately at most $lg(\beta) - 1$ shifts are left to perform.  Step 5 calculates the number of remaining shifts 
-required.  If it is non-zero a modified shift loop is used to calculate the remaining product.  
-Essentially the loop is a generic version of algorithm mp\_mul\_2 designed to handle any shift count in the range $1 \le d < lg(\beta)$.  The $mask$
-variable is used to extract the upper $d$ bits to form the carry for the next iteration.  
-
-This algorithm is loosely measured as a $O(2n)$ algorithm which means that if the input is $n$-digits that it takes $2n$ ``time'' to 
-complete.  It is possible to optimize this algorithm down to a $O(n)$ algorithm at a cost of making the algorithm slightly harder to follow.
-
-EXAM,bn_mp_mul_2d.c
-
-Notes to be revised when code is updated. -- Tom
-
-\subsection{Division by Power of Two}
-
-\newpage\begin{figure}[!here]
-\begin{small}
-\begin{center}
-\begin{tabular}{l}
-\hline Algorithm \textbf{mp\_div\_2d}. \\
-\textbf{Input}.   One mp\_int $a$ and an integer $b$ \\
-\textbf{Output}.  $c \leftarrow \lfloor a / 2^b \rfloor, d \leftarrow a \mbox{ (mod }2^b\mbox{)}$. \\
-\hline \\
-1.  If $b \le 0$ then do \\
-\hspace{3mm}1.1  $c \leftarrow a$ (\textit{mp\_copy}) \\
-\hspace{3mm}1.2  $d \leftarrow 0$ (\textit{mp\_zero}) \\
-\hspace{3mm}1.3  Return(\textit{MP\_OKAY}). \\
-2.  $c \leftarrow a$ \\
-3.  $d \leftarrow a \mbox{ (mod }2^b\mbox{)}$ (\textit{mp\_mod\_2d}) \\
-4.  If $b \ge lg(\beta)$ then do \\
-\hspace{3mm}4.1  $c \leftarrow \lfloor c/\beta^{\lfloor b/lg(\beta) \rfloor} \rfloor$ (\textit{mp\_rshd}). \\
-5.  $k \leftarrow b \mbox{ (mod }lg(\beta)\mbox{)}$ \\
-6.  If $k \ne 0$ then do \\
-\hspace{3mm}6.1  $mask \leftarrow 2^k$ \\
-\hspace{3mm}6.2  $r \leftarrow 0$ \\
-\hspace{3mm}6.3  for $n$ from $c.used - 1$ to $0$ do \\
-\hspace{6mm}6.3.1  $rr \leftarrow c_n \mbox{ (mod }mask\mbox{)}$ \\
-\hspace{6mm}6.3.2  $c_n \leftarrow (c_n >> k) + (r << (lg(\beta) - k))$ \\
-\hspace{6mm}6.3.3  $r \leftarrow rr$ \\
-7.  Clamp excess digits of $c$.  (\textit{mp\_clamp}) \\
-8.  Return(\textit{MP\_OKAY}). \\
-\hline
-\end{tabular}
-\end{center}
-\end{small}
-\caption{Algorithm mp\_div\_2d}
-\end{figure}
-
-\textbf{Algorithm mp\_div\_2d.}
-This algorithm will divide an input $a$ by $2^b$ and produce the quotient and remainder.  The algorithm is designed much like algorithm 
-mp\_mul\_2d by first using whole digit shifts then single precision shifts.  This algorithm will also produce the remainder of the division
-by using algorithm mp\_mod\_2d.
-
-EXAM,bn_mp_div_2d.c
-
-The implementation of algorithm mp\_div\_2d is slightly different than the algorithm specifies.  The remainder $d$ may be optionally 
-ignored by passing \textbf{NULL} as the pointer to the mp\_int variable.    The temporary mp\_int variable $t$ is used to hold the 
-result of the remainder operation until the end.  This allows $d$ and $a$ to represent the same mp\_int without modifying $a$ before
-the quotient is obtained.
-
-The remainder of the source code is essentially the same as the source code for mp\_mul\_2d.  (-- Fix this paragraph up later, Tom).
-
-\subsection{Remainder of Division by Power of Two}
-
-The last algorithm in the series of polynomial basis power of two algorithms is calculating the remainder of division by $2^b$.  This
-algorithm benefits from the fact that in twos complement arithmetic $a \mbox{ (mod }2^b\mbox{)}$ is the same as $a$ AND $2^b - 1$.  
-
-\begin{figure}[!here]
-\begin{small}
-\begin{center}
-\begin{tabular}{l}
-\hline Algorithm \textbf{mp\_mod\_2d}. \\
-\textbf{Input}.   One mp\_int $a$ and an integer $b$ \\
-\textbf{Output}.  $c \leftarrow a \mbox{ (mod }2^b\mbox{)}$. \\
-\hline \\
-1.  If $b \le 0$ then do \\
-\hspace{3mm}1.1  $c \leftarrow 0$ (\textit{mp\_zero}) \\
-\hspace{3mm}1.2  Return(\textit{MP\_OKAY}). \\
-2.  If $b > a.used \cdot lg(\beta)$ then do \\
-\hspace{3mm}2.1  $c \leftarrow a$ (\textit{mp\_copy}) \\
-\hspace{3mm}2.2  Return the result of step 2.1. \\
-3.  $c \leftarrow a$ \\
-4.  If step 3 failed return(\textit{MP\_MEM}). \\
-5.  for $n$ from $\lceil b / lg(\beta) \rceil$ to $c.used$ do \\
-\hspace{3mm}5.1  $c_n \leftarrow 0$ \\
-6.  $k \leftarrow b \mbox{ (mod }lg(\beta)\mbox{)}$ \\
-7.  $c_{\lfloor b / lg(\beta) \rfloor} \leftarrow c_{\lfloor b / lg(\beta) \rfloor} \mbox{ (mod }2^{k}\mbox{)}$. \\
-8.  Clamp excess digits of $c$.  (\textit{mp\_clamp}) \\
-9.  Return(\textit{MP\_OKAY}). \\
-\hline
-\end{tabular}
-\end{center}
-\end{small}
-\caption{Algorithm mp\_mod\_2d}
-\end{figure}
-
-\textbf{Algorithm mp\_mod\_2d.}
-This algorithm will quickly calculate the value of $a \mbox{ (mod }2^b\mbox{)}$.  First if $b$ is less than or equal to zero the 
-result is set to zero.  If $b$ is greater than the number of bits in $a$ then it simply copies $a$ to $c$ and returns.  Otherwise, $a$ 
-is copied to $c$, leading digits are removed and the remaining leading digit is trimmed to the exact bit count.
-
-EXAM,bn_mp_mod_2d.c
-
--- Add comments later, Tom.
-
-\section*{Exercises}
-\begin{tabular}{cl}
-$\left [ 3 \right ] $ & Devise an algorithm that performs $a \cdot 2^b$ for generic values of $b$ \\
-                      & in $O(n)$ time. \\
-                      &\\
-$\left [ 3 \right ] $ & Devise an efficient algorithm to multiply by small low hamming  \\
-                      & weight values such as $3$, $5$ and $9$.  Extend it to handle all values \\
-                      & up to $64$ with a hamming weight less than three. \\
-                      &\\
-$\left [ 2 \right ] $ & Modify the preceding algorithm to handle values of the form \\
-                      & $2^k - 1$ as well. \\
-                      &\\
-$\left [ 3 \right ] $ & Using only algorithms mp\_mul\_2, mp\_div\_2 and mp\_add create an \\
-                      & algorithm to multiply two integers in roughly $O(2n^2)$ time for \\
-                      & any $n$-bit input.  Note that the time of addition is ignored in the \\
-                      & calculation.  \\
-                      & \\
-$\left [ 5 \right ] $ & Improve the previous algorithm to have a working time of at most \\
-                      & $O \left (2^{(k-1)}n + \left ({2n^2 \over k} \right ) \right )$ for an appropriate choice of $k$.  Again ignore \\
-                      & the cost of addition. \\
-                      & \\
-$\left [ 2 \right ] $ & Devise a chart to find optimal values of $k$ for the previous problem \\
-                      & for $n = 64 \ldots 1024$ in steps of $64$. \\
-                      & \\
-$\left [ 2 \right ] $ & Using only algorithms mp\_abs and mp\_sub devise another method for \\
-                      & calculating the result of a signed comparison. \\
-                      &
-\end{tabular}
-
-\chapter{Multiplication and Squaring}
-\section{The Multipliers}
-For most number theoretic problems including certain public key cryptographic algorithms, the ``multipliers'' form the most important subset of 
-algorithms of any multiple precision integer package.  The set of multiplier algorithms include integer multiplication, squaring and modular reduction 
-where in each of the algorithms single precision multiplication is the dominant operation performed.  This chapter will discuss integer multiplication 
-and squaring, leaving modular reductions for the subsequent chapter.  
-
-The importance of the multiplier algorithms is for the most part driven by the fact that certain popular public key algorithms are based on modular 
-exponentiation, that is computing $d \equiv a^b \mbox{ (mod }c\mbox{)}$ for some arbitrary choice of $a$, $b$, $c$ and $d$.  During a modular
-exponentiation the majority\footnote{Roughly speaking a modular exponentiation will spend about 40\% of the time performing modular reductions, 
-35\% of the time performing squaring and 25\% of the time performing multiplications.} of the processor time is spent performing single precision 
-multiplications.
-
-For centuries general purpose multiplication has required a lengthy $O(n^2)$ process, whereby each digit of one multiplicand has to be multiplied 
-against every digit of the other multiplicand.  Traditional long-hand multiplication is based on this process;  while the techniques can differ the 
-overall algorithm used is essentially the same.  Only ``recently'' have faster algorithms been studied.  First Karatsuba multiplication was discovered in 
-1962.  This algorithm can multiply two numbers with considerably fewer single precision multiplications when compared to the long-hand approach.  
-This technique led to the discovery of polynomial basis algorithms (\textit{good reference?}) and subsequently Fourier Transform based solutions.  
-
-\section{Multiplication}
-\subsection{The Baseline Multiplication}
-\index{baseline multiplication}
-Computing the product of two integers in software can be achieved using a trivial adaptation of the standard $O(n^2)$ long-hand multiplication
-algorithm that school children are taught.  The algorithm is considered an $O(n^2)$ algorithm since for two $n$-digit inputs $n^2$ single precision 
-multiplications are required.  More specifically for a $m$ and $n$ digit input $m \cdot n$ single precision multiplications are required.  To 
-simplify most discussions, it will be assumed that the inputs have comparable number of digits.  
-
-The ``baseline multiplication'' algorithm is designed to act as the ``catch-all'' algorithm, only to be used when the faster algorithms cannot be 
-used.  This algorithm does not use any particularly interesting optimizations and should ideally be avoided if possible.    One important 
-facet of this algorithm, is that it has been modified to only produce a certain amount of output digits as resolution.  The importance of this 
-modification will become evident during the discussion of Barrett modular reduction.  Recall that for a $n$ and $m$ digit input the product 
-will be at most $n + m$ digits.  Therefore, this algorithm can be reduced to a full multiplier by having it produce $n + m$ digits of the product.  
-
-Recall from ~GAMMA~ the definition of $\gamma$ as the number of bits in the type \textbf{mp\_digit}.  We shall now extend the variable set to 
-include $\alpha$ which shall represent the number of bits in the type \textbf{mp\_word}.  This implies that $2^{\alpha} > 2 \cdot \beta^2$.  The 
-constant $\delta = 2^{\alpha - 2lg(\beta)}$ will represent the maximal weight of any column in a product (\textit{see ~COMBA~ for more information}).
-
-\newpage\begin{figure}[!here]
-\begin{small}
-\begin{center}
-\begin{tabular}{l}
-\hline Algorithm \textbf{s\_mp\_mul\_digs}. \\
-\textbf{Input}.   mp\_int $a$, mp\_int $b$ and an integer $digs$ \\
-\textbf{Output}.  $c \leftarrow \vert a \vert \cdot \vert b \vert \mbox{ (mod }\beta^{digs}\mbox{)}$. \\
-\hline \\
-1.  If min$(a.used, b.used) < \delta$ then do \\
-\hspace{3mm}1.1  Calculate $c = \vert a \vert \cdot \vert b \vert$ by the Comba method (\textit{see algorithm~\ref{fig:COMBAMULT}}).  \\
-\hspace{3mm}1.2  Return the result of step 1.1 \\
-\\
-Allocate and initialize a temporary mp\_int. \\
-2.  Init $t$ to be of size $digs$ \\
-3.  If step 2 failed return(\textit{MP\_MEM}). \\
-4.  $t.used \leftarrow digs$ \\
-\\
-Compute the product. \\
-5.  for $ix$ from $0$ to $a.used - 1$ do \\
-\hspace{3mm}5.1  $u \leftarrow 0$ \\
-\hspace{3mm}5.2  $pb \leftarrow \mbox{min}(b.used, digs - ix)$ \\
-\hspace{3mm}5.3  If $pb < 1$ then goto step 6. \\
-\hspace{3mm}5.4  for $iy$ from $0$ to $pb - 1$ do \\
-\hspace{6mm}5.4.1  $\hat r \leftarrow t_{iy + ix} + a_{ix} \cdot b_{iy} + u$ \\
-\hspace{6mm}5.4.2  $t_{iy + ix} \leftarrow \hat r \mbox{ (mod }\beta\mbox{)}$ \\
-\hspace{6mm}5.4.3  $u \leftarrow \lfloor \hat r / \beta \rfloor$ \\
-\hspace{3mm}5.5  if $ix + pb < digs$ then do \\
-\hspace{6mm}5.5.1  $t_{ix + pb} \leftarrow u$ \\
-6.  Clamp excess digits of $t$. \\
-7.  Swap $c$ with $t$ \\
-8.  Clear $t$ \\
-9.  Return(\textit{MP\_OKAY}). \\
-\hline
-\end{tabular}
-\end{center}
-\end{small}
-\caption{Algorithm s\_mp\_mul\_digs}
-\end{figure}
-
-\textbf{Algorithm s\_mp\_mul\_digs.}
-This algorithm computes the unsigned product of two inputs $a$ and $b$, limited to an output precision of $digs$ digits.  While it may seem
-a bit awkward to modify the function from its simple $O(n^2)$ description, the usefulness of partial multipliers will arise in a subsequent 
-algorithm.  The algorithm is loosely based on algorithm 14.12 from \cite[pp. 595]{HAC} and is similar to Algorithm M of Knuth \cite[pp. 268]{TAOCPV2}.  
-Algorithm s\_mp\_mul\_digs differs from these cited references since it can produce a variable output precision regardless of the precision of the 
-inputs.
-
-The first thing this algorithm checks for is whether a Comba multiplier can be used instead.   If the minimum digit count of either
-input is less than $\delta$, then the Comba method may be used instead.    After the Comba method is ruled out, the baseline algorithm begins.  A 
-temporary mp\_int variable $t$ is used to hold the intermediate result of the product.  This allows the algorithm to be used to 
-compute products when either $a = c$ or $b = c$ without overwriting the inputs.  
-
-All of step 5 is the infamous $O(n^2)$ multiplication loop slightly modified to only produce up to $digs$ digits of output.  The $pb$ variable
-is given the count of digits to read from $b$ inside the nested loop.  If $pb < 1$ then no more output digits can be produced and the algorithm
-will exit the loop.  The best way to think of the loops is as a series of $pb \times 1$ multiplications.    That is, in each pass of the 
-innermost loop $a_{ix}$ is multiplied against $b$ and the result is added (\textit{with an appropriate shift}) to $t$.  
-
-For example, consider multiplying $576$ by $241$.  That is equivalent to computing $10^0(1)(576) + 10^1(4)(576) + 10^2(2)(576)$ which is best
-visualized in the following table.
-
-\begin{figure}[here]
-\begin{center}
-\begin{tabular}{|c|c|c|c|c|c|l|}
-\hline   &&          & 5 & 7 & 6 & \\
-\hline   $\times$&&  & 2 & 4 & 1 & \\
-\hline &&&&&&\\
-  &&          & 5 & 7 & 6 & $10^0(1)(576)$ \\
-  &2 &   3    & 6 & 1 & 6 & $10^1(4)(576) + 10^0(1)(576)$ \\
-  1 & 3 & 8 & 8 & 1 & 6 &   $10^2(2)(576) + 10^1(4)(576) + 10^0(1)(576)$ \\
-\hline  
-\end{tabular}
-\end{center}
-\caption{Long-Hand Multiplication Diagram}
-\end{figure}
-
-Each row of the product is added to the result after being shifted to the left (\textit{multiplied by a power of the radix}) by the appropriate 
-count.  That is in pass $ix$ of the inner loop the product is added starting at the $ix$'th digit of the result.
-
-Step 5.4.1 introduces the hat symbol (\textit{e.g. $\hat r$}) which represents a double precision variable.  The multiplication on that step
-is assumed to be a double wide output single precision multiplication.  That is, two single precision variables are multiplied to produce a
-double precision result.  The step is somewhat optimized from a long-hand multiplication algorithm because the carry from the addition in step
-5.4.1 is propagated through the nested loop.  If the carry was not propagated immediately it would overflow the single precision digit 
-$t_{ix+iy}$ and the result would be lost.  
-
-At step 5.5 the nested loop is finished and any carry that was left over should be forwarded.  The carry does not have to be added to the $ix+pb$'th
-digit since that digit is assumed to be zero at this point.  However, if $ix + pb \ge digs$ the carry is not set as it would make the result
-exceed the precision requested.
-
-EXAM,bn_s_mp_mul_digs.c
-
-Lines @31,if@ to @35,}@ determine if the Comba method can be used first.  The conditions for using the Comba routine are that min$(a.used, b.used) < \delta$ and
-the number of digits of output is less than \textbf{MP\_WARRAY}.  This new constant is used to control 
-the stack usage in the Comba routines.  By default it is set to $\delta$ but can be reduced when memory is at a premium.
-
-Of particular importance is the calculation of the $ix+iy$'th column on lines @64,mp_word@, @65,mp_word@ and @66,mp_word@.  Note how all of the
-variables are cast to the type \textbf{mp\_word}, which is also the type of variable $\hat r$.  That is to ensure that double precision operations 
-are used instead of single precision.  The multiplication on line @65,) * (@ makes use of a specific GCC optimizer behaviour.  On the outset it looks like 
-the compiler will have to use a double precision multiplication to produce the result required.  Such an operation would be horribly slow on most 
-processors and drag this to a crawl.  However, GCC is smart enough to realize that double wide output single precision multipliers can be used.  For 
-example, the instruction ``MUL'' on the x86 processor can multiply two 32-bit values and produce a 64-bit result.  
-
-\subsection{Faster Multiplication by the ``Comba'' Method}
-MARK,COMBA
-
-One of the huge drawbacks of the ``baseline'' algorithms is that at the $O(n^2)$ level the carry must be computed and propagated upwards.  This
-makes the nested loop very sequential and hard to unroll and implement in parallel.  The ``Comba'' \cite{COMBA} method is named after little known 
-(\textit{in cryptographic venues}) Paul G. Comba who described a method of implementing fast multipliers that do not require nested 
-carry fixup operations.  As an interesting aside it seems that Paul Barrett describes a similar technique in
-his 1986 paper \cite{BARRETT} written five years before.
-
-At the heart of the Comba technique is once again the long-hand algorithm.  Except in this case a slight twist is placed on how
-the columns of the result are produced.  In the standard long-hand algorithm rows of products are produced then added together to form the 
-final result.  In the baseline algorithm the columns are added together after each iteration to get the result instantaneously.  
-
-In the Comba algorithm the columns of the result are produced entirely independently of each other.  That is at the $O(n^2)$ level a 
-simple multiplication and addition step is performed.  The carries of the columns are propagated after the nested loop to reduce the amount
-of work required. Succinctly the first step of the algorithm is to compute the product vector $\vec x$ as follows. 
-
-\begin{equation}
-\vec x_n = \sum_{i+j = n} a_ib_j, \forall n \in \lbrace 0, 1, 2, \ldots, i + j \rbrace
-\end{equation}
-
-Where $\vec x_n$ is the $n'th$ column of the output vector.  Consider the following example which computes the vector $\vec x$ for the multiplication
-of $576$ and $241$.  
-
-\newpage\begin{figure}[here]
-\begin{small}
-\begin{center}
-\begin{tabular}{|c|c|c|c|c|c|}
-  \hline &          & 5 & 7 & 6 & First Input\\
-  \hline $\times$ & & 2 & 4 & 1 & Second Input\\
-\hline            &                        & $1 \cdot 5 = 5$   & $1 \cdot 7 = 7$   & $1 \cdot 6 = 6$ & First pass \\
-                  &  $4 \cdot 5 = 20$      & $4 \cdot 7+5=33$  & $4 \cdot 6+7=31$  & 6               & Second pass \\
-   $2 \cdot 5 = 10$ &  $2 \cdot 7 + 20 = 34$ & $2 \cdot 6+33=45$ & 31                & 6             & Third pass \\
-\hline 10 & 34 & 45 & 31 & 6 & Final Result \\   
-\hline   
-\end{tabular}
-\end{center}
-\end{small}
-\caption{Comba Multiplication Diagram}
-\end{figure}
-
-At this point the vector $x = \left < 10, 34, 45, 31, 6 \right >$ is the result of the first step of the Comba multiplier.  
-Now the columns must be fixed by propagating the carry upwards.  The resultant vector will have one extra dimension over the input vector which is
-congruent to adding a leading zero digit.
-
-\begin{figure}[!here]
-\begin{small}
-\begin{center}
-\begin{tabular}{l}
-\hline Algorithm \textbf{Comba Fixup}. \\
-\textbf{Input}.   Vector $\vec x$ of dimension $k$ \\
-\textbf{Output}.  Vector $\vec x$ such that the carries have been propagated. \\
-\hline \\
-1.  for $n$ from $0$ to $k - 1$ do \\
-\hspace{3mm}1.1 $\vec x_{n+1} \leftarrow \vec x_{n+1} + \lfloor \vec x_{n}/\beta \rfloor$ \\
-\hspace{3mm}1.2 $\vec x_{n} \leftarrow \vec x_{n} \mbox{ (mod }\beta\mbox{)}$ \\
-2.  Return($\vec x$). \\
-\hline
-\end{tabular}
-\end{center}
-\end{small}
-\caption{Algorithm Comba Fixup}
-\end{figure}
-
-With that algorithm and $k = 5$ and $\beta = 10$ the following vector is produced $\vec x= \left < 1, 3, 8, 8, 1, 6 \right >$.  In this case 
-$241 \cdot 576$ is in fact $138816$ and the procedure succeeded.  If the algorithm is correct and as will be demonstrated shortly more
-efficient than the baseline algorithm why not simply always use this algorithm?
-
-\subsubsection{Column Weight.}
-At the nested $O(n^2)$ level the Comba method adds the product of two single precision variables to each column of the output 
-independently.  A serious obstacle is if the carry is lost, due to lack of precision before the algorithm has a chance to fix
-the carries.  For example, in the multiplication of two three-digit numbers the third column of output will be the sum of
-three single precision multiplications.  If the precision of the accumulator for the output digits is less than $3 \cdot (\beta - 1)^2$ then
-an overflow can occur and the carry information will be lost.  For any $m$ and $n$ digit inputs the maximum weight of any column is 
-min$(m, n)$ which is fairly obvious.
-
-The maximum number of terms in any column of a product is known as the ``column weight'' and strictly governs when the algorithm can be used.  Recall
-from earlier that a double precision type has $\alpha$ bits of resolution and a single precision digit has $lg(\beta)$ bits of precision.  Given these
-two quantities we must not violate the following
-
-\begin{equation}
-k \cdot \left (\beta - 1 \right )^2 < 2^{\alpha}
-\end{equation}
-
-Which reduces to 
-
-\begin{equation}
-k \cdot \left ( \beta^2 - 2\beta + 1 \right ) < 2^{\alpha}
-\end{equation}
-
-Let $\rho = lg(\beta)$ represent the number of bits in a single precision digit.  By further re-arrangement of the equation the final solution is
-found.
-
-\begin{equation}
-k  < {{2^{\alpha}} \over {\left (2^{2\rho} - 2^{\rho + 1} + 1 \right )}}
-\end{equation}
-
-The defaults for LibTomMath are $\beta = 2^{28}$ and $\alpha = 64$ which means that $k$ is bounded by $k < 257$.  In this configuration 
-the smaller input may not have more than $256$ digits if the Comba method is to be used.  This is quite satisfactory for most applications since 
-$256$ digits would allow for numbers in the range of $0 \le x < 2^{7168}$ which is much larger than most public key cryptographic algorithms require.  
-
-\newpage\begin{figure}[!here]
-\begin{small}
-\begin{center}
-\begin{tabular}{l}
-\hline Algorithm \textbf{fast\_s\_mp\_mul\_digs}. \\
-\textbf{Input}.   mp\_int $a$, mp\_int $b$ and an integer $digs$ \\
-\textbf{Output}.  $c \leftarrow \vert a \vert \cdot \vert b \vert \mbox{ (mod }\beta^{digs}\mbox{)}$. \\
-\hline \\
-Place an array of \textbf{MP\_WARRAY} double precision digits named $\hat W$ on the stack. \\
-1.  If $c.alloc < digs$ then grow $c$ to $digs$ digits. (\textit{mp\_grow}) \\
-2.  If step 1 failed return(\textit{MP\_MEM}).\\
-\\
-Zero the temporary array $\hat W$. \\
-3.  for $n$ from $0$ to $digs - 1$ do \\
-\hspace{3mm}3.1  $\hat W_n \leftarrow 0$ \\
-\\
-Compute the columns. \\
-4.  for $ix$ from $0$ to $a.used - 1$ do \\
-\hspace{3mm}4.1  $pb \leftarrow \mbox{min}(b.used, digs - ix)$ \\
-\hspace{3mm}4.2  If $pb < 1$ then goto step 5. \\
-\hspace{3mm}4.3  for $iy$ from $0$ to $pb - 1$ do \\
-\hspace{6mm}4.3.1  $\hat W_{ix+iy} \leftarrow \hat W_{ix+iy} + a_{ix}b_{iy}$ \\
-\\
-Propagate the carries upwards. \\
-5.  $oldused \leftarrow c.used$ \\
-6.  $c.used \leftarrow digs$ \\
-7.  If $digs > 1$ then do \\
-\hspace{3mm}7.1.  for $ix$ from $1$ to $digs - 1$ do \\
-\hspace{6mm}7.1.1  $\hat W_{ix} \leftarrow \hat W_{ix} + \lfloor \hat W_{ix-1} / \beta \rfloor$ \\
-\hspace{6mm}7.1.2  $c_{ix - 1} \leftarrow \hat W_{ix - 1} \mbox{ (mod }\beta\mbox{)}$ \\
-8.  else do \\
-\hspace{3mm}8.1  $ix \leftarrow 0$ \\
-9.  $c_{ix} \leftarrow \hat W_{ix} \mbox{ (mod }\beta\mbox{)}$ \\
-\\
-Zero excess digits. \\
-10.  If $digs < oldused$ then do \\
-\hspace{3mm}10.1  for $n$ from $digs$ to $oldused - 1$ do \\
-\hspace{6mm}10.1.1  $c_n \leftarrow 0$ \\
-11.  Clamp excessive digits of $c$.  (\textit{mp\_clamp}) \\
-12.  Return(\textit{MP\_OKAY}). \\
-\hline
-\end{tabular}
-\end{center}
-\end{small}
-\caption{Algorithm fast\_s\_mp\_mul\_digs}
-\label{fig:COMBAMULT}
-\end{figure}
-
-\textbf{Algorithm fast\_s\_mp\_mul\_digs.}
-This algorithm performs the unsigned multiplication of $a$ and $b$ using the Comba method limited to $digs$ digits of precision.  The algorithm
-essentially performs the same calculation as algorithm s\_mp\_mul\_digs, just much faster.
-
-The array $\hat W$ is meant to be on the stack when the algorithm is used.  The size of the array does not change which is ideal.  Note also that 
-unlike algorithm s\_mp\_mul\_digs no temporary mp\_int is required since the result is calculated directly in $\hat W$.  
-
-The $O(n^2)$ loop on step four is where the Comba method's advantages begin to show through in comparison to the baseline algorithm.  The lack of
-a carry variable or propagation in this loop allows the loop to be performed with only single precision multiplication and additions.  Now that each
-iteration of the inner loop can be performed independent of the others the inner loop can be performed with a high level of parallelism.
-
-To measure the benefits of the Comba method over the baseline method consider the number of operations that are required.  If the 
-cost in terms of time of a multiply and addition is $p$ and the cost of a carry propagation is $q$ then a baseline multiplication would require 
-$O \left ((p + q)n^2 \right )$ time to multiply two $n$-digit numbers.  The Comba method requires only $O(pn^2 + qn)$ time, however in practice, 
-the speed increase is actually much more.  With $O(n)$ space the algorithm can be reduced to $O(pn + qn)$ time by implementing the $n$ multiply
-and addition operations in the nested loop in parallel.  
-
-EXAM,bn_fast_s_mp_mul_digs.c
-
-The memset on line @47,memset@ clears the initial $\hat W$ array to zero in a single step. Like the slower baseline multiplication
-implementation a series of aliases (\textit{lines @67, tmpx@, @70, tmpy@ and @75,_W@}) are used to simplify the inner $O(n^2)$ loop.  
-In this case a new alias $\_\hat W$ has been added which refers to the double precision columns offset by $ix$ in each pass.  
-
-The inner loop on lines @83,for@, @84,mp_word@ and @85,}@ is where the algorithm will spend the majority of the time, which is why it has been 
-stripped to the bones of any extra baggage\footnote{Hence the pointer aliases.}.  On x86 processors the multiplication and additions amount to at the 
-very least five instructions (\textit{two loads, two additions, one multiply}) while on the ARMv4 processors they amount to only three 
-(\textit{one load, one store, one multiply-add}).   For both of the x86 and ARMv4 processors the GCC compiler performs a good job at unrolling the loop 
-and scheduling the instructions so there are very few dependency stalls.
-
-In theory the difference between the baseline and comba algorithms is a mere $O(qn)$ time difference.  However, in the $O(n^2)$ nested loop of the
-baseline method there are dependency stalls as the algorithm must wait for the multiplier to finish before propagating the carry to the next 
-digit.  As a result fewer of the often multiple execution units\footnote{The AMD Athlon has three execution units and the Intel P4 has four.} can
-be simultaneously used.  
-
-\subsection{Polynomial Basis Multiplication}
-To break the $O(n^2)$ barrier in multiplication requires a completely different look at integer multiplication.  In the following algorithms
-the use of polynomial basis representation for two integers $a$ and $b$ as $f(x) = \sum_{i=0}^{n} a_i x^i$ and  
-$g(x) = \sum_{i=0}^{n} b_i x^i$ respectively, is required.  In this system both $f(x)$ and $g(x)$ have $n + 1$ terms and are of the $n$'th degree.
- 
-The product $a \cdot b \equiv f(x)g(x)$ is the polynomial $W(x) = \sum_{i=0}^{2n} w_i x^i$.  The coefficients $w_i$ will
-directly yield the desired product when $\beta$ is substituted for $x$.  The direct solution to solve for the $2n + 1$ coefficients
-requires $O(n^2)$ time and would in practice be slower than the Comba technique.
-
-However, numerical analysis theory indicates that only $2n + 1$ distinct points in $W(x)$ are required to determine the values of the $2n + 1$ unknown 
-coefficients.   This means by finding $\zeta_y = W(y)$ for $2n + 1$ small values of $y$ the coefficients of $W(x)$ can be found with 
-Gaussian elimination.  This technique is also occasionally referred to as the \textit{interpolation technique} (\textit{references please...}) since in 
-effect an interpolation based on $2n + 1$ points will yield a polynomial equivalent to $W(x)$.  
-
-The coefficients of the polynomial $W(x)$ are unknown which makes finding $W(y)$ for any value of $y$ impossible.  However, since 
-$W(x) = f(x)g(x)$ the equivalent $\zeta_y = f(y) g(y)$ can be used in its place.  The benefit of this technique stems from the 
-fact that $f(y)$ and $g(y)$ are much smaller than either $a$ or $b$ respectively.  As a result finding the $2n + 1$ relations required 
-by multiplying $f(y)g(y)$ involves multiplying integers that are much smaller than either of the inputs.
-
-When picking points to gather relations there are always three obvious points to choose, $y = 0, 1$ and $ \infty$.  The $\zeta_0$ term
-is simply the product $W(0) = w_0 = a_0 \cdot b_0$.  The $\zeta_1$ term is the product 
-$W(1) = \left (\sum_{i = 0}^{n} a_i \right ) \left (\sum_{i = 0}^{n} b_i \right )$.  The third point $\zeta_{\infty}$ is less obvious but rather
-simple to explain.  The $2n + 1$'th coefficient of $W(x)$ is numerically equivalent to the most significant column in an integer multiplication.  
-The point at $\infty$ is used symbolically to represent the most significant column, that is $W(\infty) = w_{2n} = a_nb_n$.  Note that the 
-points at $y = 0$ and $\infty$ yield the coefficients $w_0$ and $w_{2n}$ directly.
-
-If more points are required they should be of small values and powers of two such as $2^q$ and the related \textit{mirror points} 
-$\left (2^q \right )^{2n}  \cdot \zeta_{2^{-q}}$ for small values of $q$.  The term ``mirror point'' stems from the fact that 
-$\left (2^q \right )^{2n}  \cdot \zeta_{2^{-q}}$ can be calculated in the exact opposite fashion as $\zeta_{2^q}$.  For
-example, when $n = 2$ and $q = 1$ the following two equations are equivalent to the point $\zeta_{2}$ and its mirror.
-
-\begin{eqnarray}
-\zeta_{2}                  = f(2)g(2) = (4a_2 + 2a_1 + a_0)(4b_2 + 2b_1 + b_0) \nonumber \\
-16 \cdot \zeta_{1 \over 2} = 4f({1\over 2}) \cdot 4g({1 \over 2}) = (a_2 + 2a_1 + 4a_0)(b_2 + 2b_1 + 4b_0)
-\end{eqnarray}
-
-Using such points will allow the values of $f(y)$ and $g(y)$ to be independently calculated using only left shifts.  For example, when $n = 2$ the
-polynomial $f(2^q)$ is equal to $2^q((2^qa_2) + a_1) + a_0$.  This technique of polynomial representation is known as Horner's method.  
-
-As a general rule of the algorithm when the inputs are split into $n$ parts each there are $2n - 1$ multiplications.  Each multiplication is of 
-multiplicands that have $n$ times fewer digits than the inputs.  The asymptotic running time of this algorithm is 
-$O \left ( k^{lg_n(2n - 1)} \right )$ for $k$ digit inputs (\textit{assuming they have the same number of digits}).  Figure~\ref{fig:exponent}
-summarizes the exponents for various values of $n$.
-
-\begin{figure}
-\begin{center}
-\begin{tabular}{|c|c|c|}
-\hline \textbf{Split into $n$ Parts} & \textbf{Exponent}  & \textbf{Notes}\\
-\hline $2$ & $1.584962501$ & This is Karatsuba Multiplication. \\
-\hline $3$ & $1.464973520$ & This is Toom-Cook Multiplication. \\
-\hline $4$ & $1.403677461$ &\\
-\hline $5$ & $1.365212389$ &\\
-\hline $10$ & $1.278753601$ &\\
-\hline $100$ & $1.149426538$ &\\
-\hline $1000$ & $1.100270931$ &\\
-\hline $10000$ & $1.075252070$ &\\
-\hline
-\end{tabular}
-\end{center}
-\caption{Asymptotic Running Time of Polynomial Basis Multiplication}
-\label{fig:exponent}
-\end{figure}
-
-At first it may seem like a good idea to choose $n = 1000$ since the exponent is approximately $1.1$.  However, the overhead
-of solving for the 2001 terms of $W(x)$ will certainly consume any savings the algorithm could offer for all but exceedingly large
-numbers.  
-
-\subsubsection{Cutoff Point}
-The polynomial basis multiplication algorithms all require fewer single precision multiplications than a straight Comba approach.  However, 
-the algorithms incur an overhead (\textit{at the $O(n)$ work level}) since they require a system of equations to be solved.  This makes the
-polynomial basis approach more costly to use with small inputs.
-
-Let $m$ represent the number of digits in the multiplicands (\textit{assume both multiplicands have the same number of digits}).  There exists a 
-point $y$ such that when $m < y$ the polynomial basis algorithms are more costly than Comba, when $m = y$ they are roughly the same cost and 
-when $m > y$ the Comba methods are slower than the polynomial basis algorithms.  
-
-The exact location of $y$ depends on several key architectural elements of the computer platform in question.
-
-\begin{enumerate}
-\item  The ratio of clock cycles for single precision multiplication versus other simpler operations such as addition, shifting, etc.  For example
-on the AMD Athlon the ratio is roughly $17 : 1$ while on the Intel P4 it is $29 : 1$.  The higher the ratio in favour of multiplication the lower
-the cutoff point $y$ will be.  
-
-\item  The complexity of the linear system of equations (\textit{for the coefficients of $W(x)$}).  Generally speaking as the number of splits
-grows the complexity grows substantially.  Ideally solving the system will only involve addition, subtraction and shifting of integers.  This
-directly reflects on the ratio previously mentioned.
-
-\item  To a lesser extent memory bandwidth and function call overheads.  Provided the values are in the processor cache this is less of an
-influence over the cutoff point.
-
-\end{enumerate}
-
-A clean cutoff point separation occurs when a point $y$ is found such that all of the cutoff point conditions are met.  For example, if the point
-is too low then there will be values of $m$ such that $m > y$ and the Comba method is still faster.  Finding the cutoff points is fairly simple when
-a high resolution timer is available.  
-
-\subsection{Karatsuba Multiplication}
-Karatsuba \cite{KARA} multiplication when originally proposed in 1962 was among the first set of algorithms to break the $O(n^2)$ barrier for
-general purpose multiplication.  Given two polynomial basis representations $f(x) = ax + b$ and $g(x) = cx + d$, Karatsuba proved with 
-light algebra \cite{KARAP} that the following polynomial is equivalent to multiplication of the two integers the polynomials represent.
-
-\begin{equation}
-f(x) \cdot g(x) = acx^2 + (ac + bd - (a - b)(c - d))x + bd
-\end{equation}
-
-Using the observation that $ac$ and $bd$ could be re-used only three half sized multiplications would be required to produce the product.  Applying
-this algorithm recursively, the work factor becomes $O(n^{lg(3)})$ which is substantially better than the work factor $O(n^2)$ of the Comba technique.  It turns 
-out what Karatsuba did not know or at least did not publish was that this is simply polynomial basis multiplication with the points 
-$\zeta_0$, $\zeta_{\infty}$ and $-\zeta_{-1}$.  Consider the resultant system of equations.
-
-\begin{center}
-\begin{tabular}{rcrcrcrc}
-$\zeta_{0}$ &      $=$ &  &  &  & & $w_0$ \\
-$-\zeta_{-1}$ &    $=$ & $-w_2$ & $+$ & $w_1$ & $-$ & $w_0$ \\
-$\zeta_{\infty}$ & $=$ & $w_2$ &  & &  & \\
-\end{tabular}
-\end{center}
-
-By adding the first and last equation to the equation in the middle the term $w_1$ can be isolated and all three coefficients solved for.  The simplicity
-of this system of equations has made Karatsuba fairly popular.  In fact the cutoff point is often fairly low\footnote{With LibTomMath 0.18 it is 70 and 109 digits for the Intel P4 and AMD Athlon respectively.}
-making it an ideal algorithm to speed up certain public key cryptosystems such as RSA and Diffie-Hellman.  It is worth noting that the point 
-$\zeta_1$ could be substituted for $-\zeta_{-1}$.  In this case the first and third row are subtracted instead of added to the second row.  
-
-\newpage\begin{figure}[!here]
-\begin{small}
-\begin{center}
-\begin{tabular}{l}
-\hline Algorithm \textbf{mp\_karatsuba\_mul}. \\
-\textbf{Input}.   mp\_int $a$ and mp\_int $b$ \\
-\textbf{Output}.  $c \leftarrow \vert a \vert \cdot \vert b \vert$ \\
-\hline \\
-1.  Init the following mp\_int variables: $x0$, $x1$, $y0$, $y1$, $t1$, $x0y0$, $x1y1$.\\
-2.  If step 1 failed then return(\textit{MP\_MEM}). \\
-\\
-Split the input.  e.g. $a = x1 \cdot \beta^B + x0$ \\
-3.  $B \leftarrow \mbox{min}(a.used, b.used)/2$ \\
-4.  $x0 \leftarrow a \mbox{ (mod }\beta^B\mbox{)}$ (\textit{mp\_mod\_2d}) \\
-5.  $y0 \leftarrow b \mbox{ (mod }\beta^B\mbox{)}$ \\
-6.  $x1 \leftarrow \lfloor a / \beta^B \rfloor$ (\textit{mp\_rshd}) \\
-7.  $y1 \leftarrow \lfloor b / \beta^B \rfloor$ \\
-\\
-Calculate the three products. \\
-8.  $x0y0 \leftarrow x0 \cdot y0$ (\textit{mp\_mul}) \\
-9.  $x1y1 \leftarrow x1 \cdot y1$ \\
-10.  $t1 \leftarrow x1 - x0$ (\textit{mp\_sub}) \\
-11.  $x0 \leftarrow y1 - y0$ \\
-12.  $t1 \leftarrow t1 \cdot x0$ \\
-\\
-Calculate the middle term. \\
-13.  $x0 \leftarrow x0y0 + x1y1$ \\
-14.  $t1 \leftarrow x0 - t1$ \\
-\\
-Calculate the final product. \\
-15.  $t1 \leftarrow t1 \cdot \beta^B$ (\textit{mp\_lshd}) \\
-16.  $x1y1 \leftarrow x1y1 \cdot \beta^{2B}$ \\
-17.  $t1 \leftarrow x0y0 + t1$ \\
-18.  $c \leftarrow t1 + x1y1$ \\
-19.  Clear all of the temporary variables. \\
-20.  Return(\textit{MP\_OKAY}).\\
-\hline 
-\end{tabular}
-\end{center}
-\end{small}
-\caption{Algorithm mp\_karatsuba\_mul}
-\end{figure}
-
-\textbf{Algorithm mp\_karatsuba\_mul.}
-This algorithm computes the unsigned product of two inputs using the Karatsuba multiplication algorithm.  It is loosely based on the description
-from Knuth \cite[pp. 294-295]{TAOCPV2}.  
-
-\index{radix point}
-In order to split the two inputs into their respective halves, a suitable \textit{radix point} must be chosen.  The radix point chosen must
-be used for both of the inputs meaning that it must be smaller than the smallest input.  Step 3 chooses the radix point $B$ as half of the 
-smallest input \textbf{used} count.  After the radix point is chosen the inputs are split into lower and upper halves.  Step 4 and 5 
-compute the lower halves.  Step 6 and 7 compute the upper halves.  
-
-After the halves have been computed the three intermediate half-size products must be computed.  Step 8 and 9 compute the trivial products
-$x0 \cdot y0$ and $x1 \cdot y1$.  The mp\_int $x0$ is used as a temporary variable after $x1 - x0$ has been computed.  By using $x0$ instead
-of an additional temporary variable, the algorithm can avoid an additional memory allocation operation.
-
-The remaining steps 13 through 18 compute the Karatsuba polynomial through a variety of digit shifting and addition operations.
-
-EXAM,bn_mp_karatsuba_mul.c
-
-The new coding element in this routine, not  seen in previous routines, is the usage of goto statements.  The conventional
-wisdom is that goto statements should be avoided.  This is generally true, however when every single function call can fail, it makes sense
-to handle error recovery with a single piece of code.  Lines @61,if@ to @75,if@ handle initializing all of the temporary variables 
-required.  Note how each of the if statements goes to a different label in case of failure.  This allows the routine to correctly free only
-the temporaries that have been successfully allocated so far.
-
-The temporary variables are all initialized using the mp\_init\_size routine since they are expected to be large.  This saves the 
-additional reallocation that would have been necessary.  Also $x0$, $x1$, $y0$ and $y1$ have to be able to hold at least their respective
-number of digits for the next section of code.
-
-The first algebraic portion of the algorithm is to split the two inputs into their halves.  However, instead of using mp\_mod\_2d and mp\_rshd
-to extract the halves, the respective code has been placed inline within the body of the function.  To initialize the halves, the \textbf{used} and 
-\textbf{sign} members are copied first.  The first for loop on line @98,for@ copies the lower halves.  Since they are both the same magnitude it 
-is simpler to calculate both lower halves in a single loop.  The for loops on lines @104,for@ and @109,for@ calculate the upper halves $x1$ and 
-$y1$ respectively.
-
-By inlining the calculation of the halves, the Karatsuba multiplier has a slightly lower overhead and can be used for smaller magnitude inputs.
-
-When line @152,err@ is reached, the algorithm has completed successfully.  The ``error status'' variable $err$ is set to \textbf{MP\_OKAY} so that
-the same code that handles errors can be used to clear the temporary variables and return.  
-
-\subsection{Toom-Cook $3$-Way Multiplication}
-Toom-Cook $3$-Way \cite{TOOM} multiplication is essentially the polynomial basis algorithm for $n = 3$ except that the points  are 
-chosen such that $\zeta$ is easy to compute and the resulting system of equations easy to reduce.  Here, the points $\zeta_{0}$, 
-$16 \cdot \zeta_{1 \over 2}$, $\zeta_1$, $\zeta_2$ and $\zeta_{\infty}$ make up the five required points to solve for the coefficients 
-of the $W(x)$.
-
-With the five relations that Toom-Cook specifies, the following system of equations is formed.
-
-\begin{center}
-\begin{tabular}{rcrcrcrcrcr}
-$\zeta_0$                    & $=$ & $0w_4$ & $+$ & $0w_3$ & $+$ & $0w_2$ & $+$ & $0w_1$ & $+$ & $1w_0$  \\
-$16 \cdot \zeta_{1 \over 2}$ & $=$ & $1w_4$ & $+$ & $2w_3$ & $+$ & $4w_2$ & $+$ & $8w_1$ & $+$ & $16w_0$  \\
-$\zeta_1$                    & $=$ & $1w_4$ & $+$ & $1w_3$ & $+$ & $1w_2$ & $+$ & $1w_1$ & $+$ & $1w_0$  \\
-$\zeta_2$                    & $=$ & $16w_4$ & $+$ & $8w_3$ & $+$ & $4w_2$ & $+$ & $2w_1$ & $+$ & $1w_0$  \\
-$\zeta_{\infty}$             & $=$ & $1w_4$ & $+$ & $0w_3$ & $+$ & $0w_2$ & $+$ & $0w_1$ & $+$ & $0w_0$  \\
-\end{tabular}
-\end{center}
-
-A trivial solution to this matrix requires $12$ subtractions, two multiplications by a small power of two, two divisions by a small power
-of two, two divisions by three and one multiplication by three.  All of these $19$ sub-operations require less than quadratic time, meaning that
-the algorithm can be faster than a baseline multiplication.  However, the greater complexity of this algorithm places the cutoff point
-(\textbf{TOOM\_MUL\_CUTOFF}) where Toom-Cook becomes more efficient much higher than the Karatsuba cutoff point.  
-
-\begin{figure}[!here]
-\begin{small}
-\begin{center}
-\begin{tabular}{l}
-\hline Algorithm \textbf{mp\_toom\_mul}. \\
-\textbf{Input}.   mp\_int $a$ and mp\_int $b$ \\
-\textbf{Output}.  $c \leftarrow  a  \cdot  b $ \\
-\hline \\
-Split $a$ and $b$ into three pieces.  E.g. $a = a_2 \beta^{2k} + a_1 \beta^{k} + a_0$ \\
-1.  $k \leftarrow \lfloor \mbox{min}(a.used, b.used) / 3 \rfloor$ \\
-2.  $a_0 \leftarrow a \mbox{ (mod }\beta^{k}\mbox{)}$ \\
-3.  $a_1 \leftarrow \lfloor a / \beta^k \rfloor$, $a_1 \leftarrow a_1 \mbox{ (mod }\beta^{k}\mbox{)}$ \\
-4.  $a_2 \leftarrow \lfloor a / \beta^{2k} \rfloor$, $a_2 \leftarrow a_2 \mbox{ (mod }\beta^{k}\mbox{)}$ \\
-5.  $b_0 \leftarrow b \mbox{ (mod }\beta^{k}\mbox{)}$ \\
-6.  $b_1 \leftarrow \lfloor b / \beta^k \rfloor$, $b_1 \leftarrow b_1 \mbox{ (mod }\beta^{k}\mbox{)}$ \\
-7.  $b_2 \leftarrow \lfloor b / \beta^{2k} \rfloor$, $b_2 \leftarrow b_2 \mbox{ (mod }\beta^{k}\mbox{)}$ \\
-\\
-Find the five equations for $w_0, w_1, ..., w_4$. \\
-8.  $w_0 \leftarrow a_0 \cdot b_0$ \\
-9.  $w_4 \leftarrow a_2 \cdot b_2$ \\
-10. $tmp_1 \leftarrow 2 \cdot a_0$, $tmp_1 \leftarrow a_1 + tmp_1$, $tmp_1 \leftarrow 2 \cdot tmp_1$, $tmp_1 \leftarrow tmp_1 + a_2$ \\
-11. $tmp_2 \leftarrow 2 \cdot b_0$, $tmp_2 \leftarrow b_1 + tmp_2$, $tmp_2 \leftarrow 2 \cdot tmp_2$, $tmp_2 \leftarrow tmp_2 + b_2$ \\
-12. $w_1 \leftarrow tmp_1 \cdot tmp_2$ \\
-13. $tmp_1 \leftarrow 2 \cdot a_2$, $tmp_1 \leftarrow a_1 + tmp_1$, $tmp_1 \leftarrow 2 \cdot tmp_1$, $tmp_1 \leftarrow tmp_1 + a_0$ \\
-14. $tmp_2 \leftarrow 2 \cdot b_2$, $tmp_2 \leftarrow b_1 + tmp_2$, $tmp_2 \leftarrow 2 \cdot tmp_2$, $tmp_2 \leftarrow tmp_2 + b_0$ \\
-15. $w_3 \leftarrow tmp_1 \cdot tmp_2$ \\
-16. $tmp_1 \leftarrow a_0 + a_1$, $tmp_1 \leftarrow tmp_1 + a_2$, $tmp_2 \leftarrow b_0 + b_1$, $tmp_2 \leftarrow tmp_2 + b_2$ \\
-17. $w_2 \leftarrow tmp_1 \cdot tmp_2$ \\
-\\
-Continued on the next page.\\
-\hline
-\end{tabular}
-\end{center}
-\end{small}
-\caption{Algorithm mp\_toom\_mul}
-\end{figure}
-
-\newpage\begin{figure}[!here]
-\begin{small}
-\begin{center}
-\begin{tabular}{l}
-\hline Algorithm \textbf{mp\_toom\_mul} (continued). \\
-\textbf{Input}.   mp\_int $a$ and mp\_int $b$ \\
-\textbf{Output}.  $c \leftarrow a \cdot  b $ \\
-\hline \\
-Now solve the system of equations. \\
-18. $w_1 \leftarrow w_1 - w_4$, $w_3 \leftarrow w_3 - w_0$ \\
-19. $w_1 \leftarrow \lfloor w_1 / 2 \rfloor$, $w_3 \leftarrow \lfloor w_3 / 2 \rfloor$ \\
-20. $w_2 \leftarrow w_2 - w_0$, $w_2 \leftarrow w_2 - w_4$ \\
-21. $w_1 \leftarrow w_1 - w_2$, $w_3 \leftarrow w_3 - w_2$ \\
-22. $tmp_1 \leftarrow 8 \cdot w_0$, $w_1 \leftarrow w_1 - tmp_1$, $tmp_1 \leftarrow 8 \cdot w_4$, $w_3 \leftarrow w_3 - tmp_1$ \\
-23. $w_2 \leftarrow 3 \cdot w_2$, $w_2 \leftarrow w_2 - w_1$, $w_2 \leftarrow w_2 - w_3$ \\
-24. $w_1 \leftarrow w_1 - w_2$, $w_3 \leftarrow w_3 - w_2$ \\
-25. $w_1 \leftarrow \lfloor w_1 / 3 \rfloor, w_3 \leftarrow \lfloor w_3 / 3 \rfloor$ \\
-\\
-Now substitute $\beta^k$ for $x$ by shifting $w_0, w_1, ..., w_4$. \\
-26. for $n$ from $1$ to $4$ do \\
-\hspace{3mm}26.1  $w_n \leftarrow w_n \cdot \beta^{nk}$ \\
-27. $c \leftarrow w_0 + w_1$, $c \leftarrow c + w_2$, $c \leftarrow c + w_3$, $c \leftarrow c + w_4$ \\
-28. Return(\textit{MP\_OKAY}) \\
-\hline
-\end{tabular}
-\end{center}
-\end{small}
-\caption{Algorithm mp\_toom\_mul (continued)}
-\end{figure}
-
-\textbf{Algorithm mp\_toom\_mul.}
-This algorithm computes the product of two mp\_int variables $a$ and $b$ using the Toom-Cook approach.  Compared to the Karatsuba multiplication, this 
-algorithm has a lower asymptotic running time of approximately $O(n^{1.464})$ but at an obvious cost in overhead.  In this
-description, several statements have been compounded to save space.  The intention is that the statements are executed from left to right across
-any given step.
-
-The two inputs $a$ and $b$ are first split into three $k$-digit integers $a_0, a_1, a_2$ and $b_0, b_1, b_2$ respectively.  From these smaller
-integers the coefficients of the polynomial basis representations $f(x)$ and $g(x)$ are known and can be used to find the relations required.
-
-The first two relations $w_0$ and $w_4$ are the points $\zeta_{0}$ and $\zeta_{\infty}$ respectively.  The relations $w_1, w_2$ and $w_3$ correspond
-to the points $16 \cdot \zeta_{1 \over 2}, \zeta_{1}$ and $\zeta_{2}$ respectively.  These are found using logical shifts to independently find
-$f(y)$ and $g(y)$ which significantly speeds up the algorithm.
-
-After the five relations $w_0, w_1, \ldots, w_4$ have been computed, the system they represent must be solved in order for the unknown coefficients 
-$w_1, w_2$ and $w_3$ to be isolated.  The steps 18 through 25 perform the system reduction required as previously described.  Each step of
-the reduction represents the comparable matrix operation that would be performed had this been performed by pencil.  For example, step 18 indicates
-that row $4$ must be subtracted from row $1$ and simultaneously row $0$ subtracted from row $3$.  
-
-Once the coefficients have been isolated, the polynomial $W(x) = \sum_{i=0}^{2n} w_i x^i$ is known.  By substituting $\beta^{k}$ for $x$, the integer 
-result $a \cdot b$ is produced.
-
-EXAM,bn_mp_toom_mul.c
-
--- Comments to be added during editing phase.
-
-\subsection{Signed Multiplication}
-Now that algorithms to handle multiplications of every useful dimension have been developed, a rather simple finishing touch is required.  So far all
-of the multiplication algorithms have been unsigned multiplications which leaves only a signed multiplication algorithm to be established.  
-
-\newpage\begin{figure}[!here]
-\begin{small}
-\begin{center}
-\begin{tabular}{l}
-\hline Algorithm \textbf{mp\_mul}. \\
-\textbf{Input}.   mp\_int $a$ and mp\_int $b$ \\
-\textbf{Output}.  $c \leftarrow a \cdot b$ \\
-\hline \\
-1.  If $a.sign = b.sign$ then \\
-\hspace{3mm}1.1  $sign = MP\_ZPOS$ \\
-2.  else \\
-\hspace{3mm}2.1  $sign = MP\_ZNEG$ \\
-3.  If min$(a.used, b.used) \ge TOOM\_MUL\_CUTOFF$ then  \\
-\hspace{3mm}3.1  $c \leftarrow a \cdot b$ using algorithm mp\_toom\_mul \\
-4.  else if min$(a.used, b.used) \ge KARATSUBA\_MUL\_CUTOFF$ then \\
-\hspace{3mm}4.1  $c \leftarrow a \cdot b$ using algorithm mp\_karatsuba\_mul \\
-5.  else \\
-\hspace{3mm}5.1  $digs \leftarrow a.used + b.used + 1$ \\
-\hspace{3mm}5.2  If $digs < MP\_WARRAY$ and min$(a.used, b.used) \le \delta$ then \\
-\hspace{6mm}5.2.1  $c \leftarrow a \cdot b \mbox{ (mod }\beta^{digs}\mbox{)}$ using algorithm fast\_s\_mp\_mul\_digs.  \\
-\hspace{3mm}5.3  else \\
-\hspace{6mm}5.3.1  $c \leftarrow a \cdot b \mbox{ (mod }\beta^{digs}\mbox{)}$ using algorithm s\_mp\_mul\_digs.  \\
-6.  $c.sign \leftarrow sign$ \\
-7.  Return the result of the unsigned multiplication performed. \\
-\hline
-\end{tabular}
-\end{center}
-\end{small}
-\caption{Algorithm mp\_mul}
-\end{figure}
-
-\textbf{Algorithm mp\_mul.}
-This algorithm performs the signed multiplication of two inputs.  It will make use of any of the three unsigned multiplication algorithms 
-available when the input is of appropriate size.  The \textbf{sign} of the result is not set until the end of the algorithm since algorithm
-s\_mp\_mul\_digs will clear it.  
-
-EXAM,bn_mp_mul.c
-
-The implementation is rather simplistic and is not particularly noteworthy.  Line @22,?@ computes the sign of the result using the ``?'' 
-operator from the C programming language.  Line @37,<<@ computes $\delta$ using the fact that $1 << k$ is equal to $2^k$.  
-
-\section{Squaring}
-
-Squaring is a special case of multiplication where both multiplicands are equal.  At first it may seem like there is no significant optimization
-available but in fact there is.  Consider the multiplication of $576$ against $241$.  In total there will be nine single precision multiplications
-performed which are $1\cdot 6$, $1 \cdot 7$, $1 \cdot 5$, $4 \cdot 6$, $4 \cdot 7$, $4 \cdot 5$, $2 \cdot  6$, $2 \cdot 7$ and $2 \cdot 5$.  Now consider 
-the multiplication of $123$ against $123$.  The nine products are $3 \cdot 3$, $3 \cdot 2$, $3 \cdot 1$, $2 \cdot 3$, $2 \cdot 2$, $2 \cdot 1$, 
-$1 \cdot 3$, $1 \cdot 2$ and $1 \cdot 1$.  On closer inspection some of the products are equivalent.  For example, $3 \cdot 2 = 2 \cdot 3$ 
-and $3 \cdot 1 = 1 \cdot 3$. 
-
-For any $n$-digit input, there are ${{\left (n^2 + n \right)}\over 2}$ possible unique single precision multiplications required compared to the $n^2$
-required for multiplication.  The following diagram gives an example of the operations required.
-
-\begin{figure}[here]
-\begin{center}
-\begin{tabular}{ccccc|c}
-&&1&2&3&\\
-$\times$ &&1&2&3&\\
-\hline && $3 \cdot 1$ & $3 \cdot 2$ & $3 \cdot 3$ & Row 0\\
-       & $2 \cdot 1$  & $2 \cdot 2$ & $2 \cdot 3$ && Row 1 \\
-         $1 \cdot 1$  & $1 \cdot 2$ & $1 \cdot 3$ &&& Row 2 \\
-\end{tabular}
-\end{center}
-\caption{Squaring Optimization Diagram}
-\end{figure}
-
-MARK,SQUARE
-Starting from zero and numbering the columns from right to left a very simple pattern becomes obvious.  For the purposes of this discussion let $x$
-represent the number being squared.  The first observation is that in row $k$ the $2k$'th column of the product has a $\left (x_k \right)^2$ term in it.  
-
-The second observation is that every column $j$ in row $k$ where $j \ne 2k$ is part of a double product.  Every non-square term of a column will
-appear twice hence the name ``double product''.  Every odd column is made up entirely of double products.  In fact every column is made up of double 
-products and at most one square (\textit{see the exercise section}).  
-
-The third and final observation is that for row $k$ the first unique non-square term, that is, one that hasn't already appeared in an earlier row, 
-occurs at column $2k + 1$.  For example, on row $1$ of the previous squaring, column one is part of the double product with column one from row zero. 
-Column two of row one is a square and column three is the first unique column.
-
-\subsection{The Baseline Squaring Algorithm}
-The baseline squaring algorithm is meant to be a catch-all squaring algorithm.  It will handle any of the input sizes that the faster routines
-will not handle.  
-
-\newpage\begin{figure}[!here]
-\begin{small}
-\begin{center}
-\begin{tabular}{l}
-\hline Algorithm \textbf{s\_mp\_sqr}. \\
-\textbf{Input}.   mp\_int $a$ \\
-\textbf{Output}.  $b \leftarrow a^2$ \\
-\hline \\
-1.  Init a temporary mp\_int of at least $2 \cdot a.used +1$ digits.  (\textit{mp\_init\_size}) \\
-2.  If step 1 failed return(\textit{MP\_MEM}) \\
-3.  $t.used \leftarrow 2 \cdot a.used + 1$ \\
-4.  For $ix$ from 0 to $a.used - 1$ do \\
-\hspace{3mm}Calculate the square. \\
-\hspace{3mm}4.1  $\hat r \leftarrow t_{2ix} + \left (a_{ix} \right )^2$ \\
-\hspace{3mm}4.2  $t_{2ix} \leftarrow \hat r \mbox{ (mod }\beta\mbox{)}$ \\
-\hspace{3mm}Calculate the double products after the square. \\
-\hspace{3mm}4.3  $u \leftarrow \lfloor \hat r / \beta \rfloor$ \\
-\hspace{3mm}4.4  For $iy$ from $ix + 1$ to $a.used - 1$ do \\
-\hspace{6mm}4.4.1  $\hat r \leftarrow 2 \cdot a_{ix}a_{iy} + t_{ix + iy} + u$ \\
-\hspace{6mm}4.4.2  $t_{ix + iy} \leftarrow \hat r \mbox{ (mod }\beta\mbox{)}$ \\
-\hspace{6mm}4.4.3  $u \leftarrow \lfloor \hat r / \beta \rfloor$ \\
-\hspace{3mm}Set the last carry. \\
-\hspace{3mm}4.5  While $u > 0$ do \\
-\hspace{6mm}4.5.1  $iy \leftarrow iy + 1$ \\
-\hspace{6mm}4.5.2  $\hat r \leftarrow t_{ix + iy} + u$ \\
-\hspace{6mm}4.5.3  $t_{ix + iy} \leftarrow \hat r \mbox{ (mod }\beta\mbox{)}$ \\
-\hspace{6mm}4.5.4  $u \leftarrow \lfloor \hat r / \beta \rfloor$ \\
-5.  Clamp excess digits of $t$.  (\textit{mp\_clamp}) \\
-6.  Exchange $b$ and $t$. \\
-7.  Clear $t$ (\textit{mp\_clear}) \\
-8.  Return(\textit{MP\_OKAY}) \\
-\hline
-\end{tabular}
-\end{center}
-\end{small}
-\caption{Algorithm s\_mp\_sqr}
-\end{figure}
-
-\textbf{Algorithm s\_mp\_sqr.}
-This algorithm computes the square of an input using the three observations on squaring.  It is based fairly faithfully on  algorithm 14.16 of HAC
-\cite[pp.596-597]{HAC}.  Similar to algorithm s\_mp\_mul\_digs, a temporary mp\_int is allocated to hold the result of the squaring.  This allows the 
-destination mp\_int to be the same as the source mp\_int.
-
-The outer loop of this algorithm begins on step 4. It is best to think of the outer loop as walking down the rows of the partial results, while
-the inner loop computes the columns of the partial result.  Steps 4.1 and 4.2 compute the square term for each row, and steps 4.3 and 4.4 propagate
-the carry and compute the double products.  
-
-The requirement that a mp\_word be able to represent the range $0 \le x < 2 \beta^2$ arises from this
-very algorithm.  The product $a_{ix}a_{iy}$ will lie in the range $0 \le x \le \beta^2 - 2\beta + 1$ which is obviously less than $\beta^2$ meaning that
-when it is multiplied by two, it can be properly represented by a mp\_word.
-
-Similar to algorithm s\_mp\_mul\_digs, after every pass of the inner loop, the destination is correctly set to the sum of all of the partial 
-results calculated so far.  This involves expensive carry propagation which will be eliminated in the next algorithm.  
-
-EXAM,bn_s_mp_sqr.c
-
-Inside the outer loop (\textit{see line @32,for@}) the square term is calculated on line @35,r =@.  Line @42,>>@ extracts the carry from the square
-term.  Aliases for $a_{ix}$ and $t_{ix+iy}$ are initialized on lines @45,tmpx@ and @48,tmpt@ respectively.  The doubling is performed using two
-additions (\textit{see line @57,r + r@}) since it is usually faster than shifting, if not at least as fast.  
-
-\subsection{Faster Squaring by the ``Comba'' Method}
-A major drawback to the baseline method is the requirement for single precision shifting inside the $O(n^2)$ nested loop.  Squaring has an additional
-drawback that it must double the product inside the inner loop as well.  As for multiplication, the Comba technique can be used to eliminate these
-performance hazards.
-
-The first obvious solution is to make an array of mp\_words which will hold all of the columns.  This will indeed eliminate all of the carry
-propagation operations from the inner loop.  However, the inner product must still be doubled $O(n^2)$ times.  The solution stems from the simple fact
-that $2a + 2b + 2c = 2(a + b + c)$.  That is the sum of all of the double products is equal to double the sum of all the products.  For example,
-$ab + ba + ac + ca = 2ab + 2ac = 2(ab + ac)$.  
-
-However, we cannot simply double all of the columns, since the squares appear only once per row.  The most practical solution is to have two mp\_word
-arrays.  One array will hold the squares and the other array will hold the double products.  With both arrays the doubling and carry propagation can be 
-moved to a $O(n)$ work level outside the $O(n^2)$ level.  
-
-\newpage\begin{figure}[!here]
-\begin{small}
-\begin{center}
-\begin{tabular}{l}
-\hline Algorithm \textbf{fast\_s\_mp\_sqr}. \\
-\textbf{Input}.   mp\_int $a$ \\
-\textbf{Output}.  $b \leftarrow a^2$ \\
-\hline \\
-Place two arrays of \textbf{MP\_WARRAY} mp\_words named $\hat W$ and $\hat {X}$ on the stack. \\
-1.  If $b.alloc < 2a.used + 1$ then grow $b$ to $2a.used + 1$ digits.  (\textit{mp\_grow}). \\
-2.  If step 1 failed return(\textit{MP\_MEM}). \\
-3.  for $ix$ from $0$ to $2a.used + 1$ do \\
-\hspace{3mm}3.1  $\hat W_{ix} \leftarrow 0$ \\
-\hspace{3mm}3.2  $\hat {X}_{ix} \leftarrow 0$ \\
-4.  for $ix$ from $0$ to $a.used - 1$ do \\
-\hspace{3mm}Compute the square.\\
-\hspace{3mm}4.1  $\hat {X}_{ix+ix} \leftarrow \left ( a_{ix} \right )^2$ \\
-\\
-\hspace{3mm}Compute the double products.\\
-\hspace{3mm}4.2  for $iy$ from $ix + 1$ to $a.used - 1$ do \\
-\hspace{6mm}4.2.1  $\hat W_{ix+iy} \leftarrow \hat W_{ix+iy} + a_{ix}a_{iy}$ \\
-5.  $oldused \leftarrow b.used$ \\
-6.  $b.used \leftarrow 2a.used + 1$ \\
-\\
-Double the products and propagate the carries simultaneously. \\
-7.  $\hat W_0 \leftarrow 2 \hat W_0 + \hat {X}_0$ \\
-8.  for $ix$ from $1$ to $2a.used$ do \\
-\hspace{3mm}8.1 $\hat W_{ix} \leftarrow 2 \hat W_{ix} + \hat {X}_{ix}$ \\
-\hspace{3mm}8.2 $\hat W_{ix} \leftarrow \hat W_{ix} + \lfloor \hat W_{ix - 1} / \beta \rfloor$ \\
-\hspace{3mm}8.3 $b_{ix-1} \leftarrow \hat W_{ix-1} \mbox{ (mod }\beta\mbox{)}$ \\
-9.  $b_{2a.used} \leftarrow \hat W_{2a.used} \mbox{ (mod }\beta\mbox{)}$ \\
-10.  if $2a.used + 1 < oldused$ then do \\
-\hspace{3mm}10.1  for $ix$ from $2a.used + 1$ to $oldused$ do \\
-\hspace{6mm}10.1.1  $b_{ix} \leftarrow 0$ \\
-11.  Clamp excess digits from $b$.  (\textit{mp\_clamp}) \\
-12.  Return(\textit{MP\_OKAY}). \\ 
-\hline
-\end{tabular}
-\end{center}
-\end{small}
-\caption{Algorithm fast\_s\_mp\_sqr}
-\end{figure}
-
-\textbf{Algorithm fast\_s\_mp\_sqr.}
-This algorithm computes the square of an input using the Comba technique.  It is designed to be a replacement for algorithm s\_mp\_sqr when
-the number of input digits is less than \textbf{MP\_WARRAY} and less than $\delta \over 2$.  
-
-This routine requires two arrays of mp\_words to be placed on the stack.  The first array $\hat W$ will hold the double products and the second
-array $\hat X$ will hold the squares.  Though only at most $MP\_WARRAY \over 2$ words of $\hat X$ are used, it has proven faster on most 
-processors to simply make it a full size array.
-
-The loop on step 3 will zero the two arrays to prepare them for the squaring step.  Step 4.1 computes the squares of the product.  Note how 
-it simply assigns the value into the $\hat X$ array.  The nested loop on step 4.2 computes the doubles of the products.  This loop
-computes the sum of the products for each column.  They are not doubled until later.
-
-After the squaring loop, the products stored in $\hat W$ must be doubled and the carries propagated forwards.  It makes sense to do both
-operations at the same time.  The expression $\hat W_{ix} \leftarrow 2 \hat W_{ix} + \hat {X}_{ix}$ computes the sum of the double product and the
-squares in place.  
-
-EXAM,bn_fast_s_mp_sqr.c
-
--- Write something deep and insightful later, Tom.
-
-\subsection{Polynomial Basis Squaring}
-The same algorithm that performs optimal polynomial basis multiplication can be used to perform polynomial basis squaring.  The minor exception
-is that $\zeta_y = f(y)g(y)$ is actually equivalent to $\zeta_y = f(y)^2$ since $f(y) = g(y)$.  Instead of performing $2n + 1$
-multiplications to find the $\zeta$ relations, squaring operations are performed instead.  
-
-\subsection{Karatsuba Squaring}
-Let $f(x) = ax + b$ represent the polynomial basis representation of a number to square.  
-Let $h(x) = \left ( f(x) \right )^2$ represent the square of the polynomial.  The Karatsuba equation can be modified to square a 
-number with the following equation.
-
-\begin{equation}
-h(x) = a^2x^2 + \left (a^2 + b^2 - (a - b)^2 \right )x + b^2
-\end{equation}
-
-Upon closer inspection this equation only requires the calculation of three half-sized squares: $a^2$, $b^2$ and $(a - b)^2$.  As in 
-Karatsuba multiplication, this algorithm can be applied recursively on the input and will achieve an asymptotic running time of 
-$O \left ( n^{lg(3)} \right )$.
-
-You might ask yourself, if the asymptotic time of Karatsuba squaring and multiplication is the same, why not simply use the multiplication algorithm 
-instead?  The answer to this arises from the cutoff point for squaring.  As in multiplication there exists a cutoff point, at which the 
-time required for a Comba based squaring and a Karatsuba based squaring meet.  Due to the overhead inherent in the Karatsuba method, the cutoff 
-point is fairly high.  For example, on an AMD Athlon XP processor with $\beta = 2^{28}$, the cutoff point is around 127 digits.  
-
-Consider squaring a 200 digit number with this technique.  It will be split into two 100 digit halves which are subsequently squared.  
-The 100 digit halves will not be squared using Karatsuba, but instead using the faster Comba based squaring algorithm.  If Karatsuba multiplication
-were used instead, the 100 digit numbers would be squared with a slower Comba based multiplication.  
-
-\newpage\begin{figure}[!here]
-\begin{small}
-\begin{center}
-\begin{tabular}{l}
-\hline Algorithm \textbf{mp\_karatsuba\_sqr}. \\
-\textbf{Input}.   mp\_int $a$ \\
-\textbf{Output}.  $b \leftarrow a^2$ \\
-\hline \\
-1.  Initialize the following temporary mp\_ints:  $x0$, $x1$, $t1$, $t2$, $x0x0$ and $x1x1$. \\
-2.  If any of the initializations on step 1 failed return(\textit{MP\_MEM}). \\
-\\
-Split the input.  e.g. $a = x1\beta^B + x0$ \\
-3.  $B \leftarrow \lfloor a.used / 2 \rfloor$ \\
-4.  $x0 \leftarrow a \mbox{ (mod }\beta^B\mbox{)}$ (\textit{mp\_mod\_2d}) \\
-5.  $x1 \leftarrow \lfloor a / \beta^B \rfloor$ (\textit{mp\_rshd}) \\
-\\
-Calculate the three squares. \\
-6.  $x0x0 \leftarrow x0^2$ (\textit{mp\_sqr}) \\
-7.  $x1x1 \leftarrow x1^2$ \\
-8.  $t1 \leftarrow x1 - x0$ (\textit{mp\_sub}) \\
-9.  $t1 \leftarrow t1^2$ \\
-\\
-Compute the middle term. \\
-10.  $t2 \leftarrow x0x0 + x1x1$ (\textit{s\_mp\_add}) \\
-11.  $t1 \leftarrow t2 - t1$ \\
-\\
-Compute final product. \\
-12.  $t1 \leftarrow t1\beta^B$ (\textit{mp\_lshd}) \\
-13.  $x1x1 \leftarrow x1x1\beta^{2B}$ \\
-14.  $t1 \leftarrow t1 + x0x0$ \\
-15.  $b \leftarrow t1 + x1x1$ \\
-16.  Return(\textit{MP\_OKAY}). \\
-\hline
-\end{tabular}
-\end{center}
-\end{small}
-\caption{Algorithm mp\_karatsuba\_sqr}
-\end{figure}
-
-\textbf{Algorithm mp\_karatsuba\_sqr.}
-This algorithm computes the square of an input $a$ using the Karatsuba technique.  This algorithm is very similar to the Karatsuba based
-multiplication algorithm with the exception that the three half-size multiplications have been replaced with three half-size squarings.
-
-The radix point for squaring is simply placed exactly in the middle of the digits when the input has an even number of digits, otherwise it is
-placed just below the middle.  Steps 3, 4 and 5 compute the two halves required using $B$
-as the radix point.  The first two squares in steps 6 and 7 are rather straightforward while the last square is of a more compact form.
-
-By expanding $\left (x1 - x0 \right )^2$, the $x1^2$ and $x0^2$ terms in the middle disappear, that is $x1^2 + x0^2 - (x1 - x0)^2 = 2 \cdot x0 \cdot x1$.
-Now if $5n$ single precision additions and a squaring of $n$-digits is faster than multiplying two $n$-digit numbers and doubling then
-this method is faster.  Assuming no further recursions occur, the difference can be estimated with the following inequality.
-
-Let $p$ represent the cost of a single precision addition and $q$ the cost of a single precision multiplication both in terms of time\footnote{Or
-machine clock cycles.}. 
-
-\begin{equation}
-5pn +{{q(n^2 + n)} \over 2} \le pn + qn^2
-\end{equation}
-
-For example, on an AMD Athlon XP processor $p = {1 \over 3}$ and $q = 6$.  This implies that the following inequality should hold.
-\begin{center}
-\begin{tabular}{rcl}
-${5n \over 3} + 3n^2 + 3n$     & $<$ & ${n \over 3} + 6n^2$ \\
-${5 \over 3} + 3n + 3$     & $<$ & ${1 \over 3} + 6n$ \\
-${13 \over 9}$     & $<$ & $n$ \\
-\end{tabular}
-\end{center}
-
-This results in a cutoff point around $n = 2$.  As a consequence it is actually faster to compute the middle term the ``long way'' on processors
-where multiplication is substantially slower\footnote{On the Athlon there is a 1:17 ratio between clock cycles for addition and multiplication.  On
-the Intel P4 processor this ratio is 1:29 making this method even more beneficial.  The only common exception is the ARMv4 processor which has a
-ratio of 1:7.  } than simpler operations such as addition.  
-
-EXAM,bn_mp_karatsuba_sqr.c
-
-This implementation is largely based on the implementation of algorithm mp\_karatsuba\_mul.  It uses the same inline style to copy and 
-shift the input into the two halves.  The loop from line @54,{@ to line @70,}@ has been modified since only one input exists.  The \textbf{used}
-count of both $x0$ and $x1$ is fixed up and $x0$ is clamped before the calculations begin.  At this point $x1$ and $x0$ are valid equivalents
-to the respective halves as if mp\_rshd and mp\_mod\_2d had been used.  
-
-By inlining the copy and shift operations the cutoff point for Karatsuba multiplication can be lowered.  On the Athlon the cutoff point
-is exactly at the point where Comba squaring can no longer be used (\textit{128 digits}).  On slower processors such as the Intel P4
-it is actually below the Comba limit (\textit{at 110 digits}).
-
-This routine uses the same error trap coding style as mp\_karatsuba\_mul.  As the temporary variables are initialized errors are redirected to
-the error trap higher up.  If the algorithm completes without error the error code is set to \textbf{MP\_OKAY} and mp\_clears are executed normally.
-
-\textit{Last paragraph sucks.  re-write! -- Tom}
-
-\subsection{Toom-Cook Squaring}
-The Toom-Cook squaring algorithm mp\_toom\_sqr is heavily based on the algorithm mp\_toom\_mul with the exception that squarings are used
-instead of multiplication to find the five relations.  The reader is encouraged to read the description of the latter algorithm and try to 
-derive their own Toom-Cook squaring algorithm.  
-
-\subsection{High Level Squaring}
-\newpage\begin{figure}[!here]
-\begin{small}
-\begin{center}
-\begin{tabular}{l}
-\hline Algorithm \textbf{mp\_sqr}. \\
-\textbf{Input}.   mp\_int $a$ \\
-\textbf{Output}.  $b \leftarrow a^2$ \\
-\hline \\
-1.  If $a.used \ge TOOM\_SQR\_CUTOFF$ then  \\
-\hspace{3mm}1.1  $b \leftarrow a^2$ using algorithm mp\_toom\_sqr \\
-2.  else if $a.used \ge KARATSUBA\_SQR\_CUTOFF$ then \\
-\hspace{3mm}2.1  $b \leftarrow a^2$ using algorithm mp\_karatsuba\_sqr \\
-3.  else \\
-\hspace{3mm}3.1  $digs \leftarrow 2 \cdot a.used + 1$ \\
-\hspace{3mm}3.2  If $digs < MP\_WARRAY$ and $a.used \le \delta$ then \\
-\hspace{6mm}3.2.1  $b \leftarrow a^2$ using algorithm fast\_s\_mp\_sqr.  \\
-\hspace{3mm}3.3  else \\
-\hspace{6mm}3.3.1  $b \leftarrow a^2$ using algorithm s\_mp\_sqr.  \\
-4.  $b.sign \leftarrow MP\_ZPOS$ \\
-5.  Return the result of the unsigned squaring performed. \\
-\hline
-\end{tabular}
-\end{center}
-\end{small}
-\caption{Algorithm mp\_sqr}
-\end{figure}
-
-\textbf{Algorithm mp\_sqr.}
-This algorithm computes the square of the input using one of four different algorithms.  If the input is very large and has at least
-\textbf{TOOM\_SQR\_CUTOFF} or \textbf{KARATSUBA\_SQR\_CUTOFF} digits then either the Toom-Cook or the Karatsuba Squaring algorithm is used.  If
-neither of the polynomial basis algorithms should be used then either the Comba or baseline algorithm is used.  
-
-EXAM,bn_mp_sqr.c
-
-\section*{Exercises}
-\begin{tabular}{cl}
-$\left [ 3 \right ] $ & Devise an efficient algorithm for selection of the radix point to handle inputs \\
-                      & that have different number of digits in Karatsuba multiplication. \\
-                      & \\
-$\left [ 3 \right ] $ & In ~SQUARE~ the fact that every column of a squaring is made up \\
-                      & of double products and at most one square is stated.  Prove this statement. \\
-                      & \\                      
-$\left [ 2 \right ] $ & In the Comba squaring algorithm half of the $\hat X$ variables are not used. \\
-                      & Revise algorithm fast\_s\_mp\_sqr to shrink the $\hat X$ array. \\
-                      & \\
-$\left [ 3 \right ] $ & Prove the equation for Karatsuba squaring. \\
-                      & \\
-$\left [ 1 \right ] $ & Prove that Karatsuba squaring requires $O \left (n^{lg(3)} \right )$ time. \\
-                      & \\ 
-$\left [ 2 \right ] $ & Determine the minimal ratio between addition and multiplication clock cycles \\
-                      & required for equation $6.7$ to be true.  \\
-                      & \\
-\end{tabular}
-
-\chapter{Modular Reduction}
-MARK,REDUCTION
-\section{Basics of Modular Reduction}
-\index{modular residue}
-Modular reduction is an operation that arises quite often within public key cryptography algorithms and various number theoretic algorithms, 
-such as factoring.  Modular reduction algorithms are the third class of algorithms of the ``multipliers'' set.  A number $a$ is said to be reduced 
-modulo another number $b$ by finding the remainder of the division $a/b$.  
-
-Modular reduction is equivalent to solving for $r$ in the following equation.  $a = bq + r$ where $q = \lfloor a/b \rfloor$.  The result 
-$r$ is said to be ``congruent to $a$ modulo $b$'' which is also written as $r \equiv a \mbox{ (mod }b\mbox{)}$.  In other vernacular $r$ is known as the 
-``modular residue'' which leads to ``quadratic residue''\footnote{That's fancy talk for $b \equiv a^2 \mbox{ (mod }p\mbox{)}$.} and
-other forms of residues.  
-
-\index{modulus}
-Modular reductions are normally used to form finite groups such as fields and rings.  For example, in the RSA public key algorithm \cite{RSAPAPER} 
-two private primes $p$ and $q$ are chosen which, when multiplied, form a composite modulus $n = pq$.  When operations such as multiplication and
-squaring are performed on units of the ring $\Z_n$ a finite multiplicative sub-group is formed.
-
-Modular reductions have a variety of other useful properties.  For example, a number $x$ is a square if and only if it is a quadratic
-residue modulo a prime.  With a finite set of primes $B = \left < p_0, p_1, \ldots, p_n \right >$ a quick test for whether $x$ is square or not can 
-be performed\footnote{Provided none of the primes from $B$ divide $x$.}.  Consider figure~\ref{fig:QR}: with the candidate $x = 955621$, a simple 
-set of modular reductions modulo $3, 5, \ldots, 11$ may detect whether $x$ is a square or not.  In this case $955621 \equiv 7 \mbox{ (mod }11\mbox{)}$ 
-and since $7$ is not a quadratic residue modulo $11$ the number $955621$ is not a square.  
-
-\begin{figure}
-\begin{center}
-\begin{tabular}{|c|l|}
-\hline \textbf{Prime} & \textbf{Quadratic Residues} \\
-\hline $3$            & $1$ \\
-\hline $5$            & $1, 4$ \\
-\hline $7$            & $1, 2, 4$ \\
-\hline $11$           & $1, 3, 4, 5, 9$ \\
-\hline
-\end{tabular}
-\end{center}
-\caption{Quadratic Residues for primes less than $13$}
-\label{fig:QR}
-\end{figure}
-
-The most common usage for performance driven modular reductions is in modular exponentiation algorithms.  That is to compute 
-$d = a^b \mbox{ (mod }c\mbox{)}$ as fast as possible.  As will be discussed in the subsequent chapter there exist fast algorithms for computing
-modular exponentiations without having to perform (\textit{in this example}) $b - 1$ multiplications.  These algorithms will produce partial
-results in the range $0 \le x < c^2$ which can be taken advantage of to create several efficient algorithms.
-
-\section{The Barrett Reduction}
-The Barrett reduction algorithm \cite{BARRETT} was inspired by fast division algorithms which multiply by the reciprocal to emulate
-division.  Barrett's observation was that the residue $c$ of $a$ modulo $b$ is equal to 
-
-\begin{equation}
-c = a - b \cdot \lfloor a/b \rfloor
-\end{equation}
-
-Since algorithms such as modular exponentiation would be using the same modulus extensively, typical DSP intuition would indicate the next step 
-would be to replace $a/b$ by a multiplication by the reciprocal.  However, DSP intuition on its own will not work as these numbers are considerably
-larger than the precision of common DSP floating point data types.  It would take another common optimization to optimize the algorithm.
-
-\subsection{Fixed Point Arithmetic}
-The trick used to optimize the above equation is based on a technique of emulating floating point data types with fixed precision integers.  Fixed
-point arithmetic would vastly popularize the ``3d-shooter'' genre of games in the mid 1990s when floating point units were fairly slow.  The idea behind
-fixed point arithmetic is to take a normal $k$-bit integer data type and break it into $p$-bit integer and a $q$-bit fraction part 
-(\textit{where $p+q = k$}).  
-
-In this system a $k$-bit integer $n$ would actually represent $n/2^q$.  For example, with $q = 4$ the integer $n = 37$ would actually represent the
-value $2.3125$.  To multiply two fixed point numbers the integers are multiplied using traditional arithmetic and subsequently normalized. For example, 
-with $q = 4$ to multiply the integers $9$ and $5$ they must be converted to fixed point first by multiplying by $2^q$.  Let $a = 9(2^q)$ 
-represent the fixed point representation of $9$ and $b = 5(2^q)$ represent the fixed point representation of $5$.  The product $ab$ is equal to
-$45(2^{2q})$ which when normalized produces $45(2^q)$.  
-
-Using fixed point arithmetic division can be easily achieved by multiplying by the reciprocal.  If $2^q$ is equivalent to one then $2^q/b$ is 
-equivalent to $1/b$ using real arithmetic.  Using this fact dividing an integer $a$ by another integer $b$ can be achieved with the following
-expression.
-
-\begin{equation}
-\lfloor (a \cdot (\lfloor 2^q / b \rfloor))/2^q \rfloor
-\end{equation}
-
-The precision of the division is proportional to the value of $q$.  If the divisor $b$ is used frequently as is the case with 
-modular exponentiation pre-computing $2^q/b$ will allow a division to be performed with a multiplication and a right shift.  Both operations
-are considerably faster than division on most processors.  
-
-Consider dividing $19$ by $5$.  The correct result is $\lfloor 19/5 \rfloor = 3$.  With $q = 3$ the reciprocal is $\lfloor 2^q/5 \rfloor = 1$ which
-leads to a product of $19$ which when divided by $2^q$ produces $2$.  However, with $q = 4$ the reciprocal is $\lfloor 2^q/5 \rfloor = 3$ and
-the result of the emulated division is $\lfloor 3 \cdot 19 / 2^q \rfloor = 3$ which is correct.  
-
-Plugging this form of division into the original equation the following modular residue equation arises.
-
-\begin{equation}
-c = a - b \cdot \lfloor (a \cdot (\lfloor 2^q / b \rfloor))/2^q \rfloor
-\end{equation}
-
-Using the notation from \cite{BARRETT} the value of $\lfloor 2^q / b \rfloor$ will be represented by the $\mu$ symbol.  Using the $\mu$
-variable also helps re-inforce the idea that it is meant to be computed once and re-used.
-
-\begin{equation}
-c = a - b \cdot \lfloor (a \cdot \mu)/2^q \rfloor
-\end{equation}
-
-Provided that $2^q > b^2$ this algorithm will produce a quotient that is either exactly correct or off by a value of one.  Let $n$ represent
-the number of digits in $b$.  This algorithm requires approximately $2n^2$ single precision multiplications to produce the quotient and 
-another $n^2$ single precision multiplications to find the residue.  In total $3n^2$ single precision multiplications are required to 
-reduce the number.  
-
-For example, if $b = 1179677$ and $q = 41$ ($2^q > b^2$), then the reciprocal $\mu$ is equal to $\lfloor 2^q / b \rfloor = 1864089$.  Consider reducing
-$a = 180388626447$ modulo $b$ using the above reduction equation.  The quotient using the new formula is $\lfloor (a \cdot \mu) / 2^q \rfloor = 152913$.
-By subtracting $152913b$ from $a$ the correct residue $a \equiv 677346 \mbox{ (mod }b\mbox{)}$ is found.
-
-\subsection{Choosing a Radix Point}
-Using the fixed point representation a modular reduction can be performed with $3n^2$ single precision multiplications.  If that were the best
-that could be achieved a full division might as well be used in its place.  The key to optimizing the reduction is to reduce the precision of
-the initial multiplication that finds the quotient.  
-
-Let $a$ represent the number of which the residue is sought.  Let $b$ represent the modulus used to find the residue.  Let $m$ represent
-the number of digits in $b$.  For the purposes of this discussion we will assume that the number of digits in $a$ is $2m$.  Dividing $a$ by 
-$b$ is the same as dividing a $2m$ digit integer by a $m$ digit integer.  Digits below the $m - 1$'th digit of $a$ will contribute at most a value
-of $1$ to the quotient because $\beta^k < b$ for any $0 \le k \le m - 1$.  
-
-Since those digits do not contribute much to the quotient the observation is that they might as well be zero.  However, if the digits 
-``might as well be zero'' they might as well not be there in the first place.  Let $q_0 = \lfloor a/\beta^{m-1} \rfloor$ represent the input
-with the zeroes trimmed.  Now the modular reduction is trimmed to the almost equivalent equation
-
-\begin{equation}
-c = a - b \cdot \lfloor (q_0 \cdot \mu) / \beta^{m+1} \rfloor
-\end{equation}
-
-Note that the original divisor $2^q$ has been replaced with $\beta^{m+1}$. Also note that the exponent on the divisor when added to the amount $q_0$
-was shifted by equals $2m$.  If the optimization had not been performed the divisor would have the exponent $2m$ so in the end the exponents
-do ``add up''. Using the above equation the quotient $\lfloor (q_0 \cdot \mu) / \beta^{m+1} \rfloor$ can be off from the true quotient by at most 
-two implying that $0 \le a - b \cdot \lfloor (q_0 \cdot \mu) / \beta^{m+1} \rfloor < 3b$.  By first subtracting $b$ times the quotient and then 
-conditionally subtracting $b$ once or twice the residue is found.
-
-The quotient is now found using $(m + 1)(m) = m^2 + m$ single precision multiplications and the residue with an additional $m^2$ single
-precision multiplications.  In total $2m^2 + m$ single precision multiplications are required which is considerably faster than the original
-attempt.
-
-For example, let $\beta = 10$ represent the radix of the digits.  Let $b = 9999$ represent the modulus which implies $m = 4$. Let $a = 99929878$ 
-represent the value of which the residue is desired.  In this case $q = 8$ since $10^7 < 9999^2$ meaning that $\mu = \lfloor \beta^{q}/b \rfloor = 10001$.  
-With the new observation the multiplicand for the quotient is equal to $q_0 = \lfloor a / \beta^{m - 1} \rfloor = 99929$.  The quotient is then 
-$\lfloor (q_0 \cdot \mu) / \beta^{m+1} \rfloor = 9993$.  Subtracting $9993b$ from $a$ and the correct residue $a \equiv 9871 \mbox{ (mod }b\mbox{)}$ 
-is found.  
-
-\subsection{Trimming the Quotient}
-So far the reduction algorithm has been optimized from $3m^2$ single precision multiplications down to $2m^2 + m$ single precision multiplications.  As 
-it stands now the algorithm is already fairly fast compared to a full integer division algorithm.  However, there is still room for
-optimization.  
-
-After the first multiplication inside the quotient ($q_0 \cdot \mu$) the value is shifted right by $m + 1$ places effectively nullifying the lower
-half of the product.  It would be nice to be able to remove those digits from the product to effectively cut down the number of single precision 
-multiplications.  If the number of digits in the modulus $m$ is far less than $\beta$ a full product is not required for the algorithm to work properly.  
-In fact the lower $m - 2$ digits will not affect the upper half of the product at all and do not need to be computed.  
-
-The value of $\mu$ is a $m$-digit number and $q_0$ is a $m + 1$ digit number.  Using a full multiplier $(m + 1)(m) = m^2 + m$ single precision
-multiplications would be required.  Using a multiplier that will only produce digits at and above the $m - 1$'th digit reduces the number
-of single precision multiplications to ${m^2 + m} \over 2$ single precision multiplications.  
-
-\subsection{Trimming the Residue}
-After the quotient has been calculated it is used to reduce the input.  As previously noted the algorithm is not exact and it can be off by a small
-multiple of the modulus, that is $0 \le a - b \cdot \lfloor (q_0 \cdot \mu) / \beta^{m+1} \rfloor < 3b$.  If $b$ is $m$ digits then the 
-result of the reduction equation is a value of at most $m + 1$ digits (\textit{provided $3 < \beta$}) implying that the upper $m - 1$ digits are
-implicitly zero.  
-
-The next optimization arises from this very fact.  Instead of computing $b \cdot \lfloor (q_0 \cdot \mu) / \beta^{m+1} \rfloor$ using a full
-$O(m^2)$ multiplication algorithm only the lower $m+1$ digits of the product have to be computed.  Similarly the value of $a$ can
-be reduced modulo $\beta^{m+1}$ before the multiple of $b$ is subtracted which simplifies the subtraction as well.  A multiplication that produces 
-only the lower $m+1$ digits requires ${m^2 + 3m - 2} \over 2$ single precision multiplications.  
-
-With both optimizations in place the algorithm is the algorithm Barrett proposed.  It requires $m^2 + 2m - 1$ single precision multiplications which
-is considerably faster than the straightforward $3m^2$ method.  
-
-\subsection{The Barrett Algorithm}
-\newpage\begin{figure}[!here]
-\begin{small}
-\begin{center}
-\begin{tabular}{l}
-\hline Algorithm \textbf{mp\_reduce}. \\
-\textbf{Input}.   mp\_int $a$, mp\_int $b$ and $\mu = \lfloor \beta^{2m}/b \rfloor$ $(0 \le a < b^2, b > 1)$ \\
-\textbf{Output}.  $c \leftarrow a \mbox{ (mod }b\mbox{)}$ \\
-\hline \\
-Let $m$ represent the number of digits in $b$.  \\
-1.  Make a copy of $a$ and store it in $q$.  (\textit{mp\_init\_copy}) \\
-2.  $q \leftarrow \lfloor q / \beta^{m - 1} \rfloor$ (\textit{mp\_rshd}) \\
-\\
-Produce the quotient. \\
-3.  $q \leftarrow q \cdot \mu$  (\textit{note: only produce digits at or above $m-1$}) \\
-4.  $q \leftarrow \lfloor q / \beta^{m + 1} \rfloor$ \\
-\\
-Subtract the multiple of modulus from the input. \\
-5.  $c \leftarrow a \mbox{ (mod }\beta^{m+1}\mbox{)}$ (\textit{mp\_mod\_2d}) \\
-6.  $q \leftarrow q \cdot b \mbox{ (mod }\beta^{m+1}\mbox{)}$ (\textit{s\_mp\_mul\_digs}) \\
-7.  $c \leftarrow c - q$ (\textit{mp\_sub}) \\
-\\
-Add $\beta^{m+1}$ if a carry occurred. \\
-8.  If $c < 0$ then (\textit{mp\_cmp\_d}) \\
-\hspace{3mm}8.1  $q \leftarrow 1$ (\textit{mp\_set}) \\
-\hspace{3mm}8.2  $q \leftarrow q \cdot \beta^{m+1}$ (\textit{mp\_lshd}) \\
-\hspace{3mm}8.3  $c \leftarrow c + q$ \\
-\\
-Now subtract the modulus if the residue is too large (e.g. quotient too small). \\
-9.  While $c \ge b$ do (\textit{mp\_cmp}) \\
-\hspace{3mm}9.1  $c \leftarrow c - b$ \\
-10.  Clear $q$. \\
-11.  Return(\textit{MP\_OKAY}) \\
-\hline
-\end{tabular}
-\end{center}
-\end{small}
-\caption{Algorithm mp\_reduce}
-\end{figure}
-
-\textbf{Algorithm mp\_reduce.}
-This algorithm will reduce the input $a$ modulo $b$ in place using the Barrett algorithm.  It is loosely based on algorithm 14.42 of HAC
-\cite[pp.  602]{HAC} which is based on the paper from Paul Barrett \cite{BARRETT}.  The algorithm has several restrictions and assumptions which must be adhered to
-for the algorithm to work.
-
-First the modulus $b$ is assumed to be positive and greater than one.  If the modulus were less than or equal to one then subtracting
-a multiple of it would either accomplish nothing or actually enlarge the input.  The input $a$ must be in the range $0 \le a < b^2$ in order
-for the quotient to have enough precision.  Technically the algorithm will still work if $a \ge b^2$ but it will take much longer to finish.  The
-value of $\mu$ is passed as an argument to this algorithm and is assumed to be setup before the algorithm is used.  
-
-Recall that the multiplication for the quotient on step 3 must only produce digits at or above the $m-1$'th position.  An algorithm called 
-$s\_mp\_mul\_high\_digs$ which has not been presented is used to accomplish this task.  This optimal algorithm can only be used if the number
-of digits in $b$ is very much smaller than $\beta$.  
-
-After the multiple of the modulus has been subtracted from $a$ the residue must be fixed up in case it is negative.  While it is known that 
-$a \ge b \cdot \lfloor (q_0 \cdot \mu) / \beta^{m+1} \rfloor$ only the lower $m+1$ digits are being used to compute the residue.  In this case 
-the invariant $\beta^{m+1}$ must be added to the residue to make it positive again.  
-
-The while loop at step 9 will subtract $b$ until the residue is less than $b$.  If the algorithm is performed correctly this step is only
-performed up to two times.  However, if $a \ge b^2$ then it will iterate substantially more times than it should.
-
-EXAM,bn_mp_reduce.c
-
-The first multiplication that determines the quotient can be performed by only producing the digits from $m - 1$ and up.  This essentially halves
-the number of single precision multiplications required.  However, the optimization is only safe if $\beta$ is much larger than the number of digits
-in the modulus.  In the source code this is evaluated on lines @36,if@ to @44,}@ where algorithm s\_mp\_mul\_high\_digs is used when it is
-safe to do so.  
-
-\subsection{The Barrett Setup Algorithm}
-In order to use algorithm mp\_reduce the value of $\mu$ must be calculated in advance.  Ideally this value should be computed once and stored for
-future use so that the Barrett algorithm can be used without delay.  
-
-\begin{figure}[!here]
-\begin{small}
-\begin{center}
-\begin{tabular}{l}
-\hline Algorithm \textbf{mp\_reduce\_setup}. \\
-\textbf{Input}.   mp\_int $b$ ($b > 1$)  \\
-\textbf{Output}.  $\mu \leftarrow \lfloor \beta^{2m}/b \rfloor$ \\
-\hline \\
-1.  $\mu \leftarrow 2^{2 \cdot lg(\beta) \cdot  m}$ (\textit{mp\_2expt}) \\
-2.  $\mu \leftarrow \lfloor \mu / b \rfloor$ (\textit{mp\_div}) \\
-3.  Return(\textit{MP\_OKAY}) \\
-\hline
-\end{tabular}
-\end{center}
-\end{small}
-\caption{Algorithm mp\_reduce\_setup}
-\end{figure}
-
-\textbf{Algorithm mp\_reduce\_setup.}
-This algorithm computes the reciprocal $\mu$ required for Barrett reduction.  First $\beta^{2m}$ is calculated as $2^{2 \cdot lg(\beta) \cdot  m}$ which
-is equivalent and much faster.  The final value is computed by taking the integer quotient of $\lfloor \mu / b \rfloor$.
-
-EXAM,bn_mp_reduce_setup.c
-
-This simple routine calculates the reciprocal $\mu$ required by Barrett reduction.  Note the extended usage of algorithm mp\_div where the variable
-which would receive the remainder is passed as NULL.  As will be discussed in ~DIVISION~ the division routine allows both the quotient and the 
-remainder to be passed as NULL meaning to ignore the value.  
-
-\section{The Montgomery Reduction}
-Montgomery reduction\footnote{Thanks to Niels Ferguson for his insightful explanation of the algorithm.} \cite{MONT} is by far the most interesting 
-form of reduction in common use.  It computes a modular residue which is not actually equal to the residue of the input yet instead equal to a 
-residue times a constant.  However, as perplexing as this may sound the algorithm is relatively simple and very efficient.  
-
-Throughout this entire section the variable $n$ will represent the modulus used to form the residue.  As will be discussed shortly the value of
-$n$ must be odd.  The variable $x$ will represent the quantity of which the residue is sought.  Similar to the Barrett algorithm the input
-is restricted to $0 \le x < n^2$.  To begin the description some simple number theory facts must be established.
-
-\textbf{Fact 1.}  Adding $n$ to $x$ does not change the residue since in effect it adds one to the quotient $\lfloor x / n \rfloor$.  Another way
-to explain this is that $n$ (\textit{or multiples of $n$}) is congruent to zero modulo $n$.  Adding zero will not change the value of the residue.  
-
-\textbf{Fact 2.}  If $x$ is even then performing a division by two in $\Z$ is congruent to $x \cdot 2^{-1} \mbox{ (mod }n\mbox{)}$.  Actually
-this is an application of the fact that if $x$ is evenly divisible by any $k \in \Z$ then division in $\Z$ will be congruent to 
-multiplication by $k^{-1}$ modulo $n$.  
-
-From these two simple facts the following simple algorithm can be derived.
-
-\newpage\begin{figure}[!here]
-\begin{small}
-\begin{center}
-\begin{tabular}{l}
-\hline Algorithm \textbf{Montgomery Reduction}. \\
-\textbf{Input}.   Integer $x$, $n$ and $k$ \\
-\textbf{Output}.  $2^{-k}x \mbox{ (mod }n\mbox{)}$ \\
-\hline \\
-1.  for $t$ from $1$ to $k$ do \\
-\hspace{3mm}1.1  If $x$ is odd then \\
-\hspace{6mm}1.1.1  $x \leftarrow x + n$ \\
-\hspace{3mm}1.2  $x \leftarrow x/2$ \\
-2.  Return $x$. \\
-\hline
-\end{tabular}
-\end{center}
-\end{small}
-\caption{Algorithm Montgomery Reduction}
-\end{figure}
-
-The algorithm reduces the input one bit at a time using the two congruencies stated previously.  Inside the loop $n$, which is odd, is
-added to $x$ if $x$ is odd.  This forces $x$ to be even which allows the division by two in $\Z$ to be congruent to a modular division by two.  Since
-$x$ is assumed to be initially much larger than $n$ the addition of $n$ will contribute an insignificant magnitude to $x$.  Let $r$ represent the 
-final result of the Montgomery algorithm.  If $k > lg(n)$ and $0 \le x < n^2$ then the final result is limited to 
-$0 \le r < \lfloor x/2^k \rfloor + n$.  As a result at most a single subtraction is required to get the residue desired.
-
-\begin{figure}[here]
-\begin{small}
-\begin{center}
-\begin{tabular}{|c|l|}
-\hline \textbf{Step number ($t$)} & \textbf{Result ($x$)} \\
-\hline $1$ & $x + n = 5812$, $x/2 = 2906$ \\
-\hline $2$ & $x/2 = 1453$ \\
-\hline $3$ & $x + n = 1710$, $x/2 = 855$ \\
-\hline $4$ & $x + n = 1112$, $x/2 = 556$ \\
-\hline $5$ & $x/2 = 278$ \\
-\hline $6$ & $x/2 = 139$ \\
-\hline $7$ & $x + n = 396$, $x/2 = 198$ \\
-\hline $8$ & $x/2 = 99$ \\
-\hline
-\end{tabular}
-\end{center}
-\end{small}
-\caption{Example of Montgomery Reduction (I)}
-\label{fig:MONT1}
-\end{figure}
-
-Consider the example in figure~\ref{fig:MONT1} which reduces $x = 5555$ modulo $n = 257$ when $k = 8$.  The final result $r = 99$ which is actually
-$2^{-8} \cdot 5555 \mbox{ (mod }257\mbox{)}$ can reveal the residue $x \equiv 158$ by multiplying by $2^8$ modulo $n$.  
-
-Let $k = \lfloor lg(n) \rfloor + 1$ represent the number of bits in $n$.  The current algorithm requires $2k^2$ single precision shifts
-and $k^2$ single precision additions.  At this rate the algorithm is most certainly slower than Barrett reduction and not terribly useful.  
-Fortunately there exists an alternative representation of the algorithm.
-
-\begin{figure}[!here]
-\begin{small}
-\begin{center}
-\begin{tabular}{l}
-\hline Algorithm \textbf{Montgomery Reduction} (modified I). \\
-\textbf{Input}.   Integer $x$, $n$ and $k$ \\
-\textbf{Output}.  $2^{-k}x \mbox{ (mod }n\mbox{)}$ \\
-\hline \\
-1.  for $t$ from $0$ to $k - 1$ do \\
-\hspace{3mm}1.1  If the $t$'th bit of $x$ is one then \\
-\hspace{6mm}1.1.1  $x \leftarrow x + 2^tn$ \\
-2.  Return $x/2^k$. \\
-\hline
-\end{tabular}
-\end{center}
-\end{small}
-\caption{Algorithm Montgomery Reduction (modified I)}
-\end{figure}
-
-This algorithm is equivalent since $2^tn$ is a multiple of $n$ and the lower $k$ bits of $x$ are zero by step 2.  The number of single
-precision shifts has now been reduced from $2k^2$ to $k^2 + k$ which is only a small improvement.
-
-\begin{figure}[here]
-\begin{small}
-\begin{center}
-\begin{tabular}{|c|l|}
-\hline \textbf{Step number ($t$)} & \textbf{Result ($x$)} \\
-\hline $1$ & $x + 2^{0}n = 5812$ \\
-\hline $2$ & $5812$ \\
-\hline $3$ & $x + 2^{2}n = 6840$ \\
-\hline $4$ & $x + 2^{3}n = 8896$ \\
-\hline $5$ & $8896$ \\
-\hline $6$ & $8896$ \\
-\hline $7$ & $x + 2^{6}n = 25344$ \\
-\hline $8$ & $25344$ \\
-\hline -- & $x/2^k = 99$ \\
-\hline
-\end{tabular}
-\end{center}
-\end{small}
-\caption{Example of Montgomery Reduction (II)}
-\label{fig:MONT2}
-\end{figure}
-
-Figure~\ref{fig:MONT2} demonstrates the modified algorithm reducing $x = 5555$ modulo $n = 257$ with $k = 8$. 
-With this algorithm a single shift right at the end is the only right shift required to reduce the input instead of $k$ right shifts inside the 
-loop.  Note that in the iterations $t = 2, 5, 6$ and $8$ the result $x$ is not changed.  In those iterations the $t$'th bit of $x$ is 
-zero and the appropriate multiple of $n$ does not need to be added to force the $t$'th bit of the result to zero.  
-
-\subsection{Digit Based Montgomery Reduction}
-Instead of computing the reduction on a bit-by-bit basis it is actually much faster to compute it on digit-by-digit basis.  Consider the
-previous algorithm re-written to compute the Montgomery reduction in this new fashion.
-
-\begin{figure}[!here]
-\begin{small}
-\begin{center}
-\begin{tabular}{l}
-\hline Algorithm \textbf{Montgomery Reduction} (modified II). \\
-\textbf{Input}.   Integer $x$, $n$ and $k$ \\
-\textbf{Output}.  $\beta^{-k}x \mbox{ (mod }n\mbox{)}$ \\
-\hline \\
-1.  for $t$ from $0$ to $k - 1$ do \\
-\hspace{3mm}1.1  $x \leftarrow x + \mu n \beta^t$ \\
-2.  Return $x/\beta^k$. \\
-\hline
-\end{tabular}
-\end{center}
-\end{small}
-\caption{Algorithm Montgomery Reduction (modified II)}
-\end{figure}
-
-The value $\mu n \beta^t$ is a multiple of the modulus $n$ meaning that it will not change the residue.  If the first digit of 
-the value $\mu n \beta^t$ equals the negative (modulo $\beta$) of the $t$'th digit of $x$ then the addition will result in a zero digit.  This
-problem breaks down to solving the following congruency.  
-
-\begin{center}
-\begin{tabular}{rcl}
-$x_t + \mu n_0$ & $\equiv$ & $0 \mbox{ (mod }\beta\mbox{)}$ \\
-$\mu n_0$ & $\equiv$ & $-x_t \mbox{ (mod }\beta\mbox{)}$ \\
-$\mu$ & $\equiv$ & $-x_t/n_0 \mbox{ (mod }\beta\mbox{)}$ \\
-\end{tabular}
-\end{center}
-
-In each iteration of the loop on step 1 a new value of $\mu$ must be calculated.  The value of $-1/n_0 \mbox{ (mod }\beta\mbox{)}$ is used 
-extensively in this algorithm and should be precomputed.  Let $\rho$ represent the negative of the modular inverse of $n_0$ modulo $\beta$.  
-
-For example, let $\beta = 10$ represent the radix.  Let $n = 17$ represent the modulus which implies $k = 2$ and $\rho \equiv 7$.  Let $x = 33$ 
-represent the value to reduce.
-
-\newpage\begin{figure}
-\begin{center}
-\begin{tabular}{|c|c|c|}
-\hline \textbf{Step ($t$)} & \textbf{Value of $x$} & \textbf{Value of $\mu$} \\
-\hline --                 & $33$ & --\\
-\hline $0$                 & $33 + \mu n = 50$ & $1$ \\
-\hline $1$                 & $50 + \mu n \beta = 900$ & $5$ \\
-\hline
-\end{tabular}
-\end{center}
-\caption{Example of Montgomery Reduction}
-\end{figure}
-
-The value $900$ is then divided by $\beta^k$ to produce the final result $9$.  The first observation is that $9 \not\equiv x \mbox{ (mod }n\mbox{)}$ 
-which implies the result is not the modular residue of $x$ modulo $n$.  However, recall that the residue is actually multiplied by $\beta^{-k}$ in
-the algorithm.  To get the true residue the value must be multiplied by $\beta^k$.  In this case $\beta^k \equiv 15 \mbox{ (mod }n\mbox{)}$ and
-the correct residue is $9 \cdot 15 \equiv 16 \mbox{ (mod }n\mbox{)}$.  
-
-\subsection{Baseline Montgomery Reduction}
-The baseline Montgomery reduction algorithm will produce the residue for any size input.  It is designed to be a catch-all algorithm for 
-Montgomery reductions.  
-
-\newpage\begin{figure}[!here]
-\begin{small}
-\begin{center}
-\begin{tabular}{l}
-\hline Algorithm \textbf{mp\_montgomery\_reduce}. \\
-\textbf{Input}.   mp\_int $x$, mp\_int $n$ and a digit $\rho \equiv -1/n_0 \mbox{ (mod }\beta\mbox{)}$. \\
-\hspace{11.5mm}($0 \le x < n^2, n > 1, (n, \beta) = 1, \beta^k > n$) \\
-\textbf{Output}.  $\beta^{-k}x \mbox{ (mod }n\mbox{)}$ \\
-\hline \\
-1.  $digs \leftarrow 2n.used + 1$ \\
-2.  If $digs < MP\_WARRAY$ and $n.used < \delta$ then \\
-\hspace{3mm}2.1  Use algorithm fast\_mp\_montgomery\_reduce instead. \\
-\\
-Setup $x$ for the reduction. \\
-3.  If $x.alloc < digs$ then grow $x$ to $digs$ digits. \\
-4.  $x.used \leftarrow digs$ \\
-\\
-Eliminate the lower $k$ digits. \\
-5.  For $ix$ from $0$ to $k - 1$ do \\
-\hspace{3mm}5.1  $\mu \leftarrow x_{ix} \cdot \rho \mbox{ (mod }\beta\mbox{)}$ \\
-\hspace{3mm}5.2  $u \leftarrow 0$ \\
-\hspace{3mm}5.3  For $iy$ from $0$ to $k - 1$ do \\
-\hspace{6mm}5.3.1  $\hat r \leftarrow \mu n_{iy} + x_{ix + iy} + u$ \\
-\hspace{6mm}5.3.2  $x_{ix + iy} \leftarrow \hat r \mbox{ (mod }\beta\mbox{)}$ \\
-\hspace{6mm}5.3.3  $u \leftarrow \lfloor \hat r / \beta \rfloor$ \\
-\hspace{3mm}5.4  While $u > 0$ do \\
-\hspace{6mm}5.4.1  $iy \leftarrow iy + 1$ \\
-\hspace{6mm}5.4.2  $x_{ix + iy} \leftarrow x_{ix + iy} + u$ \\
-\hspace{6mm}5.4.3  $u \leftarrow \lfloor x_{ix+iy} / \beta \rfloor$ \\
-\hspace{6mm}5.4.4  $x_{ix + iy} \leftarrow x_{ix+iy} \mbox{ (mod }\beta\mbox{)}$ \\
-\\
-Divide by $\beta^k$ and fix up as required. \\
-6.  $x \leftarrow \lfloor x / \beta^k \rfloor$ \\
-7.  If $x \ge n$ then \\
-\hspace{3mm}7.1  $x \leftarrow x - n$ \\
-8.  Return(\textit{MP\_OKAY}). \\
-\hline
-\end{tabular}
-\end{center}
-\end{small}
-\caption{Algorithm mp\_montgomery\_reduce}
-\end{figure}
-
-\textbf{Algorithm mp\_montgomery\_reduce.}
-This algorithm reduces the input $x$ modulo $n$ in place using the Montgomery reduction algorithm.  The algorithm is loosely based
-on algorithm 14.32 of \cite[pp.601]{HAC} except it merges the multiplication of $\mu n \beta^t$ with the addition in the inner loop.  The
-restrictions on this algorithm are fairly easy to adapt to.  First $0 \le x < n^2$ bounds the input to numbers in the same range as 
-for the Barrett algorithm.  Additionally $n > 1$ and $(n, \beta) = 1$ will ensure a modular inverse $\rho$ exists.  $\rho$ must be calculated in
-advance of this algorithm.  Finally the variable $k$ is fixed and a pseudonym for $n.used$.  
-
-Step 2 decides whether a faster Montgomery algorithm can be used.  It is based on the Comba technique meaning that there are limits on
-the size of the input.  This algorithm is discussed in ~COMBARED~.
-
-Step 5 is the main reduction loop of the algorithm.  The value of $\mu$ is calculated once per iteration in the outer loop.  The inner loop
-calculates $x + \mu n \beta^{ix}$ by multiplying $\mu n$ and adding the result to $x$ shifted by $ix$ digits.  Both the addition and
-multiplication are performed in the same loop to save time and memory.  Step 5.4 will handle any additional carries that escape the inner loop.
-
-Using a quick inspection this algorithm requires $n$ single precision multiplications for the outer loop and $n^2$ single precision multiplications 
-in the inner loop.  In total $n^2 + n$ single precision multiplications which compares favourably to Barrett at $n^2 + 2n - 1$ single precision
-multiplications.  
-
-EXAM,bn_mp_montgomery_reduce.c
-
-This is the baseline implementation of the Montgomery reduction algorithm.  Lines @30,digs@ to @35,}@ determine if the Comba based
-routine can be used instead.  Line @47,mu@ computes the value of $\mu$ for that particular iteration of the outer loop.  
-
-The multiplication $\mu n \beta^{ix}$ is performed in one step in the inner loop.  The alias $tmpx$ refers to the $ix$'th digit of $x$ and
-the alias $tmpn$ refers to the modulus $n$.  
-
-\subsection{Faster ``Comba'' Montgomery Reduction}
-MARK,COMBARED
-
-The Montgomery reduction requires fewer single precision multiplications than a Barrett reduction, however it is much slower due to the serial
-nature of the inner loop.  The Barrett reduction algorithm requires two slightly modified multipliers which can be implemented with the Comba
-technique.  The Montgomery reduction algorithm cannot directly use the Comba technique to any significant advantage since the inner loop calculates
-a $k \times 1$ product $k$ times. 
-
-The biggest obstacle is that at the $ix$'th iteration of the outer loop the value of $x_{ix}$ is required to calculate $\mu$.  This means the 
-carries from $0$ to $ix - 1$ must have been propagated upwards to form a valid $ix$'th digit.  The solution as it turns out is very simple.  
-Perform a Comba like multiplier and inside the outer loop just after the inner loop fix up the $ix + 1$'th digit by forwarding the carry.  
-
-With this change in place the Montgomery reduction algorithm can be performed with a Comba style multiplication loop which substantially increases
-the speed of the algorithm.  
-
-\newpage\begin{figure}[!here]
-\begin{small}
-\begin{center}
-\begin{tabular}{l}
-\hline Algorithm \textbf{fast\_mp\_montgomery\_reduce}. \\
-\textbf{Input}.   mp\_int $x$, mp\_int $n$ and a digit $\rho \equiv -1/n_0 \mbox{ (mod }\beta\mbox{)}$. \\
-\hspace{11.5mm}($0 \le x < n^2, n > 1, (n, \beta) = 1, \beta^k > n$) \\
-\textbf{Output}.  $\beta^{-k}x \mbox{ (mod }n\mbox{)}$ \\
-\hline \\
-Place an array of \textbf{MP\_WARRAY} mp\_word variables called $\hat W$ on the stack. \\
-1.  if $x.alloc < n.used + 1$ then grow $x$ to $n.used + 1$ digits. \\
-Copy the digits of $x$ into the array $\hat W$ \\
-2.  For $ix$ from $0$ to $x.used - 1$ do \\
-\hspace{3mm}2.1  $\hat W_{ix} \leftarrow x_{ix}$ \\
-3.  For $ix$ from $x.used$ to $2n.used - 1$ do \\
-\hspace{3mm}3.1  $\hat W_{ix} \leftarrow 0$ \\
-Eliminate the lower $k$ digits. \\
-4.  for $ix$ from $0$ to $n.used - 1$ do \\
-\hspace{3mm}4.1  $\mu \leftarrow \hat W_{ix} \cdot \rho \mbox{ (mod }\beta\mbox{)}$ \\
-\hspace{3mm}4.2  For $iy$ from $0$ to $n.used - 1$ do \\
-\hspace{6mm}4.2.1  $\hat W_{iy + ix} \leftarrow \hat W_{iy + ix} + \mu \cdot n_{iy}$ \\
-\hspace{3mm}4.3  $\hat W_{ix + 1} \leftarrow \hat W_{ix + 1} + \lfloor \hat W_{ix} / \beta \rfloor$ \\
-Propagate carries upwards. \\
-5.  for $ix$ from $n.used$ to $2n.used + 1$ do \\
-\hspace{3mm}5.1  $\hat W_{ix + 1} \leftarrow \hat W_{ix + 1} + \lfloor \hat W_{ix} / \beta \rfloor$ \\
-Shift right and reduce modulo $\beta$ simultaneously. \\
-6.  for $ix$ from $0$ to $n.used + 1$ do \\
-\hspace{3mm}6.1  $x_{ix} \leftarrow \hat W_{ix + n.used} \mbox{ (mod }\beta\mbox{)}$ \\
-Zero excess digits and fixup $x$. \\
-7.  if $x.used > n.used + 1$ then do \\
-\hspace{3mm}7.1  for $ix$ from $n.used + 1$ to $x.used - 1$ do \\
-\hspace{6mm}7.1.1  $x_{ix} \leftarrow 0$ \\
-8.  $x.used \leftarrow n.used + 1$ \\
-9.  Clamp excessive digits of $x$. \\
-10.  If $x \ge n$ then \\
-\hspace{3mm}10.1  $x \leftarrow x - n$ \\
-11.  Return(\textit{MP\_OKAY}). \\
-\hline
-\end{tabular}
-\end{center}
-\end{small}
-\caption{Algorithm fast\_mp\_montgomery\_reduce}
-\end{figure}
-
-\textbf{Algorithm fast\_mp\_montgomery\_reduce.}
-This algorithm will compute the Montgomery reduction of $x$ modulo $n$ using the Comba technique.  It is on most computer platforms significantly
-faster than algorithm mp\_montgomery\_reduce and algorithm mp\_reduce (\textit{Barrett reduction}).  The algorithm has the same restrictions
-on the input as the baseline reduction algorithm.  An additional two restrictions are imposed on this algorithm.  The number of digits $k$ in the 
-modulus $n$ must not violate $MP\_WARRAY > 2k +1$ and $n < \delta$.   When $\beta = 2^{28}$ this algorithm can be used to reduce modulo
-a modulus of at most $3,556$ bits in length.  
-
-As in the other Comba reduction algorithms there is a $\hat W$ array which stores the columns of the product.  It is initially filled with the
-contents of $x$ with the excess digits zeroed.  The reduction loop is very similar to the baseline loop at heart.  The multiplication on step
-4.1 can be single precision only since $ab \mbox{ (mod }\beta\mbox{)} \equiv (a \mbox{ mod }\beta)(b \mbox{ mod }\beta)$.  Some multipliers such
-as those on the ARM processors take a variable length time to complete depending on the number of bytes of result it must produce.  By performing
-a single precision multiplication instead half the amount of time is spent.
-
-Also note that digit $\hat W_{ix}$ must have the carry from the $ix - 1$'th digit propagated upwards in order for this to work.  That is what step
-4.3 will do.  In effect over the $n.used$ iterations of the outer loop the $n.used$'th lower columns all have their carries propagated forwards.  Note
-how the upper bits of those same words are not reduced modulo $\beta$.  This is because those values will be discarded shortly and there is no
-point.
-
-Step 5 will propagate the remainder of the carries upwards.  On step 6 the columns are reduced modulo $\beta$ and shifted simultaneously as they are
-stored in the destination $x$.  
-
-EXAM,bn_fast_mp_montgomery_reduce.c
-
-The $\hat W$ array is first filled with digits of $x$ on line @49,for@ then the rest of the digits are zeroed on line @54,for@.  Both loops share
-the same alias variables to make the code easier to read.  
-
-The value of $\mu$ is calculated in an interesting fashion.  First the value $\hat W_{ix}$ is reduced modulo $\beta$ and cast to a mp\_digit.  This
-forces the compiler to use a single precision multiplication and prevents any concerns about loss of precision.   Line @101,>>@ fixes the carry 
-for the next iteration of the loop by propagating the carry from $\hat W_{ix}$ to $\hat W_{ix+1}$.
-
-The for loop on line @113,for@ propagates the rest of the carries upwards through the columns.  The for loop on line @126,for@ reduces the columns
-modulo $\beta$ and shifts them $k$ places at the same time.  The alias $\_ \hat W$ actually refers to the array $\hat W$ starting at the $n.used$'th
-digit, that is $\_ \hat W_{t} = \hat W_{n.used + t}$.  
-
-\subsection{Montgomery Setup}
-To calculate the variable $\rho$ a relatively simple algorithm will be required.  
-
-\begin{figure}[!here]
-\begin{small}
-\begin{center}
-\begin{tabular}{l}
-\hline Algorithm \textbf{mp\_montgomery\_setup}. \\
-\textbf{Input}.   mp\_int $n$ ($n > 1$ and $(n, 2) = 1$) \\
-\textbf{Output}.  $\rho \equiv -1/n_0 \mbox{ (mod }\beta\mbox{)}$ \\
-\hline \\
-1.  $b \leftarrow n_0$ \\
-2.  If $b$ is even return(\textit{MP\_VAL}) \\
-3.  $x \leftarrow (((b + 2) \mbox{ AND } 4) << 1) + b$ \\
-4.  for $k$ from 0 to $3$ do \\
-\hspace{3mm}4.1  $x \leftarrow x \cdot (2 - bx)$ \\
-5.  $\rho \leftarrow \beta - x \mbox{ (mod }\beta\mbox{)}$ \\
-6.  Return(\textit{MP\_OKAY}). \\
-\hline
-\end{tabular}
-\end{center}
-\end{small}
-\caption{Algorithm mp\_montgomery\_setup} 
-\end{figure}
-
-\textbf{Algorithm mp\_montgomery\_setup.}
-This algorithm will calculate the value of $\rho$ required within the Montgomery reduction algorithms.  It uses a very interesting trick 
-to calculate $1/n_0$ when $\beta$ is a power of two.  
-
-EXAM,bn_mp_montgomery_setup.c
-
-This source code computes the value of $\rho$ required to perform Montgomery reduction.  It has been modified to avoid performing excess
-multiplications when $\beta$ is not the default 28-bits.  
-
-\section{The Diminished Radix Algorithm}
-The Diminished Radix method of modular reduction \cite{DRMET} is a fairly clever technique which can be more efficient than either the Barrett
-or Montgomery methods for certain forms of moduli.  The technique is based on the following simple congruence.
-
-\begin{equation}
-(x \mbox{ mod } n) + k \lfloor x / n \rfloor \equiv x \mbox{ (mod }(n - k)\mbox{)}
-\end{equation}
-
-This observation was used in the MMB \cite{MMB} block cipher to create a diffusion primitive.  It used the fact that if $n = 2^{31}$ and $k=1$ that 
-then a x86 multiplier could produce the 62-bit product and use  the ``shrd'' instruction to perform a double-precision right shift.  The proof
-of the above equation is very simple.  First write $x$ in the product form.
-
-\begin{equation}
-x = qn + r
-\end{equation}
-
-Now reduce both sides modulo $(n - k)$.
-
-\begin{equation}
-x \equiv qk + r  \mbox{ (mod }(n-k)\mbox{)}
-\end{equation}
-
-The variable $n$ reduces modulo $n - k$ to $k$.  By putting $q = \lfloor x/n \rfloor$ and $r = x \mbox{ mod } n$ 
-into the equation the original congruence is reproduced, thus concluding the proof.  The following algorithm is based on this observation.
-
-\begin{figure}[!here]
-\begin{small}
-\begin{center}
-\begin{tabular}{l}
-\hline Algorithm \textbf{Diminished Radix Reduction}. \\
-\textbf{Input}.   Integer $x$, $n$, $k$ \\
-\textbf{Output}.  $x \mbox{ mod } (n - k)$ \\
-\hline \\
-1.  $q \leftarrow \lfloor x / n \rfloor$ \\
-2.  $q \leftarrow k \cdot q$ \\
-3.  $x \leftarrow x \mbox{ (mod }n\mbox{)}$ \\
-4.  $x \leftarrow x + q$ \\
-5.  If $x \ge (n - k)$ then \\
-\hspace{3mm}5.1  $x \leftarrow x - (n - k)$ \\
-\hspace{3mm}5.2  Goto step 1. \\
-6.  Return $x$ \\
-\hline
-\end{tabular}
-\end{center}
-\end{small}
-\caption{Algorithm Diminished Radix Reduction}
-\label{fig:DR}
-\end{figure}
-
-This algorithm will reduce $x$ modulo $n - k$ and return the residue.  If $0 \le x < (n - k)^2$ then the algorithm will loop almost always
-once or twice and occasionally three times.  For simplicity sake the value of $x$ is bounded by the following simple polynomial.
-
-\begin{equation} 
-0 \le x < n^2 + k^2 - 2nk
-\end{equation}
-
-The true bound is  $0 \le x < (n - k - 1)^2$ but this has quite a few more terms.  The value of $q$ after step 1 is bounded by the following.
-
-\begin{equation}
-q < n - 2k - k^2/n
-\end{equation}
-
-Since $k^2$ is going to be considerably smaller than $n$ that term will always be zero.  The value of $x$ after step 3 is bounded trivially as
-$0 \le x < n$.  By step four the sum $x + q$ is bounded by 
-
-\begin{equation}
-0 \le q + x < (k + 1)n - 2k^2 - 1
-\end{equation}
-
-With a second pass $q$ will be loosely bounded by $0 \le q < k^2$ after step 2 while $x$ will still be loosely bounded by $0 \le x < n$ after step 3.  After the second pass it is highly unlikely that the
-sum in step 4 will exceed $n - k$.  In practice fewer than three passes of the algorithm are required to reduce virtually every input in the 
-range $0 \le x < (n - k - 1)^2$.  
-
-\begin{figure}
-\begin{small}
-\begin{center}
-\begin{tabular}{|l|}
-\hline
-$x = 123456789, n = 256, k = 3$ \\
-\hline $q \leftarrow \lfloor x/n \rfloor = 482253$ \\
-$q \leftarrow q*k = 1446759$ \\
-$x \leftarrow x \mbox{ mod } n = 21$ \\
-$x \leftarrow x + q = 1446780$ \\
-$x \leftarrow x - (n - k) = 1446527$ \\
-\hline 
-$q \leftarrow \lfloor x/n \rfloor = 5650$ \\
-$q \leftarrow q*k = 16950$ \\
-$x \leftarrow x \mbox{ mod } n = 127$ \\
-$x \leftarrow x + q = 17077$ \\
-$x \leftarrow x - (n - k) = 16824$ \\
-\hline 
-$q \leftarrow \lfloor x/n \rfloor = 65$ \\
-$q \leftarrow q*k = 195$ \\
-$x \leftarrow x \mbox{ mod } n = 184$ \\
-$x \leftarrow x + q = 379$ \\
-$x \leftarrow x - (n - k) = 126$ \\
-\hline
-\end{tabular}
-\end{center}
-\end{small}
-\caption{Example Diminished Radix Reduction}
-\label{fig:EXDR}
-\end{figure}
-
-Figure~\ref{fig:EXDR} demonstrates the reduction of $x = 123456789$ modulo $n - k = 253$ when $n = 256$ and $k = 3$.  Note that even while $x$
-is considerably larger than $(n - k - 1)^2 = 63504$ the algorithm still converges on the modular residue exceedingly fast.  In this case only
-three passes were required to find the residue $x \equiv 126$.
-
-
-\subsection{Choice of Moduli}
-On the surface this algorithm looks like a very expensive algorithm.  It requires a couple of subtractions followed by multiplication and other
-modular reductions.  The usefulness of this algorithm becomes exceedingly clear when an appropriate modulus is chosen.
-
-Division in general is a very expensive operation to perform.  The one exception is when the division is by a power of the radix of representation used.  
-Division by ten for example is simple for pencil and paper mathematics since it amounts to shifting the decimal place to the right.  Similarly division 
-by two (\textit{or powers of two}) is very simple for binary computers to perform.  It would therefore seem logical to choose $n$ of the form $2^p$ 
-which would imply that $\lfloor x / n \rfloor$ is a simple shift of $x$ right $p$ bits.  
-
-However, there is one operation related to division of power of twos that is even faster than this.  If $n = \beta^p$ then the division may be 
-performed by moving whole digits to the right $p$ places.  In practice division by $\beta^p$ is much faster than division by $2^p$ for any $p$.  
-Also with the choice of $n = \beta^p$ reducing $x$ modulo $n$ requires zeroing the digits above the $p-1$'th digit of $x$.  
-
-Throughout the next section the term ``restricted modulus'' will refer to a modulus of the form $\beta^p - k$ where as the term ``unrestricted
-modulus'' will refer to a modulus of the form $2^p - k$.  The word ``restricted'' in this case refers to the fact that it is based on the 
-$2^p$ logic except $p$ must be a multiple of $lg(\beta)$.  
-
-\subsection{Choice of $k$}
-Now that division and reduction (\textit{step 1 and 3 of figure~\ref{fig:DR}}) have been optimized to simple digit operations the multiplication by $k$
-in step 2 is the most expensive operation.  Fortunately the choice of $k$ is not terribly limited.  For all intents and purposes it might
-as well be a single digit.  The smaller the value of $k$ is the faster the algorithm will be.  
-
-\subsection{Restricted Diminished Radix Reduction}
-The restricted Diminished Radix algorithm can quickly reduce an input modulo a modulus of the form $n = \beta^p - k$.  This algorithm can reduce 
-an input $x$ within the range $0 \le x < n^2$ using only a couple passes of the algorithm demonstrated in figure~\ref{fig:DR}.  The implementation
-of this algorithm has been optimized to avoid additional overhead associated with a division by $\beta^p$, the multiplication by $k$ or the addition 
-of $x$ and $q$.  The resulting algorithm is very efficient and can lead to substantial improvements over Barrett and Montgomery reduction when modular 
-exponentiations are performed.
-
-\newpage\begin{figure}[!here]
-\begin{small}
-\begin{center}
-\begin{tabular}{l}
-\hline Algorithm \textbf{mp\_dr\_reduce}. \\
-\textbf{Input}.   mp\_int $x$, $n$ and a mp\_digit $k = \beta - n_0$ \\
-\hspace{11.5mm}($0 \le x < n^2$, $n > 1$, $0 < k < \beta$) \\
-\textbf{Output}.  $x \mbox{ mod } n$ \\
-\hline \\
-1.  $m \leftarrow n.used$ \\
-2.  If $x.alloc < 2m$ then grow $x$ to $2m$ digits. \\
-3.  $\mu \leftarrow 0$ \\
-4.  for $i$ from $0$ to $m - 1$ do \\
-\hspace{3mm}4.1  $\hat r \leftarrow k \cdot x_{m+i} + x_{i} + \mu$ \\
-\hspace{3mm}4.2  $x_{i} \leftarrow \hat r \mbox{ (mod }\beta\mbox{)}$ \\
-\hspace{3mm}4.3  $\mu \leftarrow \lfloor \hat r / \beta \rfloor$ \\
-5.  $x_{m} \leftarrow \mu$ \\
-6.  for $i$ from $m + 1$ to $x.used - 1$ do \\
-\hspace{3mm}6.1  $x_{i} \leftarrow 0$ \\
-7.  Clamp excess digits of $x$. \\
-8.  If $x \ge n$ then \\
-\hspace{3mm}8.1  $x \leftarrow x - n$ \\
-\hspace{3mm}8.2  Goto step 3. \\
-9.  Return(\textit{MP\_OKAY}). \\
-\hline
-\end{tabular}
-\end{center}
-\end{small}
-\caption{Algorithm mp\_dr\_reduce}
-\end{figure}
-
-\textbf{Algorithm mp\_dr\_reduce.}
-This algorithm will perform the Diminished Radix reduction of $x$ modulo $n$.  It has similar restrictions to that of the Barrett reduction
-with the addition that $n$ must be of the form $n = \beta^m - k$ where $0 < k <\beta$.  
-
-This algorithm essentially implements the pseudo-code in figure~\ref{fig:DR} except with a slight optimization.  The division by $\beta^m$, multiplication by $k$
-and addition of $x \mbox{ mod }\beta^m$ are all performed simultaneously inside the loop on step 4.  The division by $\beta^m$ is emulated by accessing
-the term at the $m+i$'th position which is subsequently multiplied by $k$ and added to the term at the $i$'th position.  After the loop the $m$'th
-digit is set to the carry and the upper digits are zeroed.  Steps 5 and 6 emulate the reduction modulo $\beta^m$ that should have happened to 
-$x$ before the addition of the multiple of the upper half.  
-
-At step 8 if $x$ is still larger than $n$ another pass of the algorithm is required.  First $n$ is subtracted from $x$ and then the algorithm resumes
-at step 3.  
-
-EXAM,bn_mp_dr_reduce.c
-
-The first step is to grow $x$ as required to $2m$ digits since the reduction is performed in place on $x$.  The label on line @49,top:@ is where
-the algorithm will resume if further reduction passes are required.  In theory it could be placed at the top of the function however, the size of
-the modulus and question of whether $x$ is large enough are invariant after the first pass meaning that it would be a waste of time.  
-
-The aliases $tmpx1$ and $tmpx2$ refer to the digits of $x$ where the latter is offset by $m$ digits.  By reading digits from $x$ offset by $m$ digits
-a division by $\beta^m$ can be simulated virtually for free.  The loop on line @61,for@ performs the bulk of the work (\textit{corresponds to step 4 of algorithm 7.11})
-in this algorithm.
-
-By line @68,mu@ the pointer $tmpx1$ points to the $m$'th digit of $x$ which is where the final carry will be placed.  Similarly by line @71,for@ the 
-same pointer will point to the $m+1$'th digit where the zeroes will be placed.  
-
-Since the algorithm is only valid if both $x$ and $n$ are greater than zero an unsigned comparison suffices to determine if another pass is required.  
-With the same logic at line @82,sub@ the value of $x$ is known to be greater than or equal to $n$ meaning that an unsigned subtraction can be used
-as well.  Since the destination of the subtraction is the larger of the inputs the call to algorithm s\_mp\_sub cannot fail and the return code
-does not need to be checked.
-
-\subsubsection{Setup}
-To setup the restricted Diminished Radix algorithm the value $k = \beta - n_0$ is required.  This algorithm is not really complicated but provided for
-completeness.
-
-\begin{figure}[!here]
-\begin{small}
-\begin{center}
-\begin{tabular}{l}
-\hline Algorithm \textbf{mp\_dr\_setup}. \\
-\textbf{Input}.   mp\_int $n$ \\
-\textbf{Output}.  $k = \beta - n_0$ \\
-\hline \\
-1.  $k \leftarrow \beta - n_0$ \\
-\hline
-\end{tabular}
-\end{center}
-\end{small}
-\caption{Algorithm mp\_dr\_setup}
-\end{figure}
-
-EXAM,bn_mp_dr_setup.c
-
-\subsubsection{Modulus Detection}
-Another algorithm which will be useful is the ability to detect a restricted Diminished Radix modulus.  An integer is said to be
-of restricted Diminished Radix form if all of the digits are equal to $\beta - 1$ except the trailing digit which may be any value.
-
-\begin{figure}[!here]
-\begin{small}
-\begin{center}
-\begin{tabular}{l}
-\hline Algorithm \textbf{mp\_dr\_is\_modulus}. \\
-\textbf{Input}.   mp\_int $n$ \\
-\textbf{Output}.  $1$ if $n$ is in D.R form, $0$ otherwise \\
-\hline
-1.  If $n.used < 2$ then return($0$). \\
-2.  for $ix$ from $1$ to $n.used - 1$ do \\
-\hspace{3mm}2.1  If $n_{ix} \ne \beta - 1$ return($0$). \\
-3.  Return($1$). \\
-\hline
-\end{tabular}
-\end{center}
-\end{small}
-\caption{Algorithm mp\_dr\_is\_modulus}
-\end{figure}
-
-\textbf{Algorithm mp\_dr\_is\_modulus.}
-This algorithm determines if a value is in Diminished Radix form.  Step 1 rejects obvious cases where fewer than two digits are
-in the mp\_int.  Step 2 tests all but the first digit to see if they are equal to $\beta - 1$.  If the algorithm manages to get to
-step 3 then $n$ must be of Diminished Radix form.
-
-EXAM,bn_mp_dr_is_modulus.c
-
-\subsection{Unrestricted Diminished Radix Reduction}
-The unrestricted Diminished Radix algorithm allows modular reductions to be performed when the modulus is of the form $2^p - k$.  This algorithm
-is a straightforward adaptation of algorithm~\ref{fig:DR}.
-
-In general the restricted Diminished Radix reduction algorithm is much faster since it has considerably lower overhead.  However, this new
-algorithm is much faster than either Montgomery or Barrett reduction when the moduli are of the appropriate form.
-
-\begin{figure}[!here]
-\begin{small}
-\begin{center}
-\begin{tabular}{l}
-\hline Algorithm \textbf{mp\_reduce\_2k}. \\
-\textbf{Input}.   mp\_int $a$ and $n$.  mp\_digit $k$  \\
-\hspace{11.5mm}($a \ge 0$, $n > 1$, $0 < k < \beta$, $n + k$ is a power of two) \\
-\textbf{Output}.  $a \mbox{ (mod }n\mbox{)}$ \\
-\hline
-1.  $p \leftarrow \lceil lg(n) \rceil$  (\textit{mp\_count\_bits}) \\
-2.  While $a \ge n$ do \\
-\hspace{3mm}2.1  $q \leftarrow \lfloor a / 2^p \rfloor$ (\textit{mp\_div\_2d}) \\
-\hspace{3mm}2.2  $a \leftarrow a \mbox{ (mod }2^p\mbox{)}$ (\textit{mp\_mod\_2d}) \\
-\hspace{3mm}2.3  $q \leftarrow q \cdot k$ (\textit{mp\_mul\_d}) \\
-\hspace{3mm}2.4  $a \leftarrow a + q$ (\textit{s\_mp\_add}) \\
-\hspace{3mm}2.5  If $a \ge n$ then do \\
-\hspace{6mm}2.5.1  $a \leftarrow a - n$ \\
-3.  Return(\textit{MP\_OKAY}). \\
-\hline
-\end{tabular}
-\end{center}
-\end{small}
-\caption{Algorithm mp\_reduce\_2k}
-\end{figure}
-
-\textbf{Algorithm mp\_reduce\_2k.}
-This algorithm quickly reduces an input $a$ modulo an unrestricted Diminished Radix modulus $n$.  Division by $2^p$ is emulated with a right
-shift which makes the algorithm fairly inexpensive to use.  
-
-EXAM,bn_mp_reduce_2k.c
-
-The algorithm mp\_count\_bits calculates the number of bits in an mp\_int which is used to find the initial value of $p$.  The call to mp\_div\_2d
-on line @31,mp_div_2d@ calculates both the quotient $q$ and the remainder $a$ required.  By doing both in a single function call the code size
-is kept fairly small.  The multiplication by $k$ is only performed if $k > 1$. This allows reductions modulo $2^p - 1$ to be performed without
-any multiplications.  
-
-The unsigned s\_mp\_add, mp\_cmp\_mag and s\_mp\_sub are used in place of their full sign counterparts since the inputs are only valid if they are 
-positive.  By using the unsigned versions the overhead is kept to a minimum.  
-
-\subsubsection{Unrestricted Setup}
-To setup this reduction algorithm the value of $k = 2^p - n$ is required.  
-
-\begin{figure}[!here]
-\begin{small}
-\begin{center}
-\begin{tabular}{l}
-\hline Algorithm \textbf{mp\_reduce\_2k\_setup}. \\
-\textbf{Input}.   mp\_int $n$   \\
-\textbf{Output}.  $k = 2^p - n$ \\
-\hline
-1.  $p \leftarrow \lceil lg(n) \rceil$  (\textit{mp\_count\_bits}) \\
-2.  $x \leftarrow 2^p$ (\textit{mp\_2expt}) \\
-3.  $x \leftarrow x - n$ (\textit{mp\_sub}) \\
-4.  $k \leftarrow x_0$ \\
-5.  Return(\textit{MP\_OKAY}). \\
-\hline
-\end{tabular}
-\end{center}
-\end{small}
-\caption{Algorithm mp\_reduce\_2k\_setup}
-\end{figure}
-
-\textbf{Algorithm mp\_reduce\_2k\_setup.}
-This algorithm computes the value of $k$ required for the algorithm mp\_reduce\_2k.  By making a temporary variable $x$ equal to $2^p$ a subtraction
-is sufficient to solve for $k$.  Alternatively if $n$ has more than one digit the value of $k$ is simply $\beta - n_0$.  
-
-EXAM,bn_mp_reduce_2k_setup.c
-
-\subsubsection{Unrestricted Detection}
-An integer $n$ is a valid unrestricted Diminished Radix modulus if either of the following are true.
-
-\begin{enumerate}
-\item  The number has only one digit.
-\item  The number has more than one digit and every bit from the $lg(\beta)$'th to the most significant is one.
-\end{enumerate}
-
-If either condition is true then there is a power of two namely $2^p$ such that $0 < 2^p - n < \beta$.   If the input is only
-one digit then it will always be of the correct form.  Otherwise all of the bits above the first digit must be one.  This arises from the fact
-that there will be a value of $k$ that when added to the modulus causes a carry in the first digit which propagates all the way to the most
-significant bit.  The resulting sum will be a power of two.
-
-\begin{figure}[!here]
-\begin{small}
-\begin{center}
-\begin{tabular}{l}
-\hline Algorithm \textbf{mp\_reduce\_is\_2k}. \\
-\textbf{Input}.   mp\_int $n$   \\
-\textbf{Output}.  $1$ if of proper form, $0$ otherwise \\
-\hline
-1.  If $n.used = 0$ then return($0$). \\
-2.  If $n.used = 1$ then return($1$). \\
-3.  $p \leftarrow \lceil lg(n) \rceil$  (\textit{mp\_count\_bits}) \\
-4.  for $x$ from $lg(\beta)$ to $p$ do \\
-\hspace{3mm}4.1  If the ($x \mbox{ mod }lg(\beta)$)'th bit of the $\lfloor x / lg(\beta) \rfloor$'th digit of $n$ is zero then return($0$). \\
-5.  Return($1$). \\
-\hline
-\end{tabular}
-\end{center}
-\end{small}
-\caption{Algorithm mp\_reduce\_is\_2k}
-\end{figure}
-
-\textbf{Algorithm mp\_reduce\_is\_2k.}
-This algorithm quickly determines if a modulus is of the form required for algorithm mp\_reduce\_2k to function properly.  
-
-EXAM,bn_mp_reduce_is_2k.c
-
-
-
-\section{Algorithm Comparison}
-So far three very different algorithms for modular reduction have been discussed.  Each of the algorithms have their own strengths and weaknesses
-that makes having such a selection very useful.  The following table summarizes the three algorithms along with comparisons of work factors.  Since
-all three algorithms have the restriction that $0 \le x < n^2$ and $n > 1$ those limitations are not included in the table.  
-
-\begin{center}
-\begin{small}
-\begin{tabular}{|c|c|c|c|c|c|}
-\hline \textbf{Method} & \textbf{Work Required} & \textbf{Limitations} & \textbf{$m = 8$} & \textbf{$m = 32$} & \textbf{$m = 64$} \\
-\hline Barrett    & $m^2 + 2m - 1$ & None              & $79$ & $1087$ & $4223$ \\
-\hline Montgomery & $m^2 + m$      & $n$ must be odd   & $72$ & $1056$ & $4160$ \\
-\hline D.R.       & $2m$           & $n = \beta^m - k$ & $16$ & $64$   & $128$  \\
-\hline
-\end{tabular}
-\end{small}
-\end{center}
-
-In theory Montgomery and Barrett reductions would require roughly the same amount of time to complete.  However, in practice since Montgomery
-reduction can be written as a single function with the Comba technique it is much faster.  Barrett reduction suffers from the overhead of
-calling the half precision multipliers, addition and division by $\beta$ algorithms.
-
-For almost every cryptographic algorithm Montgomery reduction is the algorithm of choice.  The one set of algorithms where Diminished Radix reduction truly
-shines are based on the discrete logarithm problem such as Diffie-Hellman \cite{DH} and ElGamal \cite{ELGAMAL}.  In these algorithms
-primes of the form $\beta^m - k$ can be found and shared amongst users.  These primes will allow the Diminished Radix algorithm to be used in
-modular exponentiation to greatly speed up the operation.
-
-
-
-\section*{Exercises}
-\begin{tabular}{cl}
-$\left [ 3 \right ]$ & Prove that the ``trick'' in algorithm mp\_montgomery\_setup actually \\
-                     & calculates the correct value of $\rho$. \\
-                     & \\
-$\left [ 2 \right ]$ & Devise an algorithm to reduce modulo $n + k$ for small $k$ quickly.  \\
-                     & \\
-$\left [ 4 \right ]$ & Prove that the pseudo-code algorithm ``Diminished Radix Reduction'' \\
-                     & (\textit{figure~\ref{fig:DR}}) terminates.  Also prove the probability that it will \\
-                     & terminate within $1 \le k \le 10$ iterations. \\
-                     & \\
-\end{tabular}                     
-
-
-\chapter{Exponentiation}
-Exponentiation is the operation of raising one variable to the power of another, for example, $a^b$.  A variant of exponentiation, computed
-in a finite field or ring, is called modular exponentiation.  This latter style of operation is typically used in public key 
-cryptosystems such as RSA and Diffie-Hellman.  The ability to quickly compute modular exponentiations is of great benefit to any
-such cryptosystem and many methods have been sought to speed it up.
-
-\section{Exponentiation Basics}
-A trivial algorithm would simply multiply $a$ against itself $b - 1$ times to compute the exponentiation desired.  However, as $b$ grows in size
-the number of multiplications becomes prohibitive.  Imagine what would happen if $b$ $\approx$ $2^{1024}$ as is the case when computing an RSA signature
-with a $1024$-bit key.  Such a calculation could never be completed as it would take simply far too long.
-
-Fortunately there is a very simple algorithm based on the laws of exponents.  Recall that $lg_a(a^b) = b$ and that $lg_a(a^ba^c) = b + c$ which
-are two trivial relationships between the base and the exponent.  Let $b_i$ represent the $i$'th bit of $b$ starting from the least 
-significant bit.  If $b$ is a $k$-bit integer then the following equation is true.
-
-\begin{equation}
-a^b = \prod_{i=0}^{k-1} a^{2^i \cdot b_i}
-\end{equation}
-
-By taking the base $a$ logarithm of both sides of the equation the following equation is the result.
-
-\begin{equation}
-b = \sum_{i=0}^{k-1}2^i \cdot b_i
-\end{equation}
-
-The term $a^{2^i}$ can be found from the $i - 1$'th term by squaring the term since $\left ( a^{2^i} \right )^2$ is equal to
-$a^{2^{i+1}}$.  This observation forms the basis of essentially all fast exponentiation algorithms.  It requires $k$ squarings and on average
-$k \over 2$ multiplications to compute the result.  This is indeed quite an improvement over simply multiplying by $a$ a total of $b-1$ times.
-
-While this current method is a considerable speed up there are further improvements to be made.  For example, the $a^{2^i}$ term does not need to 
-be computed in an auxiliary variable.  Consider the following equivalent algorithm.
-
-\begin{figure}[!here]
-\begin{small}
-\begin{center}
-\begin{tabular}{l}
-\hline Algorithm \textbf{Left to Right Exponentiation}. \\
-\textbf{Input}.   Integer $a$, $b$ and $k$ \\
-\textbf{Output}.  $c = a^b$ \\
-\hline \\
-1.  $c \leftarrow 1$ \\
-2.  for $i$ from $k - 1$ to $0$ do \\
-\hspace{3mm}2.1  $c \leftarrow c^2$ \\
-\hspace{3mm}2.2  $c \leftarrow c \cdot a^{b_i}$ \\
-3.  Return $c$. \\
-\hline
-\end{tabular}
-\end{center}
-\end{small}
-\caption{Left to Right Exponentiation}
-\label{fig:LTOR}
-\end{figure}
-
-This algorithm starts from the most significant bit and works towards the least significant bit.  When the $i$'th bit of $b$ is set $a$ is
-multiplied against the current product.  In each iteration the product is squared which doubles the exponent of the individual terms of the
-product.  
-
-For example, let $b = 101100_2 \equiv 44_{10}$.  The following chart demonstrates the actions of the algorithm.
-
-\newpage\begin{figure}
-\begin{center}
-\begin{tabular}{|c|c|}
-\hline \textbf{Value of $i$} & \textbf{Value of $c$} \\
-\hline - & $1$ \\
-\hline $5$ & $a$ \\
-\hline $4$ & $a^2$ \\
-\hline $3$ & $a^4 \cdot a$ \\
-\hline $2$ & $a^8 \cdot a^2 \cdot a$ \\
-\hline $1$ & $a^{16} \cdot a^4 \cdot a^2$ \\
-\hline $0$ & $a^{32} \cdot a^8 \cdot a^4$ \\
-\hline
-\end{tabular}
-\end{center}
-\caption{Example of Left to Right Exponentiation}
-\end{figure}
-
-When the product $a^{32} \cdot a^8 \cdot a^4$ is simplified it is equal to $a^{44}$ which is the desired exponentiation.  This particular algorithm is 
-called ``Left to Right'' because it reads the exponent in that order.  All of the exponentiation algorithms that will be presented are of this nature.  
-
-\subsection{Single Digit Exponentiation}
-The first algorithm in the series of exponentiation algorithms will be an unbounded algorithm where the exponent is a single digit.  It is intended 
-to be used when a small power of an input is required (\textit{e.g. $a^5$}).  It is faster than simply multiplying $b - 1$ times for all values of 
-$b$ that are greater than three.  
-
-\newpage\begin{figure}[!here]
-\begin{small}
-\begin{center}
-\begin{tabular}{l}
-\hline Algorithm \textbf{mp\_expt\_d}. \\
-\textbf{Input}.   mp\_int $a$ and mp\_digit $b$ \\
-\textbf{Output}.  $c = a^b$ \\
-\hline \\
-1.  $g \leftarrow a$ (\textit{mp\_init\_copy}) \\
-2.  $c \leftarrow 1$ (\textit{mp\_set}) \\
-3.  for $x$ from 1 to $lg(\beta)$ do \\
-\hspace{3mm}3.1  $c \leftarrow c^2$ (\textit{mp\_sqr}) \\
-\hspace{3mm}3.2  If $b$ AND $2^{lg(\beta) - 1} \ne 0$ then \\
-\hspace{6mm}3.2.1  $c \leftarrow c \cdot g$ (\textit{mp\_mul}) \\
-\hspace{3mm}3.3  $b \leftarrow b << 1$ \\
-4.  Clear $g$. \\
-5.  Return(\textit{MP\_OKAY}). \\
-\hline
-\end{tabular}
-\end{center}
-\end{small}
-\caption{Algorithm mp\_expt\_d}
-\end{figure}
-
-\textbf{Algorithm mp\_expt\_d.}
-This algorithm computes the value of $a$ raised to the power of a single digit $b$.  It uses the left to right exponentiation algorithm to
-quickly compute the exponentiation.  It is loosely based on algorithm 14.79 of HAC \cite[pp. 615]{HAC} with the difference that the 
-exponent is a fixed width.  
-
-A copy of $a$ is made first to allow destination variable $c$ be the same as the source variable $a$.  The result is set to the initial value of 
-$1$ in the subsequent step.
-
-Inside the loop the exponent is read from the most significant bit first down to the least significant bit.  First $c$ is invariably squared
-on step 3.1.  In the following step if the most significant bit of $b$ is one the copy of $a$ is multiplied against $c$.  The value
-of $b$ is shifted left one bit to make the next bit down from the most significant bit the new most significant bit.  In effect each
-iteration of the loop moves the bits of the exponent $b$ upwards to the most significant location.
-
-EXAM,bn_mp_expt_d.c
-
--- Some note later.
-
-\section{$k$-ary Exponentiation}
-When calculating an exponentiation the most time consuming bottleneck is the multiplications which are in general a small factor
-slower than squaring.  Recall from the previous algorithm that $b_{i}$ refers to the $i$'th bit of the exponent $b$.  Suppose instead it referred to
-the $i$'th $k$-bit digit of the exponent of $b$.  For $k = 1$ the definitions are synonymous and for $k > 1$ algorithm~\ref{fig:KARY}
-computes the same exponentiation.  A group of $k$ bits from the exponent is called a \textit{window}.  That is it is a small window on only a
-portion of the entire exponent.  Consider the following modification to the basic left to right exponentiation algorithm.
-
-\begin{figure}[!here]
-\begin{small}
-\begin{center}
-\begin{tabular}{l}
-\hline Algorithm \textbf{$k$-ary Exponentiation}. \\
-\textbf{Input}.   Integer $a$, $b$, $k$ and $t$ \\
-\textbf{Output}.  $c = a^b$ \\
-\hline \\
-1.  $c \leftarrow 1$ \\
-2.  for $i$ from $t - 1$ to $0$ do \\
-\hspace{3mm}2.1  $c \leftarrow c^{2^k} $ \\
-\hspace{3mm}2.2  Extract the $i$'th $k$-bit word from $b$ and store it in $g$. \\
-\hspace{3mm}2.3  $c \leftarrow c \cdot a^g$ \\
-3.  Return $c$. \\
-\hline
-\end{tabular}
-\end{center}
-\end{small}
-\caption{$k$-ary Exponentiation}
-\label{fig:KARY}
-\end{figure}
-
-The squaring on step 2.1 can be calculated by squaring the value $c$ successively $k$ times.  If the values of $a^g$ for $0 < g < 2^k$ have been
-precomputed this algorithm requires only $t$ multiplications and $tk$ squarings.  The table can be generated with $2^{k - 1} - 1$ squarings and
-$2^{k - 1} + 1$ multiplications.  This algorithm assumes that the number of bits in the exponent is evenly divisible by $k$.  
-However, when it is not the remaining $0 < x \le k - 1$ bits can be handled with algorithm~\ref{fig:LTOR}.
-
-Suppose $k = 4$ and $t = 100$.  This modified algorithm will require $109$ multiplications and $408$ squarings to compute the exponentiation.  The
-original algorithm would on average have required $200$ multiplications and $400$ squarings to compute the same value.  The total number of squarings
-has increased slightly but the number of multiplications has nearly halved.
-
-\subsection{Optimal Values of $k$}
-An optimal value of $k$ will minimize $2^{k} + \lceil n / k \rceil + n - 1$ for a fixed number of bits in the exponent $n$.  The simplest
-approach is to brute force search amongst the values $k = 2, 3, \ldots, 8$ for the lowest result.  Table~\ref{fig:OPTK} lists optimal values of $k$
-for various exponent sizes and compares the number of multiplication and squarings required against algorithm~\ref{fig:LTOR}.  
-
-\begin{figure}[here]
-\begin{center}
-\begin{small}
-\begin{tabular}{|c|c|c|c|c|c|}
-\hline \textbf{Exponent (bits)} & \textbf{Optimal $k$} & \textbf{Work at $k$} & \textbf{Work with ~\ref{fig:LTOR}} \\
-\hline $16$ & $2$ & $27$ & $24$ \\
-\hline $32$ & $3$ & $49$ & $48$ \\
-\hline $64$ & $3$ & $92$ & $96$ \\
-\hline $128$ & $4$ & $175$ & $192$ \\
-\hline $256$ & $4$ & $335$ & $384$ \\
-\hline $512$ & $5$ & $645$ & $768$ \\
-\hline $1024$ & $6$ & $1257$ & $1536$ \\
-\hline $2048$ & $6$ & $2452$ & $3072$ \\
-\hline $4096$ & $7$ & $4808$ & $6144$ \\
-\hline
-\end{tabular}
-\end{small}
-\end{center}
-\caption{Optimal Values of $k$ for $k$-ary Exponentiation}
-\label{fig:OPTK}
-\end{figure}
-
-\subsection{Sliding-Window Exponentiation}
-A simple modification to the previous algorithm is to only generate the upper half of the table in the range $2^{k-1} \le g < 2^k$.  Essentially
-this is a table for all values of $g$ where the most significant bit of $g$ is a one.  However, in order for this to be allowed in the 
-algorithm values of $g$ in the range $0 \le g < 2^{k-1}$ must be avoided.  
-
-Table~\ref{fig:OPTK2} lists optimal values of $k$ for various exponent sizes and compares the work required against algorithm~\ref{fig:KARY}.  
-
-\begin{figure}[here]
-\begin{center}
-\begin{small}
-\begin{tabular}{|c|c|c|c|c|c|}
-\hline \textbf{Exponent (bits)} & \textbf{Optimal $k$} & \textbf{Work at $k$} & \textbf{Work with ~\ref{fig:KARY}} \\
-\hline $16$ & $3$ & $24$ & $27$ \\
-\hline $32$ & $3$ & $45$ & $49$ \\
-\hline $64$ & $4$ & $87$ & $92$ \\
-\hline $128$ & $4$ & $167$ & $175$ \\
-\hline $256$ & $5$ & $322$ & $335$ \\
-\hline $512$ & $6$ & $628$ & $645$ \\
-\hline $1024$ & $6$ & $1225$ & $1257$ \\
-\hline $2048$ & $7$ & $2403$ & $2452$ \\
-\hline $4096$ & $8$ & $4735$ & $4808$ \\
-\hline
-\end{tabular}
-\end{small}
-\end{center}
-\caption{Optimal Values of $k$ for Sliding Window Exponentiation}
-\label{fig:OPTK2}
-\end{figure}
-
-\newpage\begin{figure}[!here]
-\begin{small}
-\begin{center}
-\begin{tabular}{l}
-\hline Algorithm \textbf{Sliding Window $k$-ary Exponentiation}. \\
-\textbf{Input}.   Integer $a$, $b$, $k$ and $t$ \\
-\textbf{Output}.  $c = a^b$ \\
-\hline \\
-1.  $c \leftarrow 1$ \\
-2.  for $i$ from $t - 1$ to $0$ do \\
-\hspace{3mm}2.1  If the $i$'th bit of $b$ is a zero then \\
-\hspace{6mm}2.1.1   $c \leftarrow c^2$ \\
-\hspace{3mm}2.2  else do \\
-\hspace{6mm}2.2.1  $c \leftarrow c^{2^k}$ \\
-\hspace{6mm}2.2.2  Extract the $k$ bits from $(b_{i}b_{i-1}\ldots b_{i-(k-1)})$ and store it in $g$. \\
-\hspace{6mm}2.2.3  $c \leftarrow c \cdot a^g$ \\
-\hspace{6mm}2.2.4  $i \leftarrow i - k$ \\
-3.  Return $c$. \\
-\hline
-\end{tabular}
-\end{center}
-\end{small}
-\caption{Sliding Window $k$-ary Exponentiation}
-\end{figure}
-
-Similar to the previous algorithm this algorithm must have a special handler when fewer than $k$ bits are left in the exponent.  While this
-algorithm requires the same number of squarings it can potentially have fewer multiplications.  The pre-computed table $a^g$ is also half
-the size as the previous table.  
-
-Consider the exponent $b = 111101011001000_2 \equiv 31432_{10}$ with $k = 3$ using both algorithms.  The first algorithm will divide the exponent up as 
-the following five $3$-bit words $b \equiv \left ( 111, 101, 011, 001, 000 \right )_{2}$.  The second algorithm will break the 
-exponent as $b \equiv \left ( 111, 101, 0, 110, 0, 100, 0 \right )_{2}$.  The single $0$ digits in the second representation are where
-a single squaring took place instead of a squaring and multiplication.  In total the first method requires $10$ multiplications and $18$ 
-squarings.  The second method requires $8$ multiplications and $18$ squarings.  
-
-In general the sliding window method is never slower than the generic $k$-ary method and often it is slightly faster.  
-
-\section{Modular Exponentiation}
-
-Modular exponentiation is essentially computing the power of a base within a finite field or ring.  For example, computing 
-$d \equiv a^b \mbox{ (mod }c\mbox{)}$ is a modular exponentiation.  Instead of first computing $a^b$ and then reducing it 
-modulo $c$ the intermediate result is reduced modulo $c$ after every squaring or multiplication operation.  
-
-This guarantees that any intermediate result is bounded by $0 \le d \le c^2 - 2c + 1$ and can be reduced modulo $c$ quickly using
-one of the algorithms presented in ~REDUCTION~.  
-
-Before the actual modular exponentiation algorithm can be written a wrapper algorithm must be written first.  This algorithm
-will allow the exponent $b$ to be negative which is computed as $d \equiv \left (1 / a \right )^{\vert b \vert} \mbox{ (mod }c\mbox{)}$. The
-value of $(1/a) \mbox{ mod }c$ is computed using the modular inverse (\textit{see ~MODINV~}).  If no inverse exists the algorithm
-terminates with an error.  
-
-\begin{figure}[!here]
-\begin{small}
-\begin{center}
-\begin{tabular}{l}
-\hline Algorithm \textbf{mp\_exptmod}. \\
-\textbf{Input}.   mp\_int $g$, $x$ and $p$ \\
-\textbf{Output}.  $y \equiv g^x \mbox{ (mod }p\mbox{)}$ \\
-\hline \\
-1.  If $p.sign = MP\_NEG$ return(\textit{MP\_VAL}). \\
-2.  If $x.sign = MP\_NEG$ then \\
-\hspace{3mm}2.1  $g' \leftarrow g^{-1} \mbox{ (mod }p\mbox{)}$ \\
-\hspace{3mm}2.2  $x' \leftarrow \vert x \vert$ \\
-\hspace{3mm}2.3  Compute $y \equiv g'^{x'} \mbox{ (mod }p\mbox{)}$ via recursion. \\
-3.  if $p$ is odd \textbf{OR} $p$ is a D.R. modulus then \\
-\hspace{3mm}3.1  Compute $y \equiv g^{x} \mbox{ (mod }p\mbox{)}$ via algorithm mp\_exptmod\_fast. \\
-4.  else \\
-\hspace{3mm}4.1  Compute $y \equiv g^{x} \mbox{ (mod }p\mbox{)}$ via algorithm s\_mp\_exptmod. \\
-\hline
-\end{tabular}
-\end{center}
-\end{small}
-\caption{Algorithm mp\_exptmod}
-\end{figure}
-
-\textbf{Algorithm mp\_exptmod.}
-The first algorithm which actually performs modular exponentiation is algorithm s\_mp\_exptmod.  It is a sliding window $k$-ary algorithm 
-which uses Barrett reduction to reduce the product modulo $p$.  The second algorithm mp\_exptmod\_fast performs the same operation 
-except it uses either Montgomery or Diminished Radix reduction.  The two latter reduction algorithms are clumped in the same exponentiation
-algorithm since their arguments are essentially the same (\textit{two mp\_ints and one mp\_digit}).  
-
-EXAM,bn_mp_exptmod.c
-
-In order to keep the algorithms in a known state the first step on line @29,if@ is to reject any negative modulus as input.  If the exponent is
-negative the algorithm tries to perform a modular exponentiation with the modular inverse of the base $G$.  The temporary variable $tmpG$ is assigned
-the modular inverse of $G$ and $tmpX$ is assigned the absolute value of $X$.  The algorithm will recurse with these new values with a positive
-exponent.
-
-If the exponent is positive the algorithm resumes the exponentiation.  Line @63,dr_@ determines if the modulus is of the restricted Diminished Radix 
-form.  If it is not line @65,reduce@ attempts to determine if it is of an unrestricted Diminished Radix form.  The integer $dr$ will take on one
-of three values.
-
-\begin{enumerate}
-\item $dr = 0$ means that the modulus is not of either restricted or unrestricted Diminished Radix form.
-\item $dr = 1$ means that the modulus is of restricted Diminished Radix form.
-\item $dr = 2$ means that the modulus is of unrestricted Diminished Radix form.
-\end{enumerate}
-
-Line @69,if@ determines if the fast modular exponentiation algorithm can be used.  It is allowed if $dr \ne 0$ or if the modulus is odd.  Otherwise,
-the slower s\_mp\_exptmod algorithm is used which uses Barrett reduction.  
-
-\subsection{Barrett Modular Exponentiation}
-
-\newpage\begin{figure}[!here]
-\begin{small}
-\begin{center}
-\begin{tabular}{l}
-\hline Algorithm \textbf{s\_mp\_exptmod}. \\
-\textbf{Input}.   mp\_int $g$, $x$ and $p$ \\
-\textbf{Output}.  $y \equiv g^x \mbox{ (mod }p\mbox{)}$ \\
-\hline \\
-1.  $k \leftarrow lg(x)$ \\
-2.  $winsize \leftarrow  \left \lbrace \begin{array}{ll}
-                              2 &  \mbox{if }k \le 7 \\
-                              3 &  \mbox{if }7 < k \le 36 \\
-                              4 &  \mbox{if }36 < k \le 140 \\
-                              5 &  \mbox{if }140 < k \le 450 \\
-                              6 &  \mbox{if }450 < k \le 1303 \\
-                              7 &  \mbox{if }1303 < k \le 3529 \\
-                              8 &  \mbox{if }3529 < k \\
-                              \end{array} \right .$ \\
-3.  Initialize $2^{winsize}$ mp\_ints in an array named $M$ and one mp\_int named $\mu$ \\
-4.  Calculate the $\mu$ required for Barrett Reduction (\textit{mp\_reduce\_setup}). \\
-5.  $M_1 \leftarrow g \mbox{ (mod }p\mbox{)}$ \\
-\\
-Setup the table of small powers of $g$.  First find $g^{2^{winsize - 1}}$ and then all multiples of it. \\
-6.  $k \leftarrow 2^{winsize - 1}$ \\
-7.  $M_{k} \leftarrow M_1$ \\
-8.  for $ix$ from 0 to $winsize - 2$ do \\
-\hspace{3mm}8.1  $M_k \leftarrow \left ( M_k \right )^2$ (\textit{mp\_sqr})  \\
-\hspace{3mm}8.2  $M_k \leftarrow M_k \mbox{ (mod }p\mbox{)}$ (\textit{mp\_reduce}) \\
-9.  for $ix$ from $2^{winsize - 1} + 1$ to $2^{winsize} - 1$ do \\
-\hspace{3mm}9.1  $M_{ix} \leftarrow M_{ix - 1} \cdot M_{1}$ (\textit{mp\_mul}) \\
-\hspace{3mm}9.2  $M_{ix} \leftarrow M_{ix} \mbox{ (mod }p\mbox{)}$ (\textit{mp\_reduce}) \\
-10.  $res \leftarrow 1$ \\
-\\
-Start Sliding Window. \\
-11.  $mode \leftarrow 0, bitcnt \leftarrow 1, buf \leftarrow 0, digidx \leftarrow x.used - 1, bitcpy \leftarrow 0, bitbuf \leftarrow 0$ \\
-12.  Loop \\
-\hspace{3mm}12.1  $bitcnt \leftarrow bitcnt - 1$ \\
-\hspace{3mm}12.2  If $bitcnt = 0$ then do \\
-\hspace{6mm}12.2.1  If $digidx = -1$ goto step 13. \\
-\hspace{6mm}12.2.2  $buf \leftarrow x_{digidx}$ \\
-\hspace{6mm}12.2.3  $digidx \leftarrow digidx - 1$ \\
-\hspace{6mm}12.2.4  $bitcnt \leftarrow lg(\beta)$ \\
-Continued on next page. \\
-\hline
-\end{tabular}
-\end{center}
-\end{small}
-\caption{Algorithm s\_mp\_exptmod}
-\end{figure}
-
-\newpage\begin{figure}[!here]
-\begin{small}
-\begin{center}
-\begin{tabular}{l}
-\hline Algorithm \textbf{s\_mp\_exptmod} (\textit{continued}). \\
-\textbf{Input}.   mp\_int $g$, $x$ and $p$ \\
-\textbf{Output}.  $y \equiv g^x \mbox{ (mod }p\mbox{)}$ \\
-\hline \\
-\hspace{3mm}12.3  $y \leftarrow (buf >> (lg(\beta) - 1))$ AND $1$ \\
-\hspace{3mm}12.4  $buf \leftarrow buf << 1$ \\
-\hspace{3mm}12.5  if $mode = 0$ and $y = 0$ then goto step 12. \\
-\hspace{3mm}12.6  if $mode = 1$ and $y = 0$ then do \\
-\hspace{6mm}12.6.1  $res \leftarrow res^2$ \\
-\hspace{6mm}12.6.2  $res \leftarrow res \mbox{ (mod }p\mbox{)}$ \\
-\hspace{6mm}12.6.3  Goto step 12. \\
-\hspace{3mm}12.7  $bitcpy \leftarrow bitcpy + 1$ \\
-\hspace{3mm}12.8  $bitbuf \leftarrow bitbuf + (y << (winsize - bitcpy))$ \\
-\hspace{3mm}12.9  $mode \leftarrow 2$ \\
-\hspace{3mm}12.10  If $bitcpy = winsize$ then do \\
-\hspace{6mm}Window is full so perform the squarings and single multiplication. \\
-\hspace{6mm}12.10.1  for $ix$ from $0$ to $winsize -1$ do \\
-\hspace{9mm}12.10.1.1  $res \leftarrow res^2$ \\
-\hspace{9mm}12.10.1.2  $res \leftarrow res \mbox{ (mod }p\mbox{)}$ \\
-\hspace{6mm}12.10.2  $res \leftarrow res \cdot M_{bitbuf}$ \\
-\hspace{6mm}12.10.3  $res \leftarrow res \mbox{ (mod }p\mbox{)}$ \\
-\hspace{6mm}Reset the window. \\
-\hspace{6mm}12.10.4  $bitcpy \leftarrow 0, bitbuf \leftarrow 0, mode \leftarrow 1$ \\
-\\
-No more windows left.  Check for residual bits of exponent. \\
-13.  If $mode = 2$ and $bitcpy > 0$ then do \\
-\hspace{3mm}13.1  for $ix$ from $0$ to $bitcpy - 1$ do \\
-\hspace{6mm}13.1.1  $res \leftarrow res^2$ \\
-\hspace{6mm}13.1.2  $res \leftarrow res \mbox{ (mod }p\mbox{)}$ \\
-\hspace{6mm}13.1.3  $bitbuf \leftarrow bitbuf << 1$ \\
-\hspace{6mm}13.1.4  If $bitbuf$ AND $2^{winsize} \ne 0$ then do \\
-\hspace{9mm}13.1.4.1  $res \leftarrow res \cdot M_{1}$ \\
-\hspace{9mm}13.1.4.2  $res \leftarrow res \mbox{ (mod }p\mbox{)}$ \\
-14.  $y \leftarrow res$ \\
-15.  Clear $res$, $\mu$ and the $M$ array. \\
-16.  Return(\textit{MP\_OKAY}). \\
-\hline
-\end{tabular}
-\end{center}
-\end{small}
-\caption{Algorithm s\_mp\_exptmod (continued)}
-\end{figure}
-
-\textbf{Algorithm s\_mp\_exptmod.}
-This algorithm computes the $x$'th power of $g$ modulo $p$ and stores the result in $y$.  It takes advantage of the Barrett reduction
-algorithm to keep the product small throughout the algorithm.
-
-The first two steps determine the optimal window size based on the number of bits in the exponent.  The larger the exponent the 
-larger the window size becomes.  After a window size $winsize$ has been chosen an array of $2^{winsize}$ mp\_int variables is allocated.  This
-table will hold the values of $g^x \mbox{ (mod }p\mbox{)}$ for $2^{winsize - 1} \le x < 2^{winsize}$.  
-
-After the table is allocated the first power of $g$ is found.  Since $g \ge p$ is allowed it must be first reduced modulo $p$ to make
-the rest of the algorithm more efficient.  The first element of the table at $2^{winsize - 1}$ is found by squaring $M_1$ successively $winsize - 1$
-times.  The rest of the table elements are found by multiplying the previous element by $M_1$ modulo $p$.
-
-Now that the table is available the sliding window may begin.  The following list describes the functions of all the variables in the window.
-\begin{enumerate}
-\item The variable $mode$ dictates how the bits of the exponent are interpreted.  
-\begin{enumerate}
-   \item When $mode = 0$ the bits are ignored since no non-zero bit of the exponent has been seen yet.  For example, if the exponent were simply 
-         $1$ then there would be $lg(\beta) - 1$ zero bits before the first non-zero bit.  In this case bits are ignored until a non-zero bit is found.  
-   \item When $mode = 1$ a non-zero bit has been seen before and a new $winsize$-bit window has not been formed yet.  In this mode leading $0$ bits 
-         are read and a single squaring is performed.  If a non-zero bit is read a new window is created.  
-   \item When $mode = 2$ the algorithm is in the middle of forming a window and new bits are appended to the window from the most significant bit
-         downwards.
-\end{enumerate}
-\item The variable $bitcnt$ indicates how many bits are left in the current digit of the exponent left to be read.  When it reaches zero a new digit
-      is fetched from the exponent.
-\item The variable $buf$ holds the currently read digit of the exponent. 
-\item The variable $digidx$ is an index into the exponents digits.  It starts at the leading digit $x.used - 1$ and moves towards the trailing digit.
-\item The variable $bitcpy$ indicates how many bits are in the currently formed window.  When it reaches $winsize$ the window is flushed and
-      the appropriate operations performed.
-\item The variable $bitbuf$ holds the current bits of the window being formed.  
-\end{enumerate}
-
-All of step 12 is the window processing loop.  It will iterate while there are digits available from the exponent to read.  The first step
-inside this loop is to extract a new digit if no more bits are available in the current digit.  If there are no bits left a new digit is
-read and if there are no digits left then the loop terminates.  
-
-After a digit is made available step 12.3 will extract the most significant bit of the current digit and move all other bits in the digit
-upwards.  In effect the digit is read from most significant bit to least significant bit and since the digits are read from leading to 
-trailing edges the entire exponent is read from most significant bit to least significant bit.
-
-At step 12.5 if the $mode$ and currently extracted bit $y$ are both zero the bit is ignored and the next bit is read.  This prevents the 
-algorithm from having to perform trivial squaring and reduction operations before the first non-zero bit is read.  Step 12.6 and 12.7-10 handle
-the two cases of $mode = 1$ and $mode = 2$ respectively.  
-
-FIGU,expt_state,Sliding Window State Diagram
-
-By step 13 there are no more digits left in the exponent.  However, there may be partial bits in the window left.  If $mode = 2$ then 
-a Left-to-Right algorithm is used to process the remaining few bits.  
-
-EXAM,bn_s_mp_exptmod.c
-
-Lines @26,if@ through @40,}@ determine the optimal window size based on the length of the exponent in bits.  The window divisions are sorted
-from smallest to greatest so that in each \textbf{if} statement only one condition must be tested.  For example, by the \textbf{if} statement 
-on line @32,if@ the value of $x$ is already known to be greater than $140$.  
-
-The conditional piece of code beginning on line @42,define@ allows the window size to be restricted to five bits.  This logic is used to ensure
-the table of precomputed powers of $G$ remains relatively small.  
-
-The for loop on line @49,for@ initializes the $M$ array while lines @59,mp_init@ and @62,mp_reduce@ compute the value of $\mu$ required for
-Barrett reduction.  
-
--- More later.
-
-\section{Quick Power of Two}
-Calculating $b = 2^a$ can be performed much quicker than with any of the previous algorithms.  Recall that a logical shift left $m << k$ is
-equivalent to $m \cdot 2^k$.  By this logic when $m = 1$ a quick power of two can be achieved.
-
-\begin{figure}[!here]
-\begin{small}
-\begin{center}
-\begin{tabular}{l}
-\hline Algorithm \textbf{mp\_2expt}. \\
-\textbf{Input}.   integer $b$ \\
-\textbf{Output}.  $a \leftarrow 2^b$ \\
-\hline \\
-1.  $a \leftarrow 0$ \\
-2.  If $a.alloc < \lfloor b / lg(\beta) \rfloor + 1$ then grow $a$ appropriately. \\
-3.  $a.used \leftarrow \lfloor b / lg(\beta) \rfloor + 1$ \\
-4.  $a_{\lfloor b / lg(\beta) \rfloor} \leftarrow 1 << (b \mbox{ mod } lg(\beta))$ \\
-5.  Return(\textit{MP\_OKAY}). \\
-\hline
-\end{tabular}
-\end{center}
-\end{small}
-\caption{Algorithm mp\_2expt}
-\end{figure}
-
-\textbf{Algorithm mp\_2expt.}
-
-EXAM,bn_mp_2expt.c
-
-\chapter{Higher Level Algorithms}
-
-This chapter discusses the various higher level algorithms that are required to complete a well rounded multiple precision integer package.  These
-routines are less performance oriented than the algorithms of chapters five, six and seven but are no less important.  
-
-The first section describes a method of integer division with remainder that is universally well known.  It provides the signed division logic
-for the package.  The subsequent section discusses a set of algorithms which allow a single digit to be the 2nd operand for a variety of operations.  
-These algorithms serve mostly to simplify other algorithms where small constants are required.  The last two sections discuss how to manipulate 
-various representations of integers.  For example, converting from an mp\_int to a string of characters.
-
-\section{Integer Division with Remainder}
-MARK,DIVISION
-
-Integer division, aside from modular exponentiation, is the most intensive algorithm to compute.  
-
-
-\section{Single Digit Helpers}
-\subsection{Single Digit Addition}
-\subsection{Single Digit Subtraction}
-\subsection{Single Digit Multiplication}
-\subsection{Single Digit Division}
-\subsection{Single Digit Modulo}
-\subsection{Single Digit Root Extraction}
-\section{Random Number Generation}
-\section{Formatted Output}
-\subsection{Getting The Output Size}
-\subsection{Generating Radix-n Output}
-\subsection{Reading Radix-n Input}
-\section{Unformatted Output}
-\subsection{Getting The Output Size}
-\subsection{Generating Output}
-\subsection{Reading Input}
-
-\chapter{Number Theoretic Algorithms}
-\section{Greatest Common Divisor}
-\section{Least Common Multiple}
-\section{Jacobi Symbol Computation}
-\section{Modular Inverse}
-MARK,MODINV
-\subsection{General Case}
-\subsection{Odd Moduli}
-\section{Primality Tests}
-\subsection{Trial Division}
-\subsection{The Fermat Test}
-\subsection{The Miller-Rabin Test}
-\subsection{Primality Test in a Bottle}
-\subsection{The Next Prime}
-\section{Root Extraction}
-
-\backmatter
-\appendix
-\begin{thebibliography}{ABCDEF}
-\bibitem[1]{TAOCPV2}
-Donald Knuth, \textit{The Art of Computer Programming}, Third Edition, Volume Two, Seminumerical Algorithms, Addison-Wesley, 1998
-
-\bibitem[2]{HAC}
-A. Menezes, P. van Oorschot, S. Vanstone, \textit{Handbook of Applied Cryptography}, CRC Press, 1996
-
-\bibitem[3]{ROSE}
-Michael Rosing, \textit{Implementing Elliptic Curve Cryptography}, Manning Publications, 1999
-
-\bibitem[4]{COMBA}
-Paul G. Comba, \textit{Exponentiation Cryptosystems on the IBM PC}. IBM Systems Journal 29(4): 526-538 (1990)
-
-\bibitem[5]{KARA}
-A. Karatsuba, Doklady Akad. Nauk SSSR 145 (1962), pp.293-294
-
-\bibitem[6]{KARAP}
-Andre Weimerskirch and Christof Paar, \textit{Generalizations of the Karatsuba Algorithm for Polynomial Multiplication}, Submitted to Design, Codes and Cryptography, March 2002
-
-\bibitem[7]{BARRETT}
-Paul Barrett, \textit{Implementing the Rivest Shamir and Adleman Public Key Encryption Algorithm on a Standard Digital Signal Processor}, Advances in Cryptology, Crypto '86, Springer-Verlag.
-
-\bibitem[8]{MONT}
-P.L.Montgomery. \textit{Modular multiplication without trial division}. Mathematics of Computation, 44(170):519-521, April 1985.
-
-\bibitem[9]{DRMET}
-Chae Hoon Lim and Pil Joong Lee, \textit{Generating Efficient Primes for Discrete Log Cryptosystems}, POSTECH Information Research Laboratories
-
-\bibitem[10]{MMB}
-J. Daemen and R. Govaerts and J. Vandewalle, \textit{Block ciphers based on Modular Arithmetic}, State and {P}rogress in the {R}esearch of {C}ryptography, 1993, pp. 80-89
-
-\end{thebibliography}
-
-\input{tommath.ind}
-
-\chapter{Appendix}
-\subsection*{Appendix A -- Source Listing of tommath.h}
-
-The following is the source listing of the header file ``tommath.h'' for the LibTomMath project.  It contains many of 
-the definitions used throughout the code such as \textbf{mp\_int}, \textbf{MP\_PREC} and so on.  The header is 
-presented here for completeness.
-
-LIST,tommath.h
-
-\end{document}
\ No newline at end of file
diff --git a/tommath.tex b/tommath.tex
deleted file mode 100644
index cd2a97e..0000000
--- a/tommath.tex
+++ /dev/null
@@ -1,8141 +0,0 @@
-\documentclass[b5paper]{book}
-\usepackage{hyperref}
-\usepackage{makeidx}
-\usepackage{amssymb}
-\usepackage{color}
-\usepackage{alltt}
-\usepackage{graphicx}
-\usepackage{layout}
-\def\union{\cup}
-\def\intersect{\cap}
-\def\getsrandom{\stackrel{\rm R}{\gets}}
-\def\cross{\times}
-\def\cat{\hspace{0.5em} \| \hspace{0.5em}}
-\def\catn{$\|$}
-\def\divides{\hspace{0.3em} | \hspace{0.3em}}
-\def\nequiv{\not\equiv}
-\def\approx{\raisebox{0.2ex}{\mbox{\small $\sim$}}}
-\def\lcm{{\rm lcm}}
-\def\gcd{{\rm gcd}}
-\def\log{{\rm log}}
-\def\ord{{\rm ord}}
-\def\abs{{\mathit abs}}
-\def\rep{{\mathit rep}}
-\def\mod{{\mathit\ mod\ }}
-\renewcommand{\pmod}[1]{\ ({\rm mod\ }{#1})}
-\newcommand{\floor}[1]{\left\lfloor{#1}\right\rfloor}
-\newcommand{\ceil}[1]{\left\lceil{#1}\right\rceil}
-\def\Or{{\rm\ or\ }}
-\def\And{{\rm\ and\ }}
-\def\iff{\hspace{1em}\Longleftrightarrow\hspace{1em}}
-\def\implies{\Rightarrow}
-\def\undefined{{\rm ``undefined"}}
-\def\Proof{\vspace{1ex}\noindent {\bf Proof:}\hspace{1em}}
-\let\oldphi\phi
-\def\phi{\varphi}
-\def\Pr{{\rm Pr}}
-\newcommand{\str}[1]{{\mathbf{#1}}}
-\def\F{{\mathbb F}}
-\def\N{{\mathbb N}}
-\def\Z{{\mathbb Z}}
-\def\R{{\mathbb R}}
-\def\C{{\mathbb C}}
-\def\Q{{\mathbb Q}}
-\definecolor{DGray}{gray}{0.5}
-\newcommand{\emailaddr}[1]{\mbox{$<${#1}$>$}}
-\def\twiddle{\raisebox{0.3ex}{\mbox{\tiny $\sim$}}}
-\def\gap{\vspace{0.5ex}}
-\makeindex
-\begin{document}
-\frontmatter
-\pagestyle{empty}
-\title{Multiple-Precision Integer Arithmetic, \\ A Case Study Involving the LibTomMath Project \\ - DRAFT - }
-\author{\mbox{
-%\begin{small}
-\begin{tabular}{c}
-Tom St Denis \\
-Algonquin College \\
-\\
-Mads Rasmussen \\
-Open Communications Security \\
-\\
-Greg Rose \\
-QUALCOMM Australia \\
-\end{tabular}
-%\end{small}
-}
-}
-\maketitle
-This text in its entirety is copyright \copyright{}2003 by Tom St Denis.  It may not be redistributed 
-electronically or otherwise without the sole permission of the author.  The text is freely redistributable as long as
-it is packaged along with the LibTomMath library in a non-commercial project.  Contact the
-author for other redistribution rights.
-
-This text corresponds to the v0.17 release of the LibTomMath project.
-
-\begin{alltt}
-Tom St Denis
-111 Banning Rd
-Ottawa, Ontario
-K2L 1C3
-Canada
-
-Phone: 1-613-836-3160
-Email: tomstdenis@iahu.ca
-\end{alltt}
-
-This text is formatted to the international B5 paper size of 176mm wide by 250mm tall using the \LaTeX{} 
-{\em book} macro package and the Perl {\em booker} package.
-
-\tableofcontents
-\listoffigures
-\chapter*{Preface}
-Blah.
-
-\mainmatter
-\pagestyle{headings}
-\chapter{Introduction}
-\section{Multiple Precision Arithmetic}
-\subsection{The Need for Multiple Precision Arithmetic}
-The most prevalent use for multiple precision arithmetic (\textit{often referred to as bignum math}) is within public
-key cryptography.   Algorithms such as RSA, Diffie-Hellman and Elliptic Curve Cryptography require large integers in order to 
-resist known cryptanalytic attacks.  Typical modern programming languages such as C and Java only provide small 
-single-precision data types which are incapable of precisely representing integers which are often hundreds of bits long.
-
-For example, consider multiplying $1,234,567$ by $9,876,543$ in C with an ``unsigned long'' data type.  With an 
-x86 machine the result is $4,136,875,833$ while the true result is $12,193,254,061,881$.  The original inputs 
-were approximately $21$ and $24$ bits respectively.  If the C language cannot multiply two relatively small values 
-together precisely how does anyone expect it to multiply two values that are considerably larger?
-
-Most advancements in fast multiple precision arithmetic stem from the desire for faster cryptographic primitives.  However, cryptography
-is not the only field of study that can benefit from fast large integer routines.  Another auxiliary use for multiple precision integers is 
-high precision floating point data types.  The basic IEEE standard floating point type is made up of an integer mantissa $q$ and an exponent $e$.  
-Numbers are given in the form $n = q \cdot b^e$ where $b = 2$ is specified.  Since IEEE is meant to be implemented in 
-hardware the precision of the mantissa is often fairly small (\textit{23, 48 and 64 bits}).  Since the mantissa is merely an 
-integer a large multiple precision integer could be used.  In effect very high precision floating point arithmetic 
-could be performed.  This would be useful where scientific applications must minimize the total output error over long simulations.  
-
-\subsection{Multiple Precision Arithmetic}
-\index{multiple precision}
-Multiple precision arithmetic attempts to solve the shortcomings of single precision data types such as those from
-the C and Java programming languages.  In essence multiple precision arithmetic is a set of operations that can be 
-performed on members of an algebraic group whose precision is not fixed.  The algorithms when implemented to be multiple
-precision can allow a developer to work with any practical precision required.
-
-Typically the arithmetic over the ring of integers denoted by $\Z$ is performed by routines that are collectively and 
-casually referred to as ``bignum'' routines.  However, it is possible to have rings of polynomials as well typically 
-denoted by $\Z/p\Z \left [ X \right ]$ which could have variable precision (\textit{or degree}).  This text will 
-discuss implementation of the former, however implementing polynomial basis routines should be relatively easy after reading this text.
-
-\subsection{Benefits of Multiple Precision Arithmetic}
-\index{precision} \index{accuracy}
-Precision is defined loosely as the proximity of the real value to a given representation.  
-Accuracy is defined as the reproducibility of the result.  For example, the calculation $1/3 = 0.25$ is imprecise but can be accurate provided 
-it is reproducible.
-
-The benefit of multiple precision representations over single precision representations is that 
-often no precision is lost while representing the result of an operation which requires excess precision.  For example, 
-the multiplication of two $n$-bit integers requires at least $2n$ bits to represent the result.  A multiple precision 
-system would augment the precision of the destination to accommodate the result while a single precision system would
-truncate excess bits to maintain a fixed level of precision.
-
-Multiple precision representations allow for the precision to be very high (\textit{if not exacting}) but at a cost of
-modest computer resources.  The only reasonable case where a multiple precision system will lose precision is when
-emulating a floating point data type.  However, with multiple precision integer arithmetic no precision is lost.
-
-\subsection{Basis of Operations}
-At the heart of all multiple precision integer operations are the ``long-hand'' algorithms we all learned as children 
-in grade school.  For example, to multiply $1,234$ by $981$ the student is not taught to memorize the times table for 
-$1,234$, instead they are taught how to long-multiply.  That is to multiply each column using simple single digit 
-multiplications, line up the partial results, and add the resulting products by column.  The representation that most 
-are familiar with is known as decimal or formally as radix-10. A radix-$n$ representation simply means there are 
-$n$ possible values per digit.  For example, binary would be a radix-2 representation.
-
-In essence computer based multiple precision arithmetic is very much the same.  The most notable difference is the usage
-of a binary friendly radix.  That is to use a radix of the form $2^k$ where $k$ is typically the size of a machine 
-register.  Also occasionally more optimal algorithms are used to perform certain operations such as multiplication and 
-squaring instead of traditional long-hand algorithms.
-
-\section{Purpose of This Text}
-The purpose of this text is to instruct the reader regarding how to implement multiple precision algorithms.  That is 
-to not only explain the core theoretical algorithms but also the various ``house keeping'' tasks that are neglected by
-authors of other texts on the subject.  Texts such as \cite[HAC]{HAC} and \cite{TAOCPV2} give considerably detailed 
-explanations of the theoretical aspects of the algorithms and very little regarding the practical aspects.  
-
-How an algorithm is explained and how it is actually implemented are two very different 
-realities.  For example, algorithm 14.7 on page 594 of HAC lists a relatively simple algorithm for performing multiple 
-precision integer addition.  However, what the description lacks is any discussion concerning the fact that the two 
-integer inputs may be of differing magnitudes.  Similarly the division routine (\textit{Algorithm 14.20, pp. 598}) 
-does not discuss how to handle sign or handle the dividend's decreasing magnitude in the main loop (\textit{Step \#3}).
-
-As well as the numerous practical oversights both of the texts do not discuss several key optimal algorithms required 
-such as ``Comba'' and Karatsuba multipliers and fast modular inversion.  These optimal algorithms are vital to achieve 
-any form of useful performance in non-trivial applications.  
-
-To solve this problem the focus of this text is on the practical aspects of implementing the algorithms that 
-constitute a multiple precision integer package with light discussions on the theoretical aspects.  As a case 
-study the ``LibTomMath''\footnote{Available freely at http://math.libtomcrypt.org} package is used to demonstrate 
-algorithms with implementations that have been field tested and work very well.
-
-\section{Discussion and Notation}
-\subsection{Notation}
-A multiple precision integer of $n$-digits shall be denoted as $x = (x_n ... x_1 x_0)_{ \beta }$ to be the 
-multiple precision notation for the integer $x \equiv \sum_{i=0}^{n} x_i\beta^i$.  The elements of the array $x$ are
-said to be the radix $\beta$ digits of the integer.  For example, $x = (1,2,3)_{10}$ would represent the 
-integer $1\cdot 10^2 + 2\cdot10^1 + 3\cdot10^0 = 123$.  
-
-A ``mp\_int'' shall refer to a composite structure which contains the digits of the integer as well as auxiliary data
-required to manipulate the data.  These additional members are discussed in chapter three.  For the purposes of this text
-a ``multiple precision integer'' and a ``mp\_int'' are synonymous.
-
-\index{single-precision} \index{double-precision} \index{mp\_digit} \index{mp\_word}
-For the purposes of this text a single-precision variable must be able to represent integers in the range $0 \le x < 2 \beta$ while
-a double-precision variable must be able to represent integers in the range $0 \le x < 2 \beta^2$.  Within the source code that will be
-presented the data type \textbf{mp\_digit} will represent a single-precision type while \textbf{mp\_word} will represent a 
-double-precision type.  In several algorithms (\textit{notably the Comba routines}) temporary results 
-will be stored in double-precision arrays.  For the purposes of this text $x_j$ will refer to the 
-$j$'th digit of a single-precision array and $\hat x_j$ will refer to the $j$'th digit of a double-precision
-array.
-
-The $\lfloor \mbox{ } \rfloor$ brackets represent a value truncated and rounded down to the nearest integer.  The $\lceil \mbox{ } \rceil$ brackets 
-represent a value truncated and rounded up to the nearest integer.  Typically when the $/$ division symbol is used the intention is to perform an integer
-division.  For example, $5/2 = 2$ which will often be written as $\lfloor 5/2 \rfloor = 2$ for clarity.  When a value is presented as a fraction
-such as $5 \over 2$ a real value division is implied.
-
-\subsection{Work Effort}
-\index{big-O}
-To measure the efficiency of various algorithms a modified big-O notation is used.  In this system all 
-single precision operations are considered to have the same cost\footnote{Except where explicitly noted.}.  
-That is a single precision addition, multiplication and division are assumed to take the same time to 
-complete.  While this is generally not true in practice it will simplify the discussions considerably.
-
-Some algorithms have slight advantages over others which is why some constants will not be removed in 
-the notation.  For example, a normal multiplication requires $O(n^2)$ work while a squaring requires 
-$O({{n^2 + n}\over 2})$ work.  In standard big-O notation these would be said to be equivalent.  However, in the 
-context of this text the magnitude of the inputs will not approach an infinite size.  This means the conventional limit 
-notation wisdom does not apply to the cancellation of constants.
-
-Throughout the discussions various ``work levels'' will be discussed.  These levels are the $O(1)$,
-$O(n)$, $O(n^2)$, ..., $O(n^k)$ work efforts.  For example, operations at the $O(n^k)$ ``level'' are said to be
-executed more frequently than operations at the $O(n^m)$ ``level'' when $k > m$.  Obviously most optimizations will pay
-off the most at the higher levels since they represent the bulk of the effort required.  
-
-\section{Exercises}
-Within the more advanced chapters a section will be set aside to give the reader some challenging exercises.  These exercises are not 
-designed to be prize winning problems, but to be thought provoking.  Wherever possible the problems are forward minded stating 
-problems that will be answered in subsequent chapters.  The reader is encouraged to finish the exercises as they appear to get a 
-better understanding of the subject material.  
-
-Similar to the exercises of \cite{TAOCPV2} as explained on pp.\textit{ix} these exercises are given a scoring system.  However, unlike 
-\cite{TAOCPV2} the problems do not get nearly as hard as often.  The scoring of these exercises ranges from one (\textit{the easiest}) to
-five (\textit{the hardest}).  The following table summarizes the scoring.
-
-\vspace{5mm}
-\begin{tabular}{cl}
-$\left [ 1 \right ]$ & An easy problem that should only take the reader a matter of \\
-                     & minutes to solve.  Usually does not involve much computer time. \\
-                     & \\
-$\left [ 2 \right ]$ & An easy problem that involves a marginal amount of computer \\
-                     & time usage.  Usually requires a program to be written to \\
-                     & solve the problem. \\
-                     & \\
-$\left [ 3 \right ]$ & A moderately hard problem that requires a non-trivial amount \\
-                     & of work.  Usually involves trivial research and development of \\
-                     & new theory from the perspective of a student. \\
-                     & \\
-$\left [ 4 \right ]$ & A moderately hard problem that involves a non-trivial amount \\
-                     & of work and research.  The solution to which will demonstrate \\
-                     & a higher mastery of the subject matter. \\
-                     & \\
-$\left [ 5 \right ]$ & A hard problem that involves concepts that are non-trivial.  \\
-                     & Solutions to these problems will demonstrate a complete mastery \\
-                     & of the given subject. \\
-                     & \\
-\end{tabular}
-
-Essentially problems at the first level are meant to be simple questions that the reader can answer quickly without programming a solution or
-devising new theory.  These problems are quick tests to see if the material is understood.  Problems at the second level are also
-designed to be easy but will require a program or algorithm to be implemented to arrive at the answer.  
-
-Problems at the third level are meant to be a bit more difficult.  Often the answer is fairly obvious but arriving at an exacting solution
-requires some thought and skill.  These problems will almost always involve devising a new algorithm or implementing a variation of
-another algorithm.
-
-Problems at the fourth level are meant to be even more difficult as well as involve some research.  The reader will most likely not know
-the answer right away nor will this text provide the exact details of the answer (\textit{or at least not until a subsequent chapter}).  Problems
-at the fifth level are meant to be the hardest problems relative to all the other problems in the chapter.  People who can correctly 
-answer fifth level problems have a mastery of the subject matter at hand.
-
-Often problems will be tied together.  The purpose of this is to start a chain of thought that will be discussed in future chapters.  The reader
-is encouraged to answer the follow-up problems and try to draw the relevance of problems.
-
-\chapter{Introduction to LibTomMath}
-
-\section{What is LibTomMath?}
-LibTomMath is a free and open source multiple precision library written in portable ISO C source code.  By portable it is 
-meant that the library does not contain any code that is computer platform dependent or otherwise problematic to use on any 
-given platform.  The library has been successfully tested under numerous operating systems including Solaris, MacOS, Windows, 
-Linux, PalmOS and on standalone hardware such as the Gameboy Advance.  The library is designed to contain enough 
-functionality to be able to develop applications such as public key cryptosystems.
-
-\section{Goals of LibTomMath}
-
-Even though the library is written entirely in portable ISO C considerable care has been taken to 
-optimize the algorithm implementations within the library.  Specifically the code has been written to work well with
-the GNU C Compiler (\textit{GCC}) on both x86 and ARMv4 processors.  Wherever possible highly efficient 
-algorithms (\textit{such as Karatsuba multiplication, sliding window exponentiation and Montgomery reduction}) have 
-been provided to make the library as efficient as possible.  Even with the optimal and sometimes specialized 
-algorithms that have been included the Application Programming Interface (\textit{API}) has been kept as simple as possible.  
-Often generic place holder routines will make use of specialized algorithms automatically without the developer's
-attention.  One such example is the generic multiplication algorithm \textbf{mp\_mul()} which will automatically use 
-Karatsuba multiplication if the inputs are of a specific size.
-
-Making LibTomMath as efficient as possible is not the only goal of the LibTomMath project.  Ideally the library should 
-be source compatible with another popular library which makes it more attractive for developers to use.  In this case the
-MPI library was used as an API template for all the basic functions.
-
-The project is also meant to act as a learning tool for students.  The logic being that no easy-to-follow ``bignum'' 
-library exists which can be used to teach computer science students how to perform fast and reliable multiple precision 
-arithmetic.  To this end the source code has been given quite a few comments and algorithm discussion points.  Often routines have 
-more comments than lines of code.
-
-\section{Choice of LibTomMath}
-LibTomMath was chosen as the case study of this text not only because the author of both projects is one and the same but
-for more worthy reasons.  Other libraries such as GMP, MPI, LIP and OpenSSL have multiple precision 
-integer arithmetic routines but would not be ideal for this text for reasons that will be explained in the 
-following sub-sections.
-
-\subsection{Code Base}
-The LibTomMath code base is all portable ISO C source code.  This means that there are no platform dependent conditional
-segments of code littered throughout the source.  This clean and uncluttered approach to the library means that a
-developer can more readily ascertain the true intent of a given section of source code without trying to keep track of
-what conditional code will be used.
-
-The code base of LibTomMath is also well organized.  Each function is in its own separate source code file 
-which allows the reader to find a given function very fast.  When compiled with GCC for the x86 processor the entire 
-library is a mere 87,760 bytes (\textit{$116,182$ bytes for ARMv4 processors}).  This includes every single function 
-LibTomMath provides from basic arithmetic to various number theoretic functions such as modular exponentiation, various 
-reduction algorithms and Jacobi symbol computation.  
-
-By comparison MPI which has fewer functions than LibTomMath compiled with the same conditions is 45,429 bytes 
-(\textit{$54,536$ for ARMv4}).  GMP which has a rather large collection of functions with the default configuration on an 
-x86 Athlon is 2,950,688 bytes.  Note that while LibTomMath has fewer functions than GMP it has been used as the sole basis 
-for several public key cryptosystems without having to seek additional outside functions to supplement the library.
-
-\subsection{API Simplicity}
-LibTomMath is designed after the MPI library and shares the API design.  Quite often programs that use MPI will build 
-with LibTomMath without change. The function names are relatively straight forward as to what they perform.  Almost all of the 
-functions share the same parameter passing convention; the few minor exceptions, as will be discussed, exist for good reasons.  
-The learning curve is fairly shallow with the API provided which is an extremely valuable benefit for the 
-student and developer alike.  
-
-The LIP library is an example of a library with an API that is awkward to work with.  LIP uses function names that are often ``compressed'' to 
-illegible short hand.  LibTomMath does not share this fault.
-
-\subsection{Optimizations}
-While LibTomMath is certainly not the fastest library (\textit{GMP often beats LibTomMath by a factor of two}) it does
-feature a set of optimal algorithms for tasks ranging from modular reduction to squaring.  GMP and LIP also feature
-such optimizations while MPI only uses baseline algorithms with no optimizations.
-
-LibTomMath is almost always an order of magnitude faster than the MPI library at computationally expensive tasks such as modular
-exponentiation.  In the grand scheme of ``bignum'' libraries LibTomMath is faster than the average library and usually  
-slower than the best libraries such as GMP and OpenSSL by a small factor.
-
-\subsection{Portability and Stability}
-LibTomMath will build ``out of the box'' on any platform equipped with a modern version of the GNU C Compiler 
-(\textit{GCC}).  This means that without changes the library will build without configuration or setting up any 
-variables.  LIP and MPI will build ``out of the box'' as well but have numerous known bugs.  Most notably the author of 
-MPI is not working on his library anymore.  
-
-GMP requires a configuration script to run and will not build out of the box.   GMP and LibTomMath are still in active
-development and are very stable across a variety of platforms.
-
-\subsection{Choice}
-LibTomMath is a relatively compact, well documented, highly optimized and portable library which seems only natural for
-the case study of this text.  Various source files from the LibTomMath project will be included within the text.  However, the 
-reader is encouraged to download their own copy of the library to actually be able to work with the library.  
-
-\chapter{Getting Started}
-\section{Library Basics}
-To begin the design of a multiple precision integer library a primitive data type and a series of primitive algorithms must be established.  A data
-type that will hold the information required to maintain a multiple precision integer must be designed.  With this basic data type a series
-of low level algorithms for initializing, clearing, growing and optimizing multiple precision integers can be developed to form the basis of 
-the entire library of algorithms.
-
-\section{What is a Multiple Precision Integer?}
-Recall that most programming languages (\textit{in particular C}) only have fixed precision data types that on their own cannot be used
-to represent values larger than their precision alone will allow. The purpose of multiple precision algorithms is to use these fixed precision
-data types to create multiple precision integers which may represent values that are much larger.  
-
-As a well known analogy, school children are taught how to form numbers larger than nine by prepending more radix ten digits.  In the decimal system
-the largest single digit value is only $9$ since the digits may only have values from $0$ to $9$.  However, by concatenating digits together larger numbers 
-may be represented.  Computer based multiple precision arithmetic is essentially the same concept except with a different radix.
-
-What most people probably do not think about explicitly are the various other attributes that describe a multiple precision integer.  For example,
-the integer $154_{10}$ has two immediately obvious properties.  First, the integer is positive, that is the sign of this particular integer 
-is positive as opposed to negative.  Second, the integer has three digits in its representation.  There is an additional property that the integer 
-possesses that does not concern pencil-and-paper arithmetic.  The third property is how many digits are allowed for the integer.  
-
-The human analogy of this third property is ensuring there is enough space on the paper to write the integer.  Computers must maintain a
-strict control on memory usage with respect to the digits of a multiple precision integer.  These three properties make up what is known
-as a multiple precision integer or mp\_int for short.  
-
-\subsection{The mp\_int structure}
-The mp\_int structure is the ISO C based manifestation of what represents a multiple precision integer.  The ISO C standard does not provide for 
-any such data type but it does provide for making composite data types known as structures.  The following is the structure definition 
-used within LibTomMath.
-
-\index{mp\_int}
-\begin{verbatim}
-typedef struct  {
-    int used, alloc, sign;
-    mp_digit *dp;
-} mp_int;
-\end{verbatim}
-
-The mp\_int structure can be broken down as follows.
-
-\begin{enumerate}
-\item The \textbf{used} parameter denotes how many digits of the array \textbf{dp} contain the digits used to represent
-a given integer.  The \textbf{used} count must not exceed the \textbf{alloc} count.  
-
-\item The array \textbf{dp} holds the digits that represent the given integer.  It is padded with $\textbf{alloc} - \textbf{used}$ zero
-digits.
-
-\item The \textbf{alloc} parameter denotes how 
-many digits are available in the array to use by functions before it has to increase in size.  When the \textbf{used} count 
-of a result would exceed the \textbf{alloc} count all of the algorithms will automatically increase the size of the 
-array to accommodate the precision of the result.  
-
-\item The \textbf{sign} parameter denotes the sign as either zero/positive (\textbf{MP\_ZPOS}) or negative (\textbf{MP\_NEG}).  
-\end{enumerate}
-
-\section{Argument Passing}
-A convention of argument passing must be adopted early on in the development of any library.  Making the function prototypes
-consistent will help eliminate many headaches in the future as the library grows to significant complexity.  In LibTomMath the multiple precision 
-integer functions accept parameters from left to right as pointers to mp\_int structures.  That means that the source operands are 
-placed on the left and the destination on the right.   Consider the following examples.
-
-\begin{verbatim}
-   mp_mul(&a, &b, &c);   /* c = a * b */
-   mp_add(&a, &b, &a);   /* a = a + b */
-   mp_sqr(&a, &b);       /* b = a * a */
-\end{verbatim}
-
-The left to right order is a fairly natural way to implement the functions since it lets the developer read aloud the
-functions and make sense of them.  For example, the first function would read ``multiply a and b and store in c''.
-
-Certain libraries (\textit{LIP by Lenstra for instance}) accept parameters the other way around.  That is the destination
-on the left and arguments on the right.  In truth it is entirely a matter of preference.  In the case of LibTomMath the 
-convention from the MPI library has been adopted.  
-
-Another very useful design consideration is whether to allow argument sources to also be a destination.  For example, the
-second example (\textit{mp\_add}) adds $a$ to $b$ and stores in $a$.  This is an important feature to implement since it
-allows the higher up functions to cut down on the number of variables.  However, to implement this feature specific
-care has to be given to ensure the destination is not modified before the source is fully read.
-
-\section{Return Values}
-A well implemented library, no matter what its purpose, should trap as many runtime errors as possible and return them to the 
-caller.  By catching runtime errors a library can be guaranteed to prevent undefined behaviour.  In a multiple precision 
-library the only errors that can occur are related to inappropriate inputs (\textit{division by zero for instance}) or 
-memory allocation errors.
-
-In LibTomMath any function that can cause a runtime error will return an error as an \textbf{int} data type with one of the 
-following values.
-
-\index{MP\_OKAY} \index{MP\_VAL} \index{MP\_MEM}
-\begin{center}
-\begin{tabular}{|l|l|}
-\hline \textbf{Value} & \textbf{Meaning} \\
-\hline \textbf{MP\_OKAY} & The function was successful \\
-\hline \textbf{MP\_VAL}  & One of the input value(s) was invalid \\
-\hline \textbf{MP\_MEM}  & The function ran out of heap memory \\
-\hline
-\end{tabular}
-\end{center}
-
-When an error is detected within a function it should free any memory it allocated and return as soon as possible.  The goal
-is to leave the system in the same state the system was when the function was called.  Error checking with this style of API is fairly simple.
-
-\begin{verbatim}
-   int err;
-   if ((err = mp_add(&a, &b, &c)) != MP_OKAY) {
-      printf("Error: %d\n", err);
-      exit(EXIT_FAILURE);
-   }
-\end{verbatim}
-
-The GMP library uses C style \textit{signals} to flag errors which is of questionable use.  Not all errors are fatal 
-and it was not deemed ideal by the author of LibTomMath to force developers to have signal handlers for such cases.
-
-\section{Initialization and Clearing}
-The logical starting point when actually writing multiple precision integer functions is the initialization and 
-clearing of the integers.  These two functions will be used by far the most throughout the algorithms whenever 
-temporary integers are required.
-
-Given the basic mp\_int structure an initialization routine must first allocate memory to hold the digits of
-the integer.  Often it is optimal to allocate a sufficiently large pre-set number of digits even considering
-the initial integer will represent zero.  If only a single digit were allocated quite a few re-allocations
-would occur for the majority of inputs.  There is a tradeoff between how many default digits to allocate
-and how many re-allocations are tolerable.  
-
-If the memory for the digits has been successfully allocated then the rest of the members of the structure must
-be initialized.  Since the initial state is to represent a zero integer the digits allocated must all be zeroed.  The
-\textbf{used} count is set to zero and \textbf{sign} is set to \textbf{MP\_ZPOS}.
-
-\subsection{Initializing an mp\_int}
-To initialize an mp\_int the mp\_init algorithm shall be used.  The purpose of this algorithm is to allocate 
-the memory required and initialize the integer to a default representation of zero.
-
-\begin{figure}[here]
-\begin{center}
-\begin{tabular}{l}
-\hline Algorithm \textbf{mp\_init}. \\
-\textbf{Input}.   An mp\_int $a$ \\
-\textbf{Output}.  Allocate memory for the digits and set to a zero state. \\
-\hline \\
-1.  Allocate memory for \textbf{MP\_PREC} digits. \\
-2.  If the allocation failed then return(\textit{MP\_MEM}) \\
-3.  for $n$ from $0$ to $MP\_PREC - 1$ do  \\
-\hspace{3mm}3.1  $a_n \leftarrow 0$\\
-4.  $a.sign \leftarrow MP\_ZPOS$\\
-5.  $a.used \leftarrow 0$\\
-6.  $a.alloc \leftarrow MP\_PREC$\\
-7.  Return(\textit{MP\_OKAY})\\
-\hline
-\end{tabular}
-\end{center}
-\caption{Algorithm mp\_init}
-\end{figure}
-
-\textbf{Algorithm mp\_init.}
-The \textbf{MP\_PREC} variable is a simple constant used to dictate minimal precision of allocated integers.  It is ideally at least equal to $32$ but 
-can be any reasonable power of two.  Steps one and two allocate the memory and account for it.  If the allocation fails the algorithm returns
-immediately to signal the failure.  Step three will ensure that all the digits are in the default state of zero.  Finally steps 
-four through six set the default settings of the \textbf{sign}, \textbf{used} and \textbf{alloc} members of the mp\_int structure.
-
-\index{bn\_mp\_init.c}
-\vspace{+3mm}\begin{small}
-\hspace{-5.1mm}{\bf File}: bn\_mp\_init.c
-\vspace{-3mm}
-\begin{alltt}
-016   
-017   /* init a new bigint */
-018   int
-019   mp_init (mp_int * a)
-020   \{
-021     /* allocate ram required and clear it */
-022     a->dp = OPT_CAST calloc (sizeof (mp_digit), MP_PREC);
-023     if (a->dp == NULL) \{
-024       return MP_MEM;
-025     \}
-026   
-027     /* set the used to zero, allocated digits to the default precision
-028      * and sign to positive */
-029     a->used  = 0;
-030     a->alloc = MP_PREC;
-031     a->sign  = MP_ZPOS;
-032   
-033     return MP_OKAY;
-034   \}
-\end{alltt}
-\end{small}
-
-The \textbf{OPT\_CAST} type cast on line 22 is designed to allow C++ compilers to build the code out of
-the box.  Microsoft C V5.00 is known to cause problems without the cast.  Also note that if the memory
-allocation fails the other members of the mp\_int will be in an undefined state.  The code from 
-line 29 to line 31 sets the default state for a mp\_int which is zero, positive and no used digits.
-
-\subsection{Clearing an mp\_int}
-When an mp\_int is no longer required the memory allocated for it can be cleared from the heap with 
-the mp\_clear algorithm.
-
-\begin{figure}[here]
-\begin{center}
-\begin{tabular}{l}
-\hline Algorithm \textbf{mp\_clear}. \\
-\textbf{Input}.   An mp\_int $a$ \\
-\textbf{Output}.  The memory for $a$ is cleared. \\
-\hline \\
-1.  If $a$ has been previously freed then return(\textit{MP\_OKAY}). \\
-2.  Free the digits of $a$ and mark $a$ as freed. \\
-3.  $a.used \leftarrow 0$ \\
-4.  $a.alloc \leftarrow 0$ \\
-5.  Return(\textit{MP\_OKAY}). \\
-\hline
-\end{tabular}
-\end{center}
-\caption{Algorithm mp\_clear}
-\end{figure}
-
-\textbf{Algorithm mp\_clear.}
-In steps one and two the memory for the digits is only freed if it has not been previously released.  
-This is more of a concern for the implementation since it is used to prevent ``double-free'' errors.  It also helps catch
-code errors where mp\_ints are used after being cleared.  Similarly steps three and four set the 
-\textbf{used} and \textbf{alloc} to known values which would be easy to spot during debugging.  For example, if an mp\_int is expected
-to be non-zero and its \textbf{used} member is observed to be zero (\textit{due to being cleared}) then an obvious bug in the code has been
-spotted.
-
-\index{bn\_mp\_clear.c}
-\vspace{+3mm}\begin{small}
-\hspace{-5.1mm}{\bf File}: bn\_mp\_clear.c
-\vspace{-3mm}
-\begin{alltt}
-016   
-017   /* clear one (frees)  */
-018   void
-019   mp_clear (mp_int * a)
-020   \{
-021     if (a->dp != NULL) \{
-022   
-023       /* first zero the digits */
-024       memset (a->dp, 0, sizeof (mp_digit) * a->used);
-025   
-026       /* free ram */
-027       free (a->dp);
-028   
-029       /* reset members to make debugging easier */
-030       a->dp = NULL;
-031       a->alloc = a->used = 0;
-032     \}
-033   \}
-\end{alltt}
-\end{small}
-
-The \textbf{if} statement on line 21 prevents the heap from being corrupted if a user double-frees an 
-mp\_int.  For example, a trivial case of this bug would be as follows.
-
-\begin{verbatim}
-mp_int a;
-mp_init(&a);
-mp_clear(&a);
-mp_clear(&a);
-\end{verbatim}
-
-Without that check the code would try to free the memory allocated for the digits twice which will cause most standard C
-libraries to cause a fault.  Also by setting the pointer to \textbf{NULL} it helps debug code that may inadvertently 
-free the mp\_int before it is truly no longer needed.  The allocated digits are set to zero before being freed on line 24.  
-This is ideal for cryptographic situations where the mp\_int is a secret parameter.
-
-The following snippet is an example of using both the init and clear functions.  
-
-\begin{small}
-\begin{verbatim}
-#include <tommath.h>
-#include <stdio.h>
-#include <stdlib.h>
-int main(void)
-{
-   mp_int num;
-   int err;
-   
-   /* init the bignum */
-   if ((err = mp_init(&num)) != MP_OKAY) {
-      printf("Error: %d\n", err);
-      return EXIT_FAILURE;
-   }
-   
-   /* do work with it ... */
-   
-   /* clear up */
-   mp_clear(&num);
-   
-   return EXIT_SUCCESS;
-}
-\end{verbatim}
-\end{small}
-
-\section{Other Initialization Routines}
-
-It is often helpful to have specialized initialization algorithms to simplify the design of other algorithms.  For example, an 
-initialization followed by a copy is a common operation when temporary copies of integers are required.  It is quite
-beneficial to have a series of simple helper functions available.
-
-\subsection{Initializing Variable Sized mp\_int Structures}
-Occasionally the number of digits required will be known in advance of an initialization.  In these
-cases the mp\_init\_size algorithm can be of use.  The purpose of this algorithm is similar to mp\_init except that 
-it will allocate \textit{at least} a specified number of digits.  This is ideal to prevent re-allocations when the 
-input size is known.
-
-\newpage\begin{figure}[here]
-\begin{center}
-\begin{tabular}{l}
-\hline Algorithm \textbf{mp\_init\_size}. \\
-\textbf{Input}.   An mp\_int $a$ and the requested number of digits $b$\\
-\textbf{Output}.  $a$ is initialized to hold at least $b$ digits. \\
-\hline \\
-1.  $u \leftarrow b\mbox{ (mod }MP\_PREC\mbox{)}$ \\
-2.  $v \leftarrow b + 2 \cdot MP\_PREC - u$ \\
-3.  Allocate $v$ digits. \\
-4.  If the allocation failed then return(\textit{MP\_MEM}). \\
-5.  for $n$ from $0$ to $v - 1$ do \\
-\hspace{3mm}5.1  $a_n \leftarrow 0$ \\
-6.  $a.sign \leftarrow MP\_ZPOS$\\
-7.  $a.used \leftarrow 0$\\
-8.  $a.alloc \leftarrow v$\\
-9.  Return(\textit{MP\_OKAY})\\
-\hline
-\end{tabular}
-\end{center}
-\caption{Algorithm mp\_init\_size}
-\end{figure}
-
-\textbf{Algorithm mp\_init\_size.}
-The value of $v$ is calculated to be at least the requested amount of digits $b$ plus additional padding.  The padding is calculated
-to be at least \textbf{MP\_PREC} digits plus enough digits to make the digit count a multiple of \textbf{MP\_PREC}.  This padding is used to 
-prevent trivial allocations from becoming a bottleneck in the rest of the algorithms that depend on this.
-
-\index{bn\_mp\_init\_size.c}
-\vspace{+3mm}\begin{small}
-\hspace{-5.1mm}{\bf File}: bn\_mp\_init\_size.c
-\vspace{-3mm}
-\begin{alltt}
-016   
-017   /* init a mp_init and grow it to a given size */
-018   int
-019   mp_init_size (mp_int * a, int size)
-020   \{
-021   
-022     /* pad size so there are always extra digits */
-023     size += (MP_PREC * 2) - (size & (MP_PREC - 1));    
-024     
-025     /* alloc mem */
-026     a->dp = OPT_CAST calloc (sizeof (mp_digit), size);
-027     if (a->dp == NULL) \{
-028       return MP_MEM;
-029     \}
-030     a->used = 0;
-031     a->alloc = size;
-032     a->sign = MP_ZPOS;
-033   
-034     return MP_OKAY;
-035   \}
-\end{alltt}
-\end{small}
-
-Line 23 will ensure that the number of digits actually allocated is padded up to the next multiple of 
-\textbf{MP\_PREC} plus an additional \textbf{MP\_PREC}.  This ensures that the number of allocated digits is 
-always greater than the amount requested.  As a result it prevents many trivial memory allocations.  The value of 
-\textbf{MP\_PREC} is defined in ``tommath.h'' and must be a power of two.
-
-\subsection{Creating a Clone}
-Another common sequence of operations is to make a local temporary copy of an argument.  To initialize then copy a mp\_int will be known as 
-creating a clone.  This is useful within functions that need to modify an integer argument but do not wish to actually modify the original copy.  
-The mp\_init\_copy algorithm will perform this very task.
-
-\begin{figure}[here]
-\begin{center}
-\begin{tabular}{l}
-\hline Algorithm \textbf{mp\_init\_copy}. \\
-\textbf{Input}.   An mp\_int $a$ and $b$\\
-\textbf{Output}.  $a$ is initialized to be a copy of $b$. \\
-\hline \\
-1.  Init $a$.  (\textit{mp\_init}) \\
-2.  If the init of $a$ was unsuccessful return(\textit{MP\_MEM}) \\
-3.  Copy $b$ to $a$.  (\textit{mp\_copy}) \\
-4.  Return the status of the copy operation. \\
-\hline
-\end{tabular}
-\end{center}
-\caption{Algorithm mp\_init\_copy}
-\end{figure}
-
-\textbf{Algorithm mp\_init\_copy.}
-This algorithm will initialize a mp\_int variable and copy another previously initialized mp\_int variable into it.  The algorithm will
-detect when the initialization fails and returns the error to the calling algorithm.  As such this algorithm will perform two operations
-in one step.  
-
-\index{bn\_mp\_init\_copy.c}
-\vspace{+3mm}\begin{small}
-\hspace{-5.1mm}{\bf File}: bn\_mp\_init\_copy.c
-\vspace{-3mm}
-\begin{alltt}
-016   
-017   /* creates "a" then copies b into it */
-018   int
-019   mp_init_copy (mp_int * a, mp_int * b)
-020   \{
-021     int     res;
-022   
-023     if ((res = mp_init (a)) != MP_OKAY) \{
-024       return res;
-025     \}
-026     return mp_copy (b, a);
-027   \}
-\end{alltt}
-\end{small}
-
-This will initialize \textbf{a} and make it a verbatim copy of the contents of \textbf{b}.  Note that 
-\textbf{a} will have its own memory allocated which means that \textbf{b} may be cleared after the call
-and \textbf{a} will be left intact.  
-
-\subsection{Multiple Integer Initializations And Clearings}
-Occasionally a function will require a series of mp\_int data types to be made available.  The mp\_init\_multi algorithm
-is provided to simplify such cases.  The purpose of this algorithm is to initialize a variable length array of mp\_int 
-structures at once.  As a result algorithms that require multiple integers only have to use 
-one algorithm to initialize all the mp\_int variables.
-
-\begin{figure}[here]
-\begin{center}
-\begin{tabular}{l}
-\hline Algorithm \textbf{mp\_init\_multi}. \\
-\textbf{Input}.   Variable length array of mp\_int variables of length $k$. \\
-\textbf{Output}.  The array is initialized such that each mp\_int is ready to use. \\
-\hline \\
-1.  for $n$ from 0 to $k - 1$ do \\
-\hspace{+3mm}1.1.  Initialize the $n$'th mp\_int (\textit{mp\_init}) \\
-\hspace{+3mm}1.2.  If initialization failed then do \\
-\hspace{+6mm}1.2.1.  for $j$ from $0$ to $n - 1$ do \\
-\hspace{+9mm}1.2.1.1.  Free the $j$'th mp\_int (\textit{mp\_clear}) \\
-\hspace{+6mm}1.2.2.   Return(\textit{MP\_MEM}) \\
-2.  Return(\textit{MP\_OKAY}) \\
-\hline
-\end{tabular}
-\end{center}
-\caption{Algorithm mp\_init\_multi}
-\end{figure}
-
-\textbf{Algorithm mp\_init\_multi.}
-The algorithm will initialize the array of mp\_int variables one at a time.  As soon as a runtime error is detected (\textit{step 1.2}) all of
-the previously initialized variables are cleared.  The goal is an ``all or nothing'' initialization which allows for quick recovery from runtime 
-errors.
-
-Similarly to clear a variable length array of mp\_int structures the mp\_clear\_multi algorithm will be used.
-
-Consider the following snippet which demonstrates how to use both routines.
-\begin{small}
-\begin{verbatim}
-#include <tommath.h>
-#include <stdio.h>
-#include <stdlib.h>
-int main(void)
-{
-   mp_int num1, num2, num3;
-   int err;
-   
-   if ((err = mp_init_multi(&num1, &num2, &num3, NULL)) != MP_OKAY) {
-      printf("Error: %d\n", err);
-      return EXIT_FAILURE;
-   }
-   
-   /* at this point num1/num2/num3 are ready */
-   
-   /* free them */
-   mp_clear_multi(&num1, &num2, &num3, NULL);
-   
-   return EXIT_SUCCESS;
-}
-\end{verbatim}
-\end{small}
-
-Note how both lists are terminated with the \textbf{NULL} variable.  This indicates to the algorithms to stop fetching parameters off
-of the stack.  If it is not present the functions will most likely cause a segmentation fault.  
-
-\index{bn\_mp\_multi.c}
-\vspace{+3mm}\begin{small}
-\hspace{-5.1mm}{\bf File}: bn\_mp\_multi.c
-\vspace{-3mm}
-\begin{alltt}
-016   #include <stdarg.h>
-017   
-018   int mp_init_multi(mp_int *mp, ...) 
-019   \{
-020       mp_err res = MP_OKAY;      /* Assume ok until proven otherwise */
-021       int n = 0;                 /* Number of ok inits */
-022       mp_int* cur_arg = mp;
-023       va_list args;
-024   
-025       va_start(args, mp);        /* init args to next argument from caller */
-026       while (cur_arg != NULL) \{
-027           if (mp_init(cur_arg) != MP_OKAY) \{
-028               /* Oops - error! Back-track and mp_clear what we already
-029                  succeeded in init-ing, then return error.
-030               */
-031               va_list clean_args;
-032               
-033               /* end the current list */
-034               va_end(args);
-035               
-036               /* now start cleaning up */            
-037               cur_arg = mp;
-038               va_start(clean_args, mp);
-039               while (n--) \{
-040                   mp_clear(cur_arg);
-041                   cur_arg = va_arg(clean_args, mp_int*);
-042               \}
-043               va_end(clean_args);
-044               res = MP_MEM;
-045               break;
-046           \}
-047           n++;
-048           cur_arg = va_arg(args, mp_int*);
-049       \}
-050       va_end(args);
-051       return res;                /* Assumed ok, if error flagged above. */
-052   \}
-053   
-054   void mp_clear_multi(mp_int *mp, ...) 
-055   \{
-056       mp_int* next_mp = mp;
-057       va_list args;
-058       va_start(args, mp);
-059       while (next_mp != NULL) \{
-060           mp_clear(next_mp);
-061           next_mp = va_arg(args, mp_int*);
-062       \}
-063       va_end(args);
-064   \}
-\end{alltt}
-\end{small}
-
-Both routines are implemented in the same source file since they are typically used in conjunction with each other.  
-
-\section{Maintenance}
-A small collection of mp\_int maintenance functions will also prove useful.  
-
-\subsection{Augmenting Integer Precision}
-When storing a value in an mp\_int sufficient digits must be available to accommodate the entire value without
-loss of precision.  Quite often the size of the array given by the \textbf{alloc} member is large enough to simply
-increase the \textbf{used} digit count.  However, when the size of the array is too small it must be re-sized 
-appropriately to accommodate the result.  The mp\_grow algorithm will provide this functionality.
-
-\begin{figure}[here]
-\begin{center}
-\begin{tabular}{l}
-\hline Algorithm \textbf{mp\_grow}. \\
-\textbf{Input}.   An mp\_int $a$ and an integer $b$. \\
-\textbf{Output}.  $a$ is expanded to accommodate $b$ digits. \\
-\hline \\
-1.  if $a.alloc \ge b$ then return(\textit{MP\_OKAY}) \\
-2.  $u \leftarrow b\mbox{ (mod }MP\_PREC\mbox{)}$ \\
-3.  $v \leftarrow b + 2 \cdot MP\_PREC - u$ \\
-4.  Re-Allocate the array of digits $a$ to size $v$ \\
-5.  If the allocation failed then return(\textit{MP\_MEM}). \\
-6.  for n from a.alloc to $v - 1$ do  \\
-\hspace{+3mm}6.1  $a_n \leftarrow 0$ \\
-7.  $a.alloc \leftarrow v$ \\
-8.  Return(\textit{MP\_OKAY}) \\
-\hline
-\end{tabular}
-\end{center}
-\caption{Algorithm mp\_grow}
-\end{figure}
-
-\textbf{Algorithm mp\_grow.}
-Step one will prevent a re-allocation from being performed if it was not required.  This is useful to prevent mp\_ints
-from growing excessively in code that erroneously calls mp\_grow.  Similar to mp\_init\_size the requested digit count
-is padded to provide more digits than requested.  
-
-In step four it is assumed that the reallocation leaves the lower $a.alloc$ digits intact.  This is much akin to how the 
-\textit{realloc} function from the standard C library works.  Since the newly allocated digits are assumed to contain
-undefined values they are also initially zeroed.
-
-\index{bn\_mp\_grow.c}
-\vspace{+3mm}\begin{small}
-\hspace{-5.1mm}{\bf File}: bn\_mp\_grow.c
-\vspace{-3mm}
-\begin{alltt}
-016   
-017   /* grow as required */
-018   int
-019   mp_grow (mp_int * a, int size)
-020   \{
-021     int     i;
-022   
-023     /* if the alloc size is smaller alloc more ram */
-024     if (a->alloc < size) \{
-025       /* ensure there are always at least MP_PREC digits extra on top */
-026       size += (MP_PREC * 2) - (size & (MP_PREC - 1));     
-027   
-028       a->dp = OPT_CAST realloc (a->dp, sizeof (mp_digit) * size);
-029       if (a->dp == NULL) \{
-030         return MP_MEM;
-031       \}
-032   
-033       /* zero excess digits */
-034       i        = a->alloc;
-035       a->alloc = size;
-036       for (; i < a->alloc; i++) \{
-037         a->dp[i] = 0;
-038       \}
-039     \}
-040     return MP_OKAY;
-041   \}
-\end{alltt}
-\end{small}
-
-The first step is to see if we actually need to perform a re-allocation at all.  This is tested for on line 
-24.  Similar to mp\_init\_size the same code on line 26 was used to resize the 
-digits requested.  A simple for loop from line 34 to line 38 will zero all digits that were above the 
-old \textbf{alloc} limit to make sure the integer is in a known state.
-
-\subsection{Clamping Excess Digits}
-When a function anticipates a result will be $n$ digits it is simpler to assume this is true within the body of 
-the function.  For example, a multiplication of an $i$ digit number by a $j$ digit number produces a result of at most 
-$i + j$ digits.  It is entirely possible that the result is $i + j - 1$ though, with no final carry into the last 
-position.  However, suppose the destination had to be first expanded (\textit{via mp\_grow}) to accommodate $i + j - 1$
-digits then further expanded to accommodate the final carry.  That would be a considerable waste of time since heap
-operations are relatively slow.
-
-The ideal solution is to always assume the result is $i + j$ and fix up the \textbf{used} count after the function
-terminates.  This way a single heap operation (\textit{at most}) is required.  However, if the result was not checked
-there would be an excess high order zero digit.  
-
-For example, suppose the product of two integers was $x_n = (0x_{n-1}x_{n-2}...x_0)_{\beta}$.  The leading zero digit 
-will not contribute to the precision of the result.  In fact, through subsequent operations more leading zero digits would
-accumulate to the point the size of the integer would be prohibitive.  As a result even though the precision is very 
-low the representation is excessively large.  
-
-The mp\_clamp algorithm is designed to solve this very problem.  It will trim leading zeros by decrementing the 
-\textbf{used} count until a non-zero leading digit is found.  Also in this system, zero is considered to be a positive 
-number which means that if the \textbf{used} count is decremented to zero the sign must be set to \textbf{MP\_ZPOS}.
-
-\begin{figure}[here]
-\begin{center}
-\begin{tabular}{l}
-\hline Algorithm \textbf{mp\_clamp}. \\
-\textbf{Input}.   An mp\_int $a$ \\
-\textbf{Output}.  Any excess leading zero digits of $a$ are removed \\
-\hline \\
-1.  while $a.used > 0$ and $a_{a.used - 1} = 0$ do \\
-\hspace{+3mm}1.1  $a.used \leftarrow a.used - 1$ \\
-2.  if $a.used = 0$ then do \\
-\hspace{+3mm}2.1  $a.sign \leftarrow MP\_ZPOS$ \\
-\hline \\
-\end{tabular}
-\end{center}
-\caption{Algorithm mp\_clamp}
-\end{figure}
-
-\textbf{Algorithm mp\_clamp.}
-As can be expected this algorithm is very simple.  The loop on step one is expected to iterate only once or twice at
-the most.  For example, this will happen in cases where there is not a carry to fill the last position.  Step two fixes the sign for 
-when all of the digits are zero to ensure that the mp\_int is valid at all times.
-
-\index{bn\_mp\_clamp.c}
-\vspace{+3mm}\begin{small}
-\hspace{-5.1mm}{\bf File}: bn\_mp\_clamp.c
-\vspace{-3mm}
-\begin{alltt}
-016   
-017   /* trim unused digits 
-018    *
-019    * This is used to ensure that leading zero digits are
-020    * trimed and the leading "used" digit will be non-zero
-021    * Typically very fast.  Also fixes the sign if there
-022    * are no more leading digits
-023    */
-024   void
-025   mp_clamp (mp_int * a)
-026   \{
-027     while (a->used > 0 && a->dp[a->used - 1] == 0) \{
-028       --(a->used);
-029     \}
-030     if (a->used == 0) \{
-031       a->sign = MP_ZPOS;
-032     \}
-033   \}
-\end{alltt}
-\end{small}
-
-Note on line 27 how the test for the \textbf{used} count is made on the left of the \&\& operator.  In the C programming
-language the terms to \&\& are evaluated left to right with a boolean short-circuit if any condition fails.  This is 
-important since if the \textbf{used} count is zero the test on the right would fetch below the array.  That is obviously 
-undesirable.  The parentheses on line 28 are used to make sure the \textbf{used} count is decremented and not
-the pointer ``a''.  
-
-\section*{Exercises}
-\begin{tabular}{cl}
-$\left [ 1 \right ]$ & Discuss the relevance of the \textbf{used} member of the mp\_int structure. \\
-                     & \\
-$\left [ 1 \right ]$ & Discuss the consequences of not using padding when performing allocations.  \\
-                     & \\
-$\left [ 2 \right ]$ & Estimate an ideal value for \textbf{MP\_PREC} when performing 1024-bit RSA \\
-                     & encryption when $\beta = 2^{28}$.  \\
-                     & \\
-$\left [ 1 \right ]$ & Discuss the relevance of the algorithm mp\_clamp.  What does it prevent? \\
-                     & \\
-$\left [ 1 \right ]$ & Give an example of when the algorithm  mp\_init\_copy might be useful. \\
-                     & \\
-\end{tabular}
-
-
-\chapter{Basic Operations}
-\section{Copying an Integer}
-After the various house-keeping routines are in place, simple algorithms can be designed to take advantage of them.  Being able
-to make a verbatim copy of an integer is a very useful function to have.  To copy an integer the mp\_copy algorithm will be used.
-
-\newpage\begin{figure}[here]
-\begin{center}
-\begin{tabular}{l}
-\hline Algorithm \textbf{mp\_copy}. \\
-\textbf{Input}.  An mp\_int $a$ and $b$. \\
-\textbf{Output}.  Store a copy of $a$ in $b$. \\
-\hline \\
-1.  Check if $a$ and $b$ point to the same location in memory. \\
-2.  If true then return(\textit{MP\_OKAY}). \\
-3.  If $b.alloc < a.used$ then grow $b$ to $a.used$ digits.  (\textit{mp\_grow}) \\
-4.  If failed to grow then return(\textit{MP\_MEM}). \\
-5.  for $n$ from 0 to $a.used - 1$ do \\
-\hspace{3mm}5.1  $b_{n} \leftarrow a_{n}$ \\
-6.  if $a.used < b.used$ then \\ 
-\hspace{3mm}6.1.  for $n$ from $a.used$ to $b.used - 1$ do \\
-\hspace{6mm}6.1.1  $b_{n} \leftarrow 0$ \\
-7.  $b.used \leftarrow a.used$ \\
-8.  $b.sign \leftarrow a.sign$ \\
-9.  return(\textit{MP\_OKAY}) \\
-\hline
-\end{tabular}
-\end{center}
-\caption{Algorithm mp\_copy}
-\end{figure}
-
-\textbf{Algorithm mp\_copy.}
-Step 1 and 2 make sure that the two mp\_ints are unique.  This allows the user to call the copy function with
-potentially the same input and not waste time.  Step 3 and 4 ensure that the destination is large enough to
-hold a copy of the input $a$.  Note that the \textbf{used} member of $b$ may be smaller than the \textbf{used}
-member of $a$ but a memory re-allocation is only required if the \textbf{alloc} member of $b$ is smaller.  This
-prevents trivial memory reallocations.
-
-Step 5 copies the digits from $a$ to $b$ while step 6 ensures that if initially $\vert b \vert > \vert a \vert$,
-the more significant digits of $b$ will be zeroed.  Finally steps 7 and 8 copy the \textbf{used} and \textbf{sign} members over 
-which completes the copy operation.
-
-\index{bn\_mp\_copy.c}
-\vspace{+3mm}\begin{small}
-\hspace{-5.1mm}{\bf File}: bn\_mp\_copy.c
-\vspace{-3mm}
-\begin{alltt}
-016   
-017   /* copy, b = a */
-018   int
-019   mp_copy (mp_int * a, mp_int * b)
-020   \{
-021     int     res, n;
-022   
-023     /* if dst == src do nothing */
-024     if (a == b) \{
-025       return MP_OKAY;
-026     \}
-027   
-028     /* grow dest */
-029     if ((res = mp_grow (b, a->used)) != MP_OKAY) \{
-030       return res;
-031     \}
-032   
-033     /* zero b and copy the parameters over */
-034     \{
-035       register mp_digit *tmpa, *tmpb;
-036   
-037       /* pointer aliases */
-038       tmpa = a->dp;
-039       tmpb = b->dp;
-040   
-041       /* copy all the digits */
-042       for (n = 0; n < a->used; n++) \{
-043         *tmpb++ = *tmpa++;
-044       \}
-045   
-046       /* clear high digits */
-047       for (; n < b->used; n++) \{
-048         *tmpb++ = 0;
-049       \}
-050     \}
-051     b->used = a->used;
-052     b->sign = a->sign;
-053     return MP_OKAY;
-054   \}
-\end{alltt}
-\end{small}
-
-Source lines 23-31 do the initial house keeping.  That is to see if the input is unique and if so to 
-make sure there is enough room.  If not enough space is available it returns the error and leaves the destination variable
-intact.
-
-The inner loop of the copy operation is contained between lines 34 and 50.  Many LibTomMath routines are designed with this source code style
-in mind, making aliases to shorten lengthy pointers (\textit{see line 38 and 39}) for rapid use.  Also the
-use of nested braces creates a simple way to denote various portions of code that reside on various work levels.  Here, the copy loop is at the 
-$O(n)$ level.  
-
-\section{Zeroing an Integer}
-Resetting an mp\_int to the default state is a common step in many algorithms.  The mp\_zero algorithm will be the algorithm used to
-perform this task.
-
-\begin{figure}[here]
-\begin{center}
-\begin{tabular}{l}
-\hline Algorithm \textbf{mp\_zero}. \\
-\textbf{Input}.   An mp\_int $a$ \\
-\textbf{Output}.  Zero the contents of $a$ \\
-\hline \\
-1.  $a.used \leftarrow 0$ \\
-2.  $a.sign \leftarrow$ MP\_ZPOS \\
-3.  for $n$ from 0 to $a.alloc - 1$ do \\
-\hspace{3mm}3.1  $a_n \leftarrow 0$ \\
-\hline
-\end{tabular}
-\end{center}
-\caption{Algorithm mp\_zero}
-\end{figure}
-
-\textbf{Algorithm mp\_zero.}
-This algorithm simply resets a mp\_int to the default state.  
-
-\index{bn\_mp\_zero.c}
-\vspace{+3mm}\begin{small}
-\hspace{-5.1mm}{\bf File}: bn\_mp\_zero.c
-\vspace{-3mm}
-\begin{alltt}
-016   
-017   /* set to zero */
-018   void
-019   mp_zero (mp_int * a)
-020   \{
-021     a->sign = MP_ZPOS;
-022     a->used = 0;
-023     memset (a->dp, 0, sizeof (mp_digit) * a->alloc);
-024   \}
-\end{alltt}
-\end{small}
-
-After the function is completed, all of the digits are zeroed, the \textbf{used} count is zeroed and the 
-\textbf{sign} variable is set to \textbf{MP\_ZPOS}.
-
-\section{Sign Manipulation}
-\subsection{Absolute Value}
-With the mp\_int representation of an integer, calculating the absolute value is trivial.  The mp\_abs algorithm will compute
-the absolute value of an mp\_int.
-
-\begin{figure}[here]
-\begin{center}
-\begin{tabular}{l}
-\hline Algorithm \textbf{mp\_abs}. \\
-\textbf{Input}.   An mp\_int $a$ \\
-\textbf{Output}.  Computes $b = \vert a \vert$ \\
-\hline \\
-1.  Copy $a$ to $b$.  (\textit{mp\_copy}) \\
-2.  If the copy failed return(\textit{MP\_MEM}). \\
-3.  $b.sign \leftarrow MP\_ZPOS$ \\
-4.  Return(\textit{MP\_OKAY}) \\
-\hline
-\end{tabular}
-\end{center}
-\caption{Algorithm mp\_abs}
-\end{figure}
-
-\textbf{Algorithm mp\_abs.}
-This algorithm computes the absolute value of an mp\_int input.  As can be expected the algorithm is very trivial.
-
-\index{bn\_mp\_abs.c}
-\vspace{+3mm}\begin{small}
-\hspace{-5.1mm}{\bf File}: bn\_mp\_abs.c
-\vspace{-3mm}
-\begin{alltt}
-016   
-017   /* b = |a| 
-018    *
-019    * Simple function copies the input and fixes the sign to positive
-020    */
-021   int
-022   mp_abs (mp_int * a, mp_int * b)
-023   \{
-024     int     res;
-025     if ((res = mp_copy (a, b)) != MP_OKAY) \{
-026       return res;
-027     \}
-028     b->sign = MP_ZPOS;
-029     return MP_OKAY;
-030   \}
-\end{alltt}
-\end{small}
-
-\subsection{Integer Negation}
-With the mp\_int representation of an integer, calculating the negation is also trivial.  The mp\_neg algorithm will compute
-the negative of an mp\_int input.
-
-\newpage\begin{figure}[here]
-\begin{center}
-\begin{tabular}{l}
-\hline Algorithm \textbf{mp\_neg}. \\
-\textbf{Input}.   An mp\_int $a$ \\
-\textbf{Output}.  Computes $b = -a$ \\
-\hline \\
-1.  Copy $a$ to $b$.  (\textit{mp\_copy}) \\
-2.  If the copy failed return(\textit{MP\_MEM}). \\
-3.  If $a.sign = MP\_ZPOS$ then do \\
-\hspace{3mm}3.1  $b.sign = MP\_NEG$. \\
-4.  else do \\
-\hspace{3mm}4.1  $b.sign = MP\_ZPOS$. \\
-5.  Return(\textit{MP\_OKAY}) \\
-\hline
-\end{tabular}
-\end{center}
-\caption{Algorithm mp\_neg}
-\end{figure}
-
-\textbf{Algorithm mp\_neg.}
-This algorithm computes the negation of an input.  
-
-\index{bn\_mp\_neg.c}
-\vspace{+3mm}\begin{small}
-\hspace{-5.1mm}{\bf File}: bn\_mp\_neg.c
-\vspace{-3mm}
-\begin{alltt}
-016   
-017   /* b = -a */
-018   int
-019   mp_neg (mp_int * a, mp_int * b)
-020   \{
-021     int     res;
-022     if ((res = mp_copy (a, b)) != MP_OKAY) \{
-023       return res;
-024     \}
-025     b->sign = (a->sign == MP_ZPOS) ? MP_NEG : MP_ZPOS;
-026     return MP_OKAY;
-027   \}
-\end{alltt}
-\end{small}
-
-\section{Small Constants}
-\subsection{Setting Small Constants}
-Often a mp\_int must be set to a relatively small value such as $1$ or $2$.  For these cases the mp\_set algorithm is useful.
-
-\newpage\begin{figure}
-\begin{center}
-\begin{tabular}{l}
-\hline Algorithm \textbf{mp\_set}. \\
-\textbf{Input}.   An mp\_int $a$ and a digit $b$ \\
-\textbf{Output}.  Make $a$ equivalent to $b$ \\
-\hline \\
-1.  Zero $a$ (\textit{mp\_zero}). \\
-2.  $a_0 \leftarrow b \mbox{ (mod }\beta\mbox{)}$ \\
-3.  $a.used \leftarrow  \left \lbrace \begin{array}{ll}
-                              1 &  \mbox{if }a_0 > 0 \\
-                              0 &  \mbox{if }a_0 = 0 
-                              \end{array} \right .$ \\
-\hline                              
-\end{tabular}
-\end{center}
-\caption{Algorithm mp\_set}
-\end{figure}
-
-\textbf{Algorithm mp\_set.}
-This algorithm sets a mp\_int to a small single digit value.  Step number 1 ensures that the integer is reset to the default state.  The
-single digit is set (\textit{modulo $\beta$}) and the \textbf{used} count is adjusted accordingly.
-
-\index{bn\_mp\_set.c}
-\vspace{+3mm}\begin{small}
-\hspace{-5.1mm}{\bf File}: bn\_mp\_set.c
-\vspace{-3mm}
-\begin{alltt}
-016   
-017   /* set to a digit */
-018   void
-019   mp_set (mp_int * a, mp_digit b)
-020   \{
-021     mp_zero (a);
-022     a->dp[0] = b & MP_MASK;
-023     a->used = (a->dp[0] != 0) ? 1 : 0;
-024   \}
-\end{alltt}
-\end{small}
-
-Line 21 calls mp\_zero() to clear the mp\_int and reset the sign.  Line 22 copies the digit 
-into the least significant location.  Note the usage of a new constant \textbf{MP\_MASK}.  This constant is used to quickly
-reduce an integer modulo $\beta$.  Since $\beta$ is of the form $2^k$ for any suitable $k$ it suffices to perform a binary AND with 
-$MP\_MASK = 2^k - 1$ to perform the reduction.  Finally line 23 will set the \textbf{used} member with respect to the 
-digit actually set. This function will always make the integer positive.
-
-One important limitation of this function is that it will only set one digit.  The size of a digit is not fixed, so source that uses 
-this function should take that into account.  In practice this means that only trivially small constants can be set using this function.
-
-\subsection{Setting Large Constants}
-To overcome the limitations of the mp\_set algorithm the mp\_set\_int algorithm is provided.  It accepts a ``long''
-data type as input and will always treat it as a 32-bit integer.
-
-\begin{figure}[here]
-\begin{center}
-\begin{tabular}{l}
-\hline Algorithm \textbf{mp\_set\_int}. \\
-\textbf{Input}.   An mp\_int $a$ and a ``long'' integer $b$ \\
-\textbf{Output}.  Make $a$ equivalent to $b$ \\
-\hline \\
-1.  Zero $a$ (\textit{mp\_zero}) \\
-2.  for $n$ from 0 to 7 do \\
-\hspace{3mm}2.1  $a \leftarrow a \cdot 16$ (\textit{mp\_mul2d}) \\
-\hspace{3mm}2.2  $u \leftarrow \lfloor b / 2^{4(7 - n)} \rfloor \mbox{ (mod }16\mbox{)}$\\
-\hspace{3mm}2.3  $a_0 \leftarrow a_0 + u$ \\
-\hspace{3mm}2.4  $a.used \leftarrow a.used + 1$ \\
-3.  Clamp excess used digits (\textit{mp\_clamp}) \\
-\hline
-\end{tabular}
-\end{center}
-\caption{Algorithm mp\_set\_int}
-\end{figure}
-
-\textbf{Algorithm mp\_set\_int.}
-The algorithm performs eight iterations of a simple loop where in each iteration four bits from the source are added to the 
-mp\_int.  Step 2.1 will multiply the current result by sixteen making room for four more bits in the less significant positions.  In step 2.2 the
-next four bits from the source are extracted and are added to the mp\_int. The \textbf{used} digit count is 
-incremented to reflect the addition.  The \textbf{used} digit counter is incremented since if any of the leading digits were zero the mp\_int would have
-zero digits used and the newly added four bits would be ignored.
-
-Excess zero digits are trimmed in steps 2.1 and 3 by using higher level algorithms mp\_mul2d and mp\_clamp.
-
-\index{bn\_mp\_set\_int.c}
-\vspace{+3mm}\begin{small}
-\hspace{-5.1mm}{\bf File}: bn\_mp\_set\_int.c
-\vspace{-3mm}
-\begin{alltt}
-016   
-017   /* set a 32-bit const */
-018   int
-019   mp_set_int (mp_int * a, unsigned int b)
-020   \{
-021     int     x, res;
-022   
-023     mp_zero (a);
-024     /* set four bits at a time */
-025     for (x = 0; x < 8; x++) \{
-026       /* shift the number up four bits */
-027       if ((res = mp_mul_2d (a, 4, a)) != MP_OKAY) \{
-028         return res;
-029       \}
-030   
-031       /* OR in the top four bits of the source */
-032       a->dp[0] |= (b >> 28) & 15;
-033   
-034       /* shift the source up to the next four bits */
-035       b <<= 4;
-036   
-037       /* ensure that digits are not clamped off */
-038       a->used += 1;
-039     \}
-040     mp_clamp (a);
-041     return MP_OKAY;
-042   \}
-\end{alltt}
-\end{small}
-
-This function sets four bits of the number at a time to handle all practical \textbf{DIGIT\_BIT} sizes.  The weird
-addition on line 38 ensures that the newly added in bits are added to the number of digits.  While it may not 
-seem obvious as to why the digit counter does not grow exceedingly large it is because of the shift on line 27 
-as well as the  call to mp\_clamp() on line 40.  Both functions will clamp excess leading digits which keeps 
-the number of used digits low.
-
-\section{Comparisons}
-\subsection{Unsigned Comparisons}
-Comparing a multiple precision integer is performed with the exact same algorithm used to compare two decimal numbers.  For example,
-to compare $1,234$ to $1,264$ the digits are extracted by their positions.  That is we compare $1 \cdot 10^3 + 2 \cdot 10^2 + 3 \cdot 10^1 + 4 \cdot 10^0$
-to $1 \cdot 10^3 + 2 \cdot 10^2 + 6 \cdot 10^1 + 4 \cdot 10^0$ by comparing single digits at a time starting with the highest magnitude 
-positions.  If any leading digit of one integer is greater than a digit in the same position of another integer then obviously it must be greater.  
-
-The first comparison routine that will be developed is the unsigned magnitude compare which will perform a comparison based on the digits of two
-mp\_int variables alone.  It will ignore the sign of the two inputs.  Such a function is useful when an absolute comparison is required or if the 
-signs are known to agree in advance.
-
-To facilitate working with the results of the comparison functions three constants are required.  
-
-\begin{figure}[here]
-\begin{center}
-\begin{tabular}{|r|l|}
-\hline \textbf{Constant} & \textbf{Meaning} \\
-\hline \textbf{MP\_GT} & Greater Than \\
-\hline \textbf{MP\_EQ} & Equal To \\
-\hline \textbf{MP\_LT} & Less Than \\
-\hline
-\end{tabular}
-\end{center}
-\caption{Comparison Return Codes}
-\end{figure}
-
-\begin{figure}[here]
-\begin{center}
-\begin{tabular}{l}
-\hline Algorithm \textbf{mp\_cmp\_mag}. \\
-\textbf{Input}.   Two mp\_ints $a$ and $b$.  \\
-\textbf{Output}.  Unsigned comparison results ($a$ to the left of $b$). \\
-\hline \\
-1.  If $a.used > b.used$ then return(\textit{MP\_GT}) \\
-2.  If $a.used < b.used$ then return(\textit{MP\_LT}) \\
-3.  for n from $a.used - 1$ to 0 do \\
-\hspace{+3mm}3.1  if $a_n > b_n$ then return(\textit{MP\_GT}) \\
-\hspace{+3mm}3.2  if $a_n < b_n$ then return(\textit{MP\_LT}) \\
-4.  Return(\textit{MP\_EQ}) \\
-\hline
-\end{tabular}
-\end{center}
-\caption{Algorithm mp\_cmp\_mag}
-\end{figure}
-
-\textbf{Algorithm mp\_cmp\_mag.}
-By saying ``$a$ to the left of $b$'' it is meant that the comparison is with respect to $a$, that is if $a$ is greater than $b$ it will return
-\textbf{MP\_GT} and similar with respect to when $a = b$ and $a < b$.  The first two steps compare the number of digits used in both $a$ and $b$.  
-Obviously if the digit counts differ there would be an imaginary zero digit in the smaller number where the leading digit of the larger number is.  
-If both have the same number of digits then the actual digits themselves must be compared starting at the leading digit.  
-
-By step three both inputs must have the same number of digits so it is safe to start from either $a.used - 1$ or $b.used - 1$ and count down to
-the zero'th digit.  If after all of the digits have been compared, no difference is found, the algorithm returns \textbf{MP\_EQ}.
-
-\index{bn\_mp\_cmp\_mag.c}
-\vspace{+3mm}\begin{small}
-\hspace{-5.1mm}{\bf File}: bn\_mp\_cmp\_mag.c
-\vspace{-3mm}
-\begin{alltt}
-016   
-017   /* compare maginitude of two ints (unsigned) */
-018   int
-019   mp_cmp_mag (mp_int * a, mp_int * b)
-020   \{
-021     int     n;
-022   
-023     /* compare based on # of non-zero digits */
-024     if (a->used > b->used) \{
-025       return MP_GT;
-026     \} 
-027     
-028     if (a->used < b->used) \{
-029       return MP_LT;
-030     \}
-031   
-032     /* compare based on digits  */
-033     for (n = a->used - 1; n >= 0; n--) \{
-034       if (a->dp[n] > b->dp[n]) \{
-035         return MP_GT;
-036       \} 
-037       
-038       if (a->dp[n] < b->dp[n]) \{
-039         return MP_LT;
-040       \}
-041     \}
-042     return MP_EQ;
-043   \}
-\end{alltt}
-\end{small}
-
-The two if statements on lines 24 and 28 compare the number of digits in the two inputs.  These two are performed before all of the digits
-are compared since it is a very cheap test to perform and can potentially save considerable time.  The implementation given is also not valid 
-without those two statements.  $b.alloc$ may be smaller than $a.used$, meaning that undefined values will be read from $b$ past the end of the 
-array of digits.
-
-\subsection{Signed Comparisons}
-Comparing with sign considerations is also fairly critical in several routines (\textit{division for example}).  Based on an unsigned magnitude 
-comparison a trivial signed comparison algorithm can be written.
-
-\begin{figure}[here]
-\begin{center}
-\begin{tabular}{l}
-\hline Algorithm \textbf{mp\_cmp}. \\
-\textbf{Input}.   Two mp\_ints $a$ and $b$ \\
-\textbf{Output}.  Signed Comparison Results ($a$ to the left of $b$) \\
-\hline \\
-1.  if $a.sign = MP\_NEG$ and $b.sign = MP\_ZPOS$ then return(\textit{MP\_LT}) \\
-2.  if $a.sign = MP\_ZPOS$ and $b.sign = MP\_NEG$ then return(\textit{MP\_GT}) \\
-3.  if $a.sign = MP\_NEG$ then \\
-\hspace{+3mm}3.1  Return the unsigned comparison of $b$ and $a$ (\textit{mp\_cmp\_mag}) \\
-4   Otherwise \\
-\hspace{+3mm}4.1  Return the unsigned comparison of $a$ and $b$ \\
-\hline
-\end{tabular}
-\end{center}
-\caption{Algorithm mp\_cmp}
-\end{figure}
-
-\textbf{Algorithm mp\_cmp.}
-The first two steps compare the signs of the two inputs.  If the signs do not agree then it can return right away with the appropriate 
-comparison code.  When the signs are equal the digits of the inputs must be compared to determine the correct result.  In step 
-three the unsigned comparison flips the order of the arguments since they are both negative.  For instance, if $-a > -b$ then 
-$\vert a \vert < \vert b \vert$.  Step number four will compare the two when they are both positive.
-
-\index{bn\_mp\_cmp.c}
-\vspace{+3mm}\begin{small}
-\hspace{-5.1mm}{\bf File}: bn\_mp\_cmp.c
-\vspace{-3mm}
-\begin{alltt}
-016   
-017   /* compare two ints (signed)*/
-018   int
-019   mp_cmp (mp_int * a, mp_int * b)
-020   \{
-021     /* compare based on sign */
-022     if (a->sign == MP_NEG && b->sign == MP_ZPOS) \{
-023       return MP_LT;
-024     \} 
-025     
-026     if (a->sign == MP_ZPOS && b->sign == MP_NEG) \{
-027       return MP_GT;
-028     \}
-029     
-030     /* compare digits */
-031     if (a->sign == MP_NEG) \{
-032        /* if negative compare opposite direction */
-033        return mp_cmp_mag(b, a);
-034     \} else \{
-035        return mp_cmp_mag(a, b);
-036     \}
-037   \}
-\end{alltt}
-\end{small}
-
-The two if statements on lines 22 and 26 perform the initial sign comparison.  If the signs are not equal then whichever
-has the positive sign is larger.   At line 31, the inputs are compared based on magnitudes.  If the signs were both negative then 
-the unsigned comparison is performed in the opposite direction (\textit{line 33}).  Otherwise, the signs are assumed to 
-be both positive and a forward direction unsigned comparison is performed.
-
-\section*{Exercises}
-\begin{tabular}{cl}
-$\left [ 2 \right ]$ & Modify algorithm mp\_set\_int to accept as input a variable length array of bits. \\
-                     & \\
-$\left [ 3 \right ]$ & Give the probability that algorithm mp\_cmp\_mag will have to compare $k$ digits  \\
-                     & of two random integers (of equal magnitude) before a difference is found. \\
-                     & \\
-$\left [ 1 \right ]$ & Suggest a simple method to speed up the implementation of mp\_cmp\_mag based  \\
-                     & on the observations made in the previous problem. \\
-                     &
-\end{tabular}
-
-\chapter{Basic Arithmetic}
-\section{Building Blocks}
-At this point algorithms for initialization, clearing, zeroing, copying, comparing and setting small constants have been 
-established.  The next logical set of algorithms to develop are addition, subtraction and digit shifting algorithms.  These 
-algorithms make use of the lower level algorithms and are the crucial building blocks for the multiplication algorithms.  It is very important 
-that these algorithms are highly optimized.  On their own they are simple $O(n)$ algorithms but they can be called from higher level algorithms 
-which easily places them at $O(n^2)$ or even $O(n^3)$ work levels.  
-
-All nine algorithms within this chapter make use of the logical bit shift operations denoted by $<<$ and $>>$ for left and right 
-logical shifts respectively.  A logical shift is analogous to sliding the decimal point of radix-10 representations.  For example, the real 
-number $0.9345$ is equivalent to $93.45\%$ which is found by sliding the decimal point two places to the right (\textit{multiplying by $10^2$}).  
-Mathematically a logical shift is equivalent to a division or multiplication by a power of two.  
-For example, $a << k = a \cdot 2^k$ while $a >> k = \lfloor a/2^k \rfloor$.
-
-One significant difference between a logical shift and the way decimals are shifted is that digits below the zero'th position are removed
-from the number.  For example, consider $1101_2 >> 1$ using decimal notation this would produce $110.1_2$.  However, with a logical shift the 
-result is $110_2$.  
-
-\section{Addition and Subtraction}
-In normal fixed precision arithmetic negative numbers are easily represented by subtraction from the modulus.  For example, with 32-bit integers
-$a - b\mbox{ (mod }2^{32}\mbox{)}$ is the same as $a + (2^{32} - b) \mbox{ (mod }2^{32}\mbox{)}$  since $2^{32} \equiv 0 \mbox{ (mod }2^{32}\mbox{)}$.  
-As a result subtraction can be performed with a trivial series of logical operations and an addition.
-
-However, in multiple precision arithmetic negative numbers are not represented in the same way.  Instead a sign flag is used to keep track of the
-sign of the integer.  As a result signed addition and subtraction are actually implemented as conditional usage of lower level addition or 
-subtraction algorithms with the sign fixed up appropriately.
-
-The lower level algorithms will add or subtract integers without regard to the sign flag.  That is they will add or subtract the magnitude of
-the integers respectively.
-
-\subsection{Low Level Addition}
-An unsigned addition of multiple precision integers is performed with the same long-hand algorithm used to add decimal numbers.  That is to add the 
-trailing digits first and propagate the resulting carry upwards.  Since this is a lower level algorithm the name will have a ``s\_'' prefix.  
-Historically that convention stems from the MPI library where ``s\_'' stood for static functions that were hidden from the developer entirely.
-
-\newpage
-\begin{figure}[!here]
-\begin{center}
-\begin{small}
-\begin{tabular}{l}
-\hline Algorithm \textbf{s\_mp\_add}. \\
-\textbf{Input}.   Two mp\_ints $a$ and $b$ \\
-\textbf{Output}.  The unsigned addition $c = \vert a \vert + \vert b \vert$. \\
-\hline \\
-1.  if $a.used > b.used$ then \\
-\hspace{+3mm}1.1  $min \leftarrow b.used$ \\
-\hspace{+3mm}1.2  $max \leftarrow a.used$ \\
-\hspace{+3mm}1.3  $x   \leftarrow a$ \\
-2.  else  \\
-\hspace{+3mm}2.1  $min \leftarrow a.used$ \\
-\hspace{+3mm}2.2  $max \leftarrow b.used$ \\
-\hspace{+3mm}2.3  $x   \leftarrow b$ \\
-3.  If $c.alloc < max + 1$ then grow $c$ to hold at least $max + 1$ digits (\textit{mp\_grow}) \\
-4.  If failed to grow $c$ return(\textit{MP\_MEM}) \\
-5.  $oldused \leftarrow c.used$ \\
-6.  $c.used \leftarrow max + 1$ \\
-7.  $u \leftarrow 0$ \\
-8.  for $n$ from $0$ to $min - 1$ do \\
-\hspace{+3mm}8.1  $c_n \leftarrow a_n + b_n + u$ \\
-\hspace{+3mm}8.2  $u \leftarrow c_n >> lg(\beta)$ \\
-\hspace{+3mm}8.3  $c_n \leftarrow c_n \mbox{ (mod }\beta\mbox{)}$ \\
-9.  if $min \ne max$ then do \\
-\hspace{+3mm}9.1  for $n$ from $min$ to $max - 1$ do \\
-\hspace{+6mm}9.1.1  $c_n \leftarrow x_n + u$ \\
-\hspace{+6mm}9.1.2  $u \leftarrow c_n >> lg(\beta)$ \\
-\hspace{+6mm}9.1.3  $c_n \leftarrow c_n \mbox{ (mod }\beta\mbox{)}$ \\
-10.  $c_{max} \leftarrow u$ \\
-11.  if $oldused > max$ then \\
-\hspace{+3mm}11.1  for $n$ from $max + 1$ to $oldused - 1$ do \\
-\hspace{+6mm}11.1.1  $c_n \leftarrow 0$ \\
-12.  Clamp excess digits in $c$.  (\textit{mp\_clamp}) \\
-13.  Return(\textit{MP\_OKAY}) \\
-\hline
-\end{tabular}
-\end{small}
-\end{center}
-\caption{Algorithm s\_mp\_add}
-\end{figure}
-
-\textbf{Algorithm s\_mp\_add.}
-This algorithm is loosely based on algorithm 14.7 of HAC \cite[pp. 594]{HAC} but has been extended to allow the inputs to have different magnitudes.  
-Coincidentally the description of algorithm A in Knuth \cite[pp. 266]{TAOCPV2} shares the same deficiency as the algorithm from \cite{HAC}.  Even the 
-MIX pseudo  machine code presented by Knuth \cite[pp. 266-267]{TAOCPV2} is incapable of handling inputs which are of different magnitudes.
-
-Steps 1 and 2 will sort the two inputs based on their \textbf{used} digit count.  This allows the inputs to have varying magnitudes which not 
-only makes it more efficient than the trivial algorithm presented in the references but more flexible.  The variable $min$ is given the lowest 
-digit count while $max$ is given the highest digit count.  If both inputs have the same \textbf{used} digit count both $min$ and $max$ are 
-set to the same value.  The variable $x$ is an \textit{alias} for the largest input and not meant to be a copy of it.  After the inputs are sorted, 
-steps 3 and 4 will ensure that the destination $c$ can accommodate the result.  The old \textbf{used} count from $c$ is copied to 
-$oldused$ so that excess digits can be cleared later, and the new \textbf{used} count is set to $max+1$, so that a carry from the most significant 
-word can be handled.
-
-At step 7 the carry variable $u$ is set to zero and the first part of the addition loop can begin.  The first step of the loop (\textit{8.1}) adds
-digits from the two inputs together along with the carry variable $u$.  The following step extracts the carry bit by shifting the result of the
-preceding step right by $lg(\beta)$ positions.  The shift to extract the carry is similar to how carry extraction works with decimal addition.
-
-Consider adding $77$ to $65$, the first addition of the first column is $7 + 5$ which produces the result $12$.  The trailing digit of the result
-is $2 \equiv 12 \mbox{ (mod }10\mbox{)}$ and the carry is found by dividing (\textit{and ignoring the remainder}) $12$ by the radix or in this case $10$.  The
-division and multiplication of $10$ is simply a logical right or left shift, respectively, of the digits.  In other words the carry can be extracted
-by shifting one digit to the right.
-
-Note that $lg()$ is simply the base two logarithm such that $lg(2^k) = k$.  This implies that $lg(\beta)$ is the number of bits in a radix-$\beta$ 
-digit.  Therefore, a logical shift right of the summand by $lg(\beta)$ will extract the carry.  The final step of the loop reduces the digit 
-modulo the radix $\beta$ to ensure it is in range.
-
-After step 8 the smallest input (\textit{or both if they are the same magnitude}) has been exhausted.  Step 9 decides whether
-the inputs were of equal magnitude.  If not, then another loop, similar to that in step 8, must be executed.  The loop at step
-number 9.1 differs from the previous loop since it only adds the mp\_int $x$ along with the carry.  
-
-Step 10 finishes the addition phase by copying the final carry to the highest location in the result $c_{max}$.  Step 11 ensures that 
-leading digits that were originally present in $c$ are cleared.  Finally excess leading digits are clamped and the algorithm returns success.
-
-\index{bn\_s\_mp\_add.c}
-\vspace{+3mm}\begin{small}
-\hspace{-5.1mm}{\bf File}: bn\_s\_mp\_add.c
-\vspace{-3mm}
-\begin{alltt}
-016   
-017   /* low level addition, based on HAC pp.594, Algorithm 14.7 */
-018   int
-019   s_mp_add (mp_int * a, mp_int * b, mp_int * c)
-020   \{
-021     mp_int *x;
-022     int     olduse, res, min, max;
-023   
-024     /* find sizes, we let |a| <= |b| which means we have to sort
-025      * them.  "x" will point to the input with the most digits
-026      */
-027     if (a->used > b->used) \{
-028       min = b->used;
-029       max = a->used;
-030       x = a;
-031     \} else \{
-032       min = a->used;
-033       max = b->used;
-034       x = b;
-035     \}
-036   
-037     /* init result */
-038     if (c->alloc < max + 1) \{
-039       if ((res = mp_grow (c, max + 1)) != MP_OKAY) \{
-040         return res;
-041       \}
-042     \}
-043   
-044     /* get old used digit count and set new one */
-045     olduse = c->used;
-046     c->used = max + 1;
-047   
-048     \{
-049       register mp_digit u, *tmpa, *tmpb, *tmpc;
-050       register int i;
-051   
-052       /* alias for digit pointers */
-053   
-054       /* first input */
-055       tmpa = a->dp;
-056   
-057       /* second input */
-058       tmpb = b->dp;
-059   
-060       /* destination */
-061       tmpc = c->dp;
-062   
-063       /* zero the carry */
-064       u = 0;
-065       for (i = 0; i < min; i++) \{
-066         /* Compute the sum at one digit, T[i] = A[i] + B[i] + U */
-067         *tmpc = *tmpa++ + *tmpb++ + u;
-068   
-069         /* U = carry bit of T[i] */
-070         u = *tmpc >> ((mp_digit)DIGIT_BIT);
-071   
-072         /* take away carry bit from T[i] */
-073         *tmpc++ &= MP_MASK;
-074       \}
-075   
-076       /* now copy higher words if any, that is in A+B 
-077        * if A or B has more digits add those in 
-078        */
-079       if (min != max) \{
-080         for (; i < max; i++) \{
-081           /* T[i] = X[i] + U */
-082           *tmpc = x->dp[i] + u;
-083   
-084           /* U = carry bit of T[i] */
-085           u = *tmpc >> ((mp_digit)DIGIT_BIT);
-086   
-087           /* take away carry bit from T[i] */
-088           *tmpc++ &= MP_MASK;
-089         \}
-090       \}
-091   
-092       /* add carry */
-093       *tmpc++ = u;
-094   
-095       /* clear digits above oldused */
-096       for (i = c->used; i < olduse; i++) \{
-097         *tmpc++ = 0;
-098       \}
-099     \}
-100   
-101     mp_clamp (c);
-102     return MP_OKAY;
-103   \}
-\end{alltt}
-\end{small}
-
-Lines 27 to 35 perform the initial sorting of the inputs and determine the $min$ and $max$ variables.  Note that $x$ is a pointer to a 
-mp\_int assigned to the largest input, in effect it is a local alias.  Lines 37 to 42 ensure that the destination is grown to 
-accommodate the result of the addition. 
-
-Similar to the implementation of mp\_copy this function uses the braced code and local aliases coding style.  The three aliases that are on 
-lines 55, 58 and 61 represent the two inputs and destination variables respectively.  These aliases are used to ensure the
-compiler does not have to dereference $a$, $b$ or $c$ (respectively) to access the digits of the respective mp\_int.
-
-The initial carry $u$ is cleared on line 64, note that $u$ is of type mp\_digit which ensures type compatibility within the 
-implementation.  The initial addition loop begins on line 65 and ends on line 74.  Similarly the conditional addition loop
-begins on line 80 and ends on line 90.  The addition is finished with the final carry being stored in $tmpc$ on line 93.  
-Note the ``++'' operator on the same line.  After line 93 $tmpc$ will point to the $c.used$'th digit of the mp\_int $c$.  This is useful
-for the next loop on lines 96 to 99 which set any old upper digits to zero.
-
-\subsection{Low Level Subtraction}
-The low level unsigned subtraction algorithm is very similar to the low level unsigned addition algorithm.  The principal difference is that the
-unsigned subtraction algorithm requires the result to be positive.  That is when computing $a - b$ the condition $\vert a \vert \ge \vert b\vert$ must 
-be met for this algorithm to function properly.  Keep in mind this low level algorithm is not meant to be used in higher level algorithms directly.  
-This algorithm as will be shown can be used to create functional signed addition and subtraction algorithms.
-
-
-For this algorithm a new variable is required to make the description simpler.  Recall from section 1.3.1 that a mp\_digit must be able to represent
-the range $0 \le x < 2\beta$ for the algorithms to work correctly.  However, it is allowable that a mp\_digit represent a larger range of values.  For 
-this algorithm we will assume that the variable $\gamma$ represents the number of bits available in a 
-mp\_digit (\textit{this implies $2^{\gamma} > \beta$}).  
-
-For example, the default for LibTomMath is to use an ``unsigned long'' for the mp\_digit ``type'' while $\beta = 2^{28}$.  In ISO C an ``unsigned long''
-data type must be able to represent $0 \le x < 2^{32}$ meaning that in this case $\gamma = 32$.
-
-\newpage\begin{figure}[!here]
-\begin{center}
-\begin{small}
-\begin{tabular}{l}
-\hline Algorithm \textbf{s\_mp\_sub}. \\
-\textbf{Input}.   Two mp\_ints $a$ and $b$ ($\vert a \vert \ge \vert b \vert$) \\
-\textbf{Output}.  The unsigned subtraction $c = \vert a \vert - \vert b \vert$. \\
-\hline \\
-1.  $min \leftarrow b.used$ \\
-2.  $max \leftarrow a.used$ \\
-3.  If $c.alloc < max$ then grow $c$ to hold at least $max$ digits.  (\textit{mp\_grow}) \\
-4.  If the reallocation failed return(\textit{MP\_MEM}). \\
-5.  $oldused \leftarrow c.used$ \\ 
-6.  $c.used \leftarrow max$ \\
-7.  $u \leftarrow 0$ \\
-8.  for $n$ from $0$ to $min - 1$ do \\
-\hspace{3mm}8.1  $c_n \leftarrow a_n - b_n - u$ \\
-\hspace{3mm}8.2  $u   \leftarrow c_n >> (\gamma - 1)$ \\
-\hspace{3mm}8.3  $c_n \leftarrow c_n \mbox{ (mod }\beta\mbox{)}$ \\
-9.  if $min < max$ then do \\
-\hspace{3mm}9.1  for $n$ from $min$ to $max - 1$ do \\
-\hspace{6mm}9.1.1  $c_n \leftarrow a_n - u$ \\
-\hspace{6mm}9.1.2  $u   \leftarrow c_n >> (\gamma - 1)$ \\
-\hspace{6mm}9.1.3  $c_n \leftarrow c_n \mbox{ (mod }\beta\mbox{)}$ \\
-10. if $oldused > max$ then do \\
-\hspace{3mm}10.1  for $n$ from $max$ to $oldused - 1$ do \\
-\hspace{6mm}10.1.1  $c_n \leftarrow 0$ \\
-11. Clamp excess digits of $c$.  (\textit{mp\_clamp}). \\
-12. Return(\textit{MP\_OKAY}). \\
-\hline
-\end{tabular}
-\end{small}
-\end{center}
-\caption{Algorithm s\_mp\_sub}
-\end{figure}
-
-\textbf{Algorithm s\_mp\_sub.}
-This algorithm performs the unsigned subtraction of two mp\_int variables under the restriction that the result must be positive.  That is when
-passing variables $a$ and $b$ the condition that $\vert a \vert \ge \vert b \vert$ must be met for the algorithm to function correctly.  This
-algorithm is loosely based on algorithm 14.9 \cite[pp. 595]{HAC} and is similar to algorithm S in \cite[pp. 267]{TAOCPV2} as well.  As was the case
-of the algorithm s\_mp\_add both other references lack discussion concerning various practical details such as when the inputs differ in magnitude.
-
-The initial sorting of the inputs is trivial in this algorithm since $a$ is guaranteed to have at least the same magnitude as $b$.  Steps 1 and 2 
-set the $min$ and $max$ variables.  Unlike the addition routine there is guaranteed to be no carry which means that the final result can be at 
-most $max$ digits in length as opposed to $max + 1$.  Similar to the addition algorithm the \textbf{used} count of $c$ is copied locally and 
-set to the maximal count for the operation.
-
-The subtraction loop that begins on step 8 is essentially the same as the addition loop of algorithm s\_mp\_add except single precision 
-subtraction is used instead.  Note the use of the $\gamma$ variable to extract the carry (\textit{also known as the borrow}) within the subtraction 
-loops.  Under the assumption that two's complement single precision arithmetic is used this will successfully extract the desired carry.  
-
-For example, consider subtracting $0101_2$ from $0100_2$ where $\gamma = 4$ and $\beta = 2$.  The least significant bit will force a carry upwards to 
-the third bit which will be set to zero after the borrow.  After the very first bit has been subtracted $4 - 1 \equiv 0011_2$ will remain.  When the 
-third bit of $0101_2$ is subtracted from the result it will cause another carry.  In this case though the carry will be forced to propagate all the 
-way to the most significant bit.  
-
-Recall that $\beta < 2^{\gamma}$.  This means that if a carry does occur just before the $lg(\beta)$'th bit it will propagate all the way to the most 
-significant bit.  Thus, the high order bits of the mp\_digit that are not part of the actual digit will either be all zero, or all one. All that
-is needed is a single zero or one bit for the carry.  Therefore a single logical shift right by $\gamma - 1$ positions is sufficient to extract the 
-carry.  This method of carry extraction may seem awkward but the reason for it becomes apparent when the implementation is discussed.  
-
-If $b$ has a smaller magnitude than $a$ then step 9 will force the carry and copy operation to propagate through the larger input $a$ into $c$.  Step
-10 will ensure that any leading digits of $c$ above the $max$'th position are zeroed.
-
-\index{bn\_s\_mp\_sub.c}
-\vspace{+3mm}\begin{small}
-\hspace{-5.1mm}{\bf File}: bn\_s\_mp\_sub.c
-\vspace{-3mm}
-\begin{alltt}
-016   
-017   /* low level subtraction (assumes |a| > |b|), HAC pp.595 Algorithm 14.9 */
-018   int
-019   s_mp_sub (mp_int * a, mp_int * b, mp_int * c)
-020   \{
-021     int     olduse, res, min, max;
-022   
-023     /* find sizes */
-024     min = b->used;
-025     max = a->used;
-026   
-027     /* init result */
-028     if (c->alloc < max) \{
-029       if ((res = mp_grow (c, max)) != MP_OKAY) \{
-030         return res;
-031       \}
-032     \}
-033     olduse = c->used;
-034     c->used = max;
-035   
-036     \{
-037       register mp_digit u, *tmpa, *tmpb, *tmpc;
-038       register int i;
-039   
-040       /* alias for digit pointers */
-041       tmpa = a->dp;
-042       tmpb = b->dp;
-043       tmpc = c->dp;
-044   
-045       /* set carry to zero */
-046       u = 0;
-047       for (i = 0; i < min; i++) \{
-048         /* T[i] = A[i] - B[i] - U */
-049         *tmpc = *tmpa++ - *tmpb++ - u;
-050   
-051         /* U = carry bit of T[i]
-052          * Note this saves performing an AND operation since
-053          * if a carry does occur it will propagate all the way to the
-054          * MSB.  As a result a single shift is enough to get the carry
-055          */
-056         u = *tmpc >> ((mp_digit)(CHAR_BIT * sizeof (mp_digit) - 1));
-057   
-058         /* Clear carry from T[i] */
-059         *tmpc++ &= MP_MASK;
-060       \}
-061   
-062       /* now copy higher words if any, e.g. if A has more digits than B  */
-063       for (; i < max; i++) \{
-064         /* T[i] = A[i] - U */
-065         *tmpc = *tmpa++ - u;
-066   
-067         /* U = carry bit of T[i] */
-068         u = *tmpc >> ((mp_digit)(CHAR_BIT * sizeof (mp_digit) - 1));
-069   
-070         /* Clear carry from T[i] */
-071         *tmpc++ &= MP_MASK;
-072       \}
-073   
-074       /* clear digits above used (since we may not have grown result above) */
-      
-075       for (i = c->used; i < olduse; i++) \{
-076         *tmpc++ = 0;
-077       \}
-078     \}
-079   
-080     mp_clamp (c);
-081     return MP_OKAY;
-082   \}
-083   
-\end{alltt}
-\end{small}
-
-Lines 24 and 25 perform the initial hardcoded sorting of the inputs.  In reality the $min$ and $max$ variables are only aliases and are only 
-used to make the source code easier to read.  Again the pointer alias optimization is used within this algorithm.  Lines 41, 42 and 43 initialize the aliases for 
-$a$, $b$ and $c$ respectively.
-
-The first subtraction loop occurs on lines 46 through 60.  The theory behind the subtraction loop is exactly the same as that for
-the addition loop.  As remarked earlier there is an implementation reason for using the ``awkward'' method of extracting the carry 
-(\textit{see line 56}).  The traditional method for extracting the carry would be to shift by $lg(\beta)$ positions and logically AND 
-the least significant bit.  The AND operation is required because all of the bits above the $\lg(\beta)$'th bit will be set to one after a carry
-occurs from subtraction.  This carry extraction requires two relatively cheap operations to extract the carry.  The other method is to simply 
-shift the most significant bit to the least significant bit thus extracting the carry with a single cheap operation.  This optimization only works on
-two's complement machines which is a safe assumption to make.
-
-If $a$ has a larger magnitude than $b$ an additional loop (\textit{see lines 63 through 72}) is required to propagate the carry through
-$a$ and copy the result to $c$.  
-
-\subsection{High Level Addition}
-Now that both lower level addition and subtraction algorithms have been established an effective high level signed addition algorithm can be
-established.  This high level addition algorithm will be what other algorithms and developers will use to perform addition of mp\_int data 
-types.  
-
-Recall from section 5.2 that an mp\_int represents an integer with an unsigned mantissa (\textit{the array of digits}) and a \textbf{sign} 
-flag.  A high level addition is actually performed as a series of eight separate cases which can be optimized down to three unique cases.
-
-\begin{figure}[!here]
-\begin{center}
-\begin{tabular}{l}
-\hline Algorithm \textbf{mp\_add}. \\
-\textbf{Input}.   Two mp\_ints $a$ and $b$  \\
-\textbf{Output}.  The signed addition $c = a + b$. \\
-\hline \\
-1.  if $a.sign = b.sign$ then do \\
-\hspace{3mm}1.1  $c.sign \leftarrow a.sign$  \\
-\hspace{3mm}1.2  $c \leftarrow \vert a \vert + \vert b \vert$ (\textit{s\_mp\_add})\\
-2.  else do \\
-\hspace{3mm}2.1  if $\vert a \vert < \vert b \vert$ then do (\textit{mp\_cmp\_mag})  \\
-\hspace{6mm}2.1.1  $c.sign \leftarrow b.sign$ \\
-\hspace{6mm}2.1.2  $c \leftarrow \vert b \vert - \vert a \vert$ (\textit{s\_mp\_sub}) \\
-\hspace{3mm}2.2  else do \\
-\hspace{6mm}2.2.1  $c.sign \leftarrow a.sign$ \\
-\hspace{6mm}2.2.2  $c \leftarrow \vert a \vert - \vert b \vert$ \\
-3.  If any of the lower level operations failed return(\textit{MP\_MEM}) \\
-4.  Return(\textit{MP\_OKAY}). \\
-\hline
-\end{tabular}
-\end{center}
-\caption{Algorithm mp\_add}
-\end{figure}
-
-\textbf{Algorithm mp\_add.}
-This algorithm performs the signed addition of two mp\_int variables.  There is no reference algorithm to draw upon from either \cite{TAOCPV2} or 
-\cite{HAC} since they both only provide unsigned operations.  The algorithm is fairly straightforward but restricted since subtraction can only 
-produce positive results.
-
-\begin{figure}[here]
-\begin{small}
-\begin{center}
-\begin{tabular}{|c|c|c|c|c|}
-\hline \textbf{Sign of $a$} & \textbf{Sign of $b$} & \textbf{$\vert a \vert > \vert b \vert $} & \textbf{Unsigned Operation} & \textbf{Result Sign Flag} \\
-\hline $+$ & $+$ & Yes & $c = a + b$ & $a.sign$ \\
-\hline $+$ & $+$ & No  & $c = a + b$ & $a.sign$ \\
-\hline $-$ & $-$ & Yes & $c = a + b$ & $a.sign$ \\
-\hline $-$ & $-$ & No  & $c = a + b$ & $a.sign$ \\
-\hline &&&&\\
-
-\hline $+$ & $-$ & No  & $c = b - a$ & $b.sign$ \\
-\hline $-$ & $+$ & No  & $c = b - a$ & $b.sign$ \\
-
-\hline &&&&\\
-
-\hline $+$ & $-$ & Yes & $c = a - b$ & $a.sign$ \\
-\hline $-$ & $+$ & Yes & $c = a - b$ & $a.sign$ \\
-
-\hline
-\end{tabular}
-\end{center}
-\end{small}
-\caption{Addition Guide Chart}
-\label{fig:AddChart}
-\end{figure}
-
-Figure~\ref{fig:AddChart} lists all of the eight possible input combinations and is sorted to show that only three specific cases need to be handled.  The 
-return code of the unsigned operations at step 1.2, 2.1.2 and 2.2.2 are forwarded to step 3 to check for errors.  This simplifies the description
-of the algorithm considerably and best follows how the implementation actually was achieved.
-
-Also note how the \textbf{sign} is set before the unsigned addition or subtraction is performed.  Recall from the descriptions of algorithms
-s\_mp\_add and s\_mp\_sub that the mp\_clamp function is used at the end to trim excess digits.  The mp\_clamp algorithm will set the \textbf{sign}
-to \textbf{MP\_ZPOS} when the \textbf{used} digit count reaches zero.  
-
-For example, consider performing $-a + a$ with algorithm mp\_add.  By the description of the algorithm the sign is set to \textbf{MP\_NEG} which would
-produce a result of $-0$.  However, since the sign is set first and the unsigned subtraction is performed afterwards, the subsequent usage of algorithm mp\_clamp 
-within algorithm s\_mp\_sub will force $-0$ to become $0$.  
-
-\index{bn\_mp\_add.c}
-\vspace{+3mm}\begin{small}
-\hspace{-5.1mm}{\bf File}: bn\_mp\_add.c
-\vspace{-3mm}
-\begin{alltt}
-016   
-017   /* high level addition (handles signs) */
-018   int
-019   mp_add (mp_int * a, mp_int * b, mp_int * c)
-020   \{
-021     int     sa, sb, res;
-022   
-023     /* get sign of both inputs */
-024     sa = a->sign;
-025     sb = b->sign;
-026   
-027     /* handle two cases, not four */
-028     if (sa == sb) \{
-029       /* both positive or both negative */
-030       /* add their magnitudes, copy the sign */
-031       c->sign = sa;
-032       res = s_mp_add (a, b, c);
-033     \} else \{
-034       /* one positive, the other negative */
-035       /* subtract the one with the greater magnitude from */
-036       /* the one of the lesser magnitude.  The result gets */
-037       /* the sign of the one with the greater magnitude. */
-038       if (mp_cmp_mag (a, b) == MP_LT) \{
-039         c->sign = sb;
-040         res = s_mp_sub (b, a, c);
-041       \} else \{
-042         c->sign = sa;
-043         res = s_mp_sub (a, b, c);
-044       \}
-045     \}
-046     return res;
-047   \}
-048   
-\end{alltt}
-\end{small}
-
-The source code follows the algorithm fairly closely.  The most notable new source code addition is the usage of the $res$ integer variable which
-is used to pass result of the unsigned operations forward.  Unlike in the algorithm, the variable $res$ is merely returned as is without
-explicitly checking it and returning the constant \textbf{MP\_OKAY}.  The observation is this algorithm will succeed or fail only if the lower
-level functions do so.  Returning their return code is sufficient.
-
-\subsection{High Level Subtraction}
-The high level signed subtraction algorithm is essentially the same as the high level signed addition algorithm.  
-
-\newpage\begin{figure}[!here]
-\begin{center}
-\begin{tabular}{l}
-\hline Algorithm \textbf{mp\_sub}. \\
-\textbf{Input}.   Two mp\_ints $a$ and $b$  \\
-\textbf{Output}.  The signed subtraction $c = a - b$. \\
-\hline \\
-1.  if $a.sign \ne b.sign$ then do \\
-\hspace{3mm}1.1  $c.sign \leftarrow a.sign$ \\
-\hspace{3mm}1.2  $c \leftarrow \vert a \vert + \vert b \vert$ (\textit{s\_mp\_add}) \\
-2.  else do \\
-\hspace{3mm}2.1  if $\vert a \vert \ge \vert b \vert$ then do (\textit{mp\_cmp\_mag}) \\
-\hspace{6mm}2.1.1  $c.sign \leftarrow a.sign$ \\
-\hspace{6mm}2.1.2  $c \leftarrow \vert a \vert  - \vert b \vert$ (\textit{s\_mp\_sub}) \\
-\hspace{3mm}2.2  else do \\
-\hspace{6mm}2.2.1  $c.sign \leftarrow  \left \lbrace \begin{array}{ll}
-                              MP\_ZPOS &  \mbox{if }a.sign = MP\_NEG \\
-                              MP\_NEG  &  \mbox{otherwise} \\
-                              \end{array} \right .$ \\
-\hspace{6mm}2.2.2  $c \leftarrow \vert b \vert  - \vert a \vert$ \\
-3.  If any of the lower level operations failed return(\textit{MP\_MEM}). \\
-4.  Return(\textit{MP\_OKAY}). \\
-\hline
-\end{tabular}
-\end{center}
-\caption{Algorithm mp\_sub}
-\end{figure}
-
-\textbf{Algorithm mp\_sub.}
-This algorithm performs the signed subtraction of two inputs.  Similar to algorithm mp\_add there is no reference in either \cite{TAOCPV2} or 
-\cite{HAC}.  Also this algorithm is restricted by algorithm s\_mp\_sub.  The following chart lists the eight possible inputs and
-the operations required.
-
-\begin{figure}[!here]
-\begin{small}
-\begin{center}
-\begin{tabular}{|c|c|c|c|c|}
-\hline \textbf{Sign of $a$} & \textbf{Sign of $b$} & \textbf{$\vert a \vert \ge \vert b \vert $} & \textbf{Unsigned Operation} & \textbf{Result Sign Flag} \\
-\hline $+$ & $-$ & Yes & $c = a + b$ & $a.sign$ \\
-\hline $+$ & $-$ & No  & $c = a + b$ & $a.sign$ \\
-\hline $-$ & $+$ & Yes & $c = a + b$ & $a.sign$ \\
-\hline $-$ & $+$ & No  & $c = a + b$ & $a.sign$ \\
-\hline &&&& \\
-\hline $+$ & $+$ & Yes & $c = a - b$ & $a.sign$ \\
-\hline $-$ & $-$ & Yes & $c = a - b$ & $a.sign$ \\
-\hline &&&& \\
-\hline $+$ & $+$ & No  & $c = b - a$ & $\mbox{opposite of }a.sign$ \\
-\hline $-$ & $-$ & No  & $c = b - a$ & $\mbox{opposite of }a.sign$ \\
-\hline
-\end{tabular}
-\end{center}
-\end{small}
-\caption{Subtraction Guide Chart}
-\end{figure}
-
-Similar to the case of algorithm mp\_add the \textbf{sign} is set first before the unsigned addition or subtraction.  That is to prevent the 
-algorithm from producing $-a - -a = -0$ as a result.  
-
-\index{bn\_mp\_sub.c}
-\vspace{+3mm}\begin{small}
-\hspace{-5.1mm}{\bf File}: bn\_mp\_sub.c
-\vspace{-3mm}
-\begin{alltt}
-016   
-017   /* high level subtraction (handles signs) */
-018   int
-019   mp_sub (mp_int * a, mp_int * b, mp_int * c)
-020   \{
-021     int     sa, sb, res;
-022   
-023     sa = a->sign;
-024     sb = b->sign;
-025   
-026     if (sa != sb) \{
-027       /* subtract a negative from a positive, OR */
-028       /* subtract a positive from a negative. */
-029       /* In either case, ADD their magnitudes, */
-030       /* and use the sign of the first number. */
-031       c->sign = sa;
-032       res = s_mp_add (a, b, c);
-033     \} else \{
-034       /* subtract a positive from a positive, OR */
-035       /* subtract a negative from a negative. */
-036       /* First, take the difference between their */
-037       /* magnitudes, then... */
-038       if (mp_cmp_mag (a, b) != MP_LT) \{
-039         /* Copy the sign from the first */
-040         c->sign = sa;
-041         /* The first has a larger or equal magnitude */
-042         res = s_mp_sub (a, b, c);
-043       \} else \{
-044         /* The result has the *opposite* sign from */
-045         /* the first number. */
-046         c->sign = (sa == MP_ZPOS) ? MP_NEG : MP_ZPOS;
-047         /* The second has a larger magnitude */
-048         res = s_mp_sub (b, a, c);
-049       \}
-050     \}
-051     return res;
-052   \}
-053   
-\end{alltt}
-\end{small}
-
-Much like the implementation of algorithm mp\_add the variable $res$ is used to catch the return code of the unsigned addition or subtraction operations
-and forward it to the end of the function.  On line 38 the ``not equal to'' \textbf{MP\_LT} expression is used to emulate a 
-``greater than or equal to'' comparison.  
-
-\section{Bit and Digit Shifting}
-It is quite common to think of a multiple precision integer as a polynomial in $x$, that is $y = f(\beta)$ where $f(x) = \sum_{i=0}^{n-1} a_i x^i$.  
-This notation arises within discussion of Montgomery and Diminished Radix Reduction as well as Karatsuba multiplication and squaring.  
-
-In order to facilitate operations on polynomials in $x$ as above, a series of simple ``digit'' algorithms has to be established.  That is, to shift
-the digits left or right as well as to shift individual bits of the digits left and right.  It is important to note that not all ``shift'' operations
-are on radix-$\beta$ digits.  
-
-\subsection{Multiplication by Two}
-
-In a binary system where the radix is a power of two, multiplication by two not only arises often in other algorithms, it is also a fairly efficient 
-operation to perform.  A single precision logical shift left is sufficient to multiply a single digit by two.  
-
-\newpage\begin{figure}[!here]
-\begin{small}
-\begin{center}
-\begin{tabular}{l}
-\hline Algorithm \textbf{mp\_mul\_2}. \\
-\textbf{Input}.   One mp\_int $a$ \\
-\textbf{Output}.  $b = 2a$. \\
-\hline \\
-1.  If $b.alloc < a.used + 1$ then grow $b$ to hold $a.used + 1$ digits.  (\textit{mp\_grow}) \\
-2.  If the reallocation failed return(\textit{MP\_MEM}). \\
-3.  $oldused \leftarrow b.used$ \\
-4.  $b.used \leftarrow a.used$ \\
-5.  $r \leftarrow 0$ \\
-6.  for $n$ from 0 to $a.used - 1$ do \\
-\hspace{3mm}6.1  $rr \leftarrow a_n >> (lg(\beta) - 1)$ \\
-\hspace{3mm}6.2  $b_n \leftarrow (a_n << 1) + r \mbox{ (mod }\beta\mbox{)}$ \\
-\hspace{3mm}6.3  $r \leftarrow rr$ \\
-7.  If $r \ne 0$ then do \\
-\hspace{3mm}7.1  $b_{n + 1} \leftarrow r$ \\
-\hspace{3mm}7.2  $b.used \leftarrow b.used + 1$ \\
-8.  If $b.used < oldused$ then do \\
-\hspace{3mm}8.1  for $n$ from $b.used$ to $oldused - 1$ do \\
-\hspace{6mm}8.1.1  $b_n \leftarrow 0$ \\
-9.  $b.sign \leftarrow a.sign$ \\
-10.  Return(\textit{MP\_OKAY}).\\
-\hline
-\end{tabular}
-\end{center}
-\end{small}
-\caption{Algorithm mp\_mul\_2}
-\end{figure}
-
-\textbf{Algorithm mp\_mul\_2.}
-This algorithm will quickly multiply a mp\_int by two provided $\beta$ is a power of two.  Neither \cite{TAOCPV2} nor \cite{HAC} describe such 
-an algorithm despite the fact it arises often in other algorithms.  The algorithm is set up much like the lower level algorithm s\_mp\_add since 
-it is for all intents and purposes equivalent to the operation $b = \vert a \vert + \vert a \vert$.  
-
-Steps 1 and 2 grow the destination $b$ as required to accommodate the maximum number of \textbf{used} digits in the result.  The initial \textbf{used} count
-is set to $a.used$ at step 4.  Only if there is a final carry will the \textbf{used} count require adjustment.
-
-Step 6 is an optimized implementation of the addition loop for this specific case.  That is, since the two values being added together 
-are the same there is no need to perform two reads from the digits of $a$.  Step 6.1 performs a single precision shift on the current digit $a_n$ to
-obtain what will be the carry for the next iteration.  Step 6.2 calculates the $n$'th digit of the result as a single precision shift of $a_n$ plus
-the previous carry.  Recall from section 5.1 that $a_n << 1$ is equivalent to $a_n \cdot 2$.  An iteration of the addition loop is finished with 
-forwarding the carry to the next iteration.
-
-Step 7 takes care of any final carry by setting the $a.used$'th digit of the result to the carry and augmenting the \textbf{used} count of $b$.  
-Step 8 clears any leading digits of $b$ in case it originally had a larger magnitude than $a$.
-
-\index{bn\_mp\_mul\_2.c}
-\vspace{+3mm}\begin{small}
-\hspace{-5.1mm}{\bf File}: bn\_mp\_mul\_2.c
-\vspace{-3mm}
-\begin{alltt}
-016   
-017   /* b = a*2 */
-018   int
-019   mp_mul_2 (mp_int * a, mp_int * b)
-020   \{
-021     int     x, res, oldused;
-022   
-023     /* grow to accomodate result */
-024     if (b->alloc < a->used + 1) \{
-025       if ((res = mp_grow (b, a->used + 1)) != MP_OKAY) \{
-026         return res;
-027       \}
-028     \}
-029   
-030     oldused = b->used;
-031     b->used = a->used;
-032   
-033     \{
-034       register mp_digit r, rr, *tmpa, *tmpb;
-035   
-036       /* alias for source */
-037       tmpa = a->dp;
-038       
-039       /* alias for dest */
-040       tmpb = b->dp;
-041   
-042       /* carry */
-043       r = 0;
-044       for (x = 0; x < a->used; x++) \{
-045       
-046         /* get what will be the *next* carry bit from the 
-047          * MSB of the current digit 
-048          */
-049         rr = *tmpa >> ((mp_digit)(DIGIT_BIT - 1));
-050         
-051         /* now shift up this digit, add in the carry [from the previous] */
-052         *tmpb++ = ((*tmpa++ << ((mp_digit)1)) | r) & MP_MASK;
-053         
-054         /* copy the carry that would be from the source 
-055          * digit into the next iteration 
-056          */
-057         r = rr;
-058       \}
-059   
-060       /* new leading digit? */
-061       if (r != 0) \{
-062         /* add a MSB which is always 1 at this point */
-063         *tmpb = 1;
-064         ++b->used;
-065       \}
-066   
-067       /* now zero any excess digits on the destination 
-068        * that we didn't write to 
-069        */
-070       tmpb = b->dp + b->used;
-071       for (x = b->used; x < oldused; x++) \{
-072         *tmpb++ = 0;
-073       \}
-074     \}
-075     b->sign = a->sign;
-076     return MP_OKAY;
-077   \}
-\end{alltt}
-\end{small}
-
-This implementation is essentially an optimized implementation of s\_mp\_add for the case of doubling an input.  The only noteworthy difference
-is the use of the logical shift operator on line 52 to perform a single precision doubling.  
-
-\subsection{Division by Two}
-A division by two can just as easily be accomplished with a logical shift right as multiplication by two can be with a logical shift left.
-
-\newpage\begin{figure}[!here]
-\begin{small}
-\begin{center}
-\begin{tabular}{l}
-\hline Algorithm \textbf{mp\_div\_2}. \\
-\textbf{Input}.   One mp\_int $a$ \\
-\textbf{Output}.  $b = a/2$. \\
-\hline \\
-1.  If $b.alloc < a.used$ then grow $b$ to hold $a.used$ digits.  (\textit{mp\_grow}) \\
-2.  If the reallocation failed return(\textit{MP\_MEM}). \\
-3.  $oldused \leftarrow b.used$ \\
-4.  $b.used \leftarrow a.used$ \\
-5.  $r \leftarrow 0$ \\
-6.  for $n$ from $b.used - 1$ to $0$ do \\
-\hspace{3mm}6.1  $rr \leftarrow a_n \mbox{ (mod }2\mbox{)}$\\
-\hspace{3mm}6.2  $b_n \leftarrow (a_n >> 1) + (r << (lg(\beta) - 1)) \mbox{ (mod }\beta\mbox{)}$ \\
-\hspace{3mm}6.3  $r \leftarrow rr$ \\
-7.  If $b.used < oldused$ then do \\
-\hspace{3mm}7.1  for $n$ from $b.used$ to $oldused - 1$ do \\
-\hspace{6mm}7.1.1  $b_n \leftarrow 0$ \\
-8.  $b.sign \leftarrow a.sign$ \\
-9.  Clamp excess digits of $b$.  (\textit{mp\_clamp}) \\
-10.  Return(\textit{MP\_OKAY}).\\
-\hline
-\end{tabular}
-\end{center}
-\end{small}
-\caption{Algorithm mp\_div\_2}
-\end{figure}
-
-\textbf{Algorithm mp\_div\_2.}
-This algorithm will divide an mp\_int by two using logical shifts to the right.  Like mp\_mul\_2 it uses a modified low level addition
-core as the basis of the algorithm.  Unlike mp\_mul\_2 the shift operations work from the leading digit to the trailing digit.  The algorithm
-could be written to work from the trailing digit to the leading digit however, it would have to stop one short of $a.used - 1$ digits to prevent
-reading past the end of the array of digits.
-
-Essentially the loop at step 6 is similar to that of mp\_mul\_2 except the logical shifts go in the opposite direction and the carry is at the 
-least significant bit not the most significant bit.  
-
-\index{bn\_mp\_div\_2.c}
-\vspace{+3mm}\begin{small}
-\hspace{-5.1mm}{\bf File}: bn\_mp\_div\_2.c
-\vspace{-3mm}
-\begin{alltt}
-016   
-017   /* b = a/2 */
-018   int
-019   mp_div_2 (mp_int * a, mp_int * b)
-020   \{
-021     int     x, res, oldused;
-022   
-023     /* copy */
-024     if (b->alloc < a->used) \{
-025       if ((res = mp_grow (b, a->used)) != MP_OKAY) \{
-026         return res;
-027       \}
-028     \}
-029   
-030     oldused = b->used;
-031     b->used = a->used;
-032     \{
-033       register mp_digit r, rr, *tmpa, *tmpb;
-034   
-035       /* source alias */
-036       tmpa = a->dp + b->used - 1;
-037   
-038       /* dest alias */
-039       tmpb = b->dp + b->used - 1;
-040   
-041       /* carry */
-042       r = 0;
-043       for (x = b->used - 1; x >= 0; x--) \{
-044         /* get the carry for the next iteration */
-045         rr = *tmpa & 1;
-046   
-047         /* shift the current digit, add in carry and store */
-048         *tmpb-- = (*tmpa-- >> 1) | (r << (DIGIT_BIT - 1));
-049   
-050         /* forward carry to next iteration */
-051         r = rr;
-052       \}
-053   
-054       /* zero excess digits */
-055       tmpb = b->dp + b->used;
-056       for (x = b->used; x < oldused; x++) \{
-057         *tmpb++ = 0;
-058       \}
-059     \}
-060     b->sign = a->sign;
-061     mp_clamp (b);
-062     return MP_OKAY;
-063   \}
-\end{alltt}
-\end{small}
-
-\section{Polynomial Basis Operations}
-Recall from section 5.3 that any integer can be represented as a polynomial in $x$ as $y = f(\beta)$.  Such a representation is also known as
-the polynomial basis \cite[p. 48]{ROSE}. Given such a notation a multiplication or division by $x$ amounts to shifting whole digits a single 
-place.  The need for such operations arises in several other higher level algorithms such as Barrett and Montgomery reduction, integer
-division and Karatsuba multiplication.  
-
-Converting from an array of digits to polynomial basis is very simple.  Consider the integer $y \equiv (a_2, a_1, a_0)_{\beta}$ and recall that
-$y = \sum_{i=0}^{2} a_i \beta^i$.  Simply replace $\beta$ with $x$ and the expression is in polynomial basis.  For example, $f(x) = 8x + 9$ is the
-polynomial basis representation for $89$ using radix ten.  That is, $f(10) = 8(10) + 9 = 89$.  
-
-\subsection{Multiplication by $x$}
-
-Given a polynomial in $x$ such as $f(x) = a_n x^n + a_{n-1} x^{n-1} + ... + a_0$ multiplying by $x$ amounts to shifting the coefficients up one 
-degree.  In this case $f(x) \cdot x = a_n x^{n+1} + a_{n-1} x^n + ... + a_0 x$.  From a scalar basis point of view multiplying by $x$ is equivalent to
-multiplying by the integer $\beta$.  
-
-\newpage\begin{figure}[!here]
-\begin{small}
-\begin{center}
-\begin{tabular}{l}
-\hline Algorithm \textbf{mp\_lshd}. \\
-\textbf{Input}.   One mp\_int $a$ and an integer $b$ \\
-\textbf{Output}.  $a \leftarrow a \cdot \beta^b$ (equivalent to multiplication by $x^b$). \\
-\hline \\
-1.  If $b \le 0$ then return(\textit{MP\_OKAY}). \\
-2.  If $a.alloc < a.used + b$ then grow $a$ to at least $a.used + b$ digits.  (\textit{mp\_grow}). \\
-3.  If the reallocation failed return(\textit{MP\_MEM}). \\
-4.  $a.used \leftarrow a.used + b$ \\
-5.  $i \leftarrow a.used - 1$ \\
-6.  $j \leftarrow a.used - 1 - b$ \\
-7.  for $n$ from $a.used - 1$ to $b$ do \\
-\hspace{3mm}7.1  $a_{i} \leftarrow a_{j}$ \\
-\hspace{3mm}7.2  $i \leftarrow i - 1$ \\
-\hspace{3mm}7.3  $j \leftarrow j - 1$ \\
-8.  for $n$ from 0 to $b - 1$ do \\
-\hspace{3mm}8.1  $a_n \leftarrow 0$ \\
-9.  Return(\textit{MP\_OKAY}). \\
-\hline
-\end{tabular}
-\end{center}
-\end{small}
-\caption{Algorithm mp\_lshd}
-\end{figure}
-
-\textbf{Algorithm mp\_lshd.}
-This algorithm multiplies an mp\_int by the $b$'th power of $x$.  This is equivalent to multiplying by $\beta^b$.  The algorithm differs 
-from the other algorithms presented so far as it performs the operation in place instead of storing the result in a separate location.  The
-motivation behind this change is due to the way this function is typically used.  Algorithms such as mp\_add store the result in an optionally
-different third mp\_int because the original inputs are often still required.  Algorithm mp\_lshd (\textit{and similarly algorithm mp\_rshd}) is
-typically used on values where the original value is no longer required.  The algorithm will return success immediately if 
-$b \le 0$ since the rest of the algorithm is only valid when $b > 0$.  
-
-First the destination $a$ is grown as required to accommodate the result.  The counters $i$ and $j$ are used to form a \textit{sliding window} over
-the digits of $a$ of length $b$.  The head of the sliding window is at $i$ (\textit{the leading digit}) and the tail at $j$ (\textit{the trailing digit}).  
-The loop on step 7 copies the digit from the tail to the head.  In each iteration the window is moved down one digit.   The last loop on 
-step 8 sets the lower $b$ digits to zero.
-
-\newpage
-\begin{center}
-\begin{figure}[here]
-\includegraphics{pics/sliding_window}
-\caption{Sliding Window Movement}
-\end{figure}
-\end{center}
-
-\index{bn\_mp\_lshd.c}
-\vspace{+3mm}\begin{small}
-\hspace{-5.1mm}{\bf File}: bn\_mp\_lshd.c
-\vspace{-3mm}
-\begin{alltt}
-016   
-017   /* shift left a certain amount of digits */
-018   int
-019   mp_lshd (mp_int * a, int b)
-020   \{
-021     int     x, res;
-022   
-023     /* if its less than zero return */
-024     if (b <= 0) \{
-025       return MP_OKAY;
-026     \}
-027   
-028     /* grow to fit the new digits */
-029     if (a->alloc < a->used + b) \{
-030        if ((res = mp_grow (a, a->used + b)) != MP_OKAY) \{
-031          return res;
-032        \}
-033     \}
-034   
-035     \{
-036       register mp_digit *top, *bottom;
-037   
-038       /* increment the used by the shift amount then copy upwards */
-039       a->used += b;
-040   
-041       /* top */
-042       top = a->dp + a->used - 1;
-043   
-044       /* base */
-045       bottom = a->dp + a->used - 1 - b;
-046   
-047       /* much like mp_rshd this is implemented using a sliding window
-048        * except the window goes the otherway around.  Copying from
-049        * the bottom to the top.  see bn_mp_rshd.c for more info.
-050        */
-051       for (x = a->used - 1; x >= b; x--) \{
-052         *top-- = *bottom--;
-053       \}
-054   
-055       /* zero the lower digits */
-056       top = a->dp;
-057       for (x = 0; x < b; x++) \{
-058         *top++ = 0;
-059       \}
-060     \}
-061     return MP_OKAY;
-062   \}
-\end{alltt}
-\end{small}
-
-The if statement on line 24 ensures that the $b$ variable is greater than zero.  The \textbf{used} count is incremented by $b$ before
-the copy loop begins.  This eliminates the need for an additional variable in the for loop.  The variable $top$ on line 42 is an alias
-for the leading digit while $bottom$ on line 45 is an alias for the trailing edge.  The aliases form a window of exactly $b$ digits
-over the input.  
-
-\subsection{Division by $x$}
-
-Division by powers of $x$ is easily achieved by shifting the digits right and removing any that will end up to the right of the zero'th digit.  
-
-\newpage\begin{figure}[!here]
-\begin{small}
-\begin{center}
-\begin{tabular}{l}
-\hline Algorithm \textbf{mp\_rshd}. \\
-\textbf{Input}.   One mp\_int $a$ and an integer $b$ \\
-\textbf{Output}.  $a \leftarrow a / \beta^b$ (Divide by $x^b$). \\
-\hline \\
-1.  If $b \le 0$ then return. \\
-2.  If $a.used \le b$ then do \\
-\hspace{3mm}2.1  Zero $a$.  (\textit{mp\_zero}). \\
-\hspace{3mm}2.2  Return. \\
-3.  $i \leftarrow 0$ \\
-4.  $j \leftarrow b$ \\
-5.  for $n$ from 0 to $a.used - b - 1$ do \\
-\hspace{3mm}5.1  $a_i \leftarrow a_j$ \\
-\hspace{3mm}5.2  $i \leftarrow i + 1$ \\
-\hspace{3mm}5.3  $j \leftarrow j + 1$ \\
-6.  for $n$ from $a.used - b$ to $a.used - 1$ do \\
-\hspace{3mm}6.1  $a_n \leftarrow 0$ \\
-7.  $a.used \leftarrow a.used - b$ \\
-8.  Return. \\
-\hline
-\end{tabular}
-\end{center}
-\end{small}
-\caption{Algorithm mp\_rshd}
-\end{figure}
-
-\textbf{Algorithm mp\_rshd.}
-This algorithm divides the input in place by the $b$'th power of $x$.  It is analogous to dividing by $\beta^b$ but much quicker since
-it does not require single precision division.  This algorithm does not actually return an error code as it cannot fail.  
-
-If the input $b$ is less than one the algorithm quickly returns without performing any work.  If the \textbf{used} count is less than or equal
-to the shift count $b$ then it will simply zero the input and return.
-
-After the trivial cases of inputs have been handled the sliding window is set up.  Much like the case of algorithm mp\_lshd a sliding window that
-is $b$ digits wide is used to copy the digits.  Unlike mp\_lshd the window slides in the opposite direction from the trailing to the leading digit.  
-Also the digits are copied from the leading to the trailing edge.
-
-Once the window copy is complete the upper digits must be zeroed and the \textbf{used} count decremented.
-
-\index{bn\_mp\_rshd.c}
-\vspace{+3mm}\begin{small}
-\hspace{-5.1mm}{\bf File}: bn\_mp\_rshd.c
-\vspace{-3mm}
-\begin{alltt}
-016   
-017   /* shift right a certain amount of digits */
-018   void
-019   mp_rshd (mp_int * a, int b)
-020   \{
-021     int     x;
-022   
-023     /* if b <= 0 then ignore it */
-024     if (b <= 0) \{
-025       return;
-026     \}
-027   
-028     /* if b > used then simply zero it and return */
-029     if (a->used <= b) \{
-030       mp_zero (a);
-031       return;
-032     \}
-033   
-034     \{
-035       register mp_digit *bottom, *top;
-036   
-037       /* shift the digits down */
-038   
-039       /* bottom */
-040       bottom = a->dp;
-041   
-042       /* top [offset into digits] */
-043       top = a->dp + b;
-044   
-045       /* this is implemented as a sliding window where 
-046        * the window is b-digits long and digits from 
-047        * the top of the window are copied to the bottom
-048        *
-049        * e.g.
-050   
-051        b-2 | b-1 | b0 | b1 | b2 | ... | bb |   ---->
-052                    /\symbol{92}                   |      ---->
-053                     \symbol{92}-------------------/      ---->
-054        */
-055       for (x = 0; x < (a->used - b); x++) \{
-056         *bottom++ = *top++;
-057       \}
-058   
-059       /* zero the top digits */
-060       for (; x < a->used; x++) \{
-061         *bottom++ = 0;
-062       \}
-063     \}
-064     
-065     /* remove excess digits */
-066     a->used -= b;
-067   \}
-\end{alltt}
-\end{small}
-
-The only noteworthy element of this routine is the lack of a return type.  
-
--- Will update later to give it a return type...Tom
-
-\section{Powers of Two}
-
-Now that algorithms for moving single bits as well as whole digits exist algorithms for moving the ``in between'' distances are required.  For 
-example, to quickly multiply by $2^k$ for any $k$ without using a full multiplier algorithm would prove useful.  Instead of performing single
-shifts $k$ times to achieve a multiplication by $2^{\pm k}$ a mixture of whole digit shifting and partial digit shifting is employed.  
-
-\subsection{Multiplication by Power of Two}
-
-\newpage\begin{figure}[!here]
-\begin{small}
-\begin{center}
-\begin{tabular}{l}
-\hline Algorithm \textbf{mp\_mul\_2d}. \\
-\textbf{Input}.   One mp\_int $a$ and an integer $b$ \\
-\textbf{Output}.  $c \leftarrow a \cdot 2^b$. \\
-\hline \\
-1.  $c \leftarrow a$.  (\textit{mp\_copy}) \\
-2.  If $c.alloc < c.used + \lfloor b / lg(\beta) \rfloor + 2$ then grow $c$ accordingly. \\
-3.  If the reallocation failed return(\textit{MP\_MEM}). \\
-4.  If $b \ge lg(\beta)$ then \\
-\hspace{3mm}4.1  $c \leftarrow c \cdot \beta^{\lfloor b / lg(\beta) \rfloor}$ (\textit{mp\_lshd}). \\
-\hspace{3mm}4.2  If step 4.1 failed return(\textit{MP\_MEM}). \\
-5.  $d \leftarrow b \mbox{ (mod }lg(\beta)\mbox{)}$ \\
-6.  If $d \ne 0$ then do \\
-\hspace{3mm}6.1  $mask \leftarrow 2^d$ \\
-\hspace{3mm}6.2  $r \leftarrow 0$ \\
-\hspace{3mm}6.3  for $n$ from $0$ to $c.used - 1$ do \\
-\hspace{6mm}6.3.1  $rr \leftarrow c_n >> (lg(\beta) - d) \mbox{ (mod }mask\mbox{)}$ \\
-\hspace{6mm}6.3.2  $c_n \leftarrow (c_n << d) + r \mbox{ (mod }\beta\mbox{)}$ \\
-\hspace{6mm}6.3.3  $r \leftarrow rr$ \\
-\hspace{3mm}6.4  If $r > 0$ then do \\
-\hspace{6mm}6.4.1  $c_{c.used} \leftarrow r$ \\
-\hspace{6mm}6.4.2  $c.used \leftarrow c.used + 1$ \\
-7.  Return(\textit{MP\_OKAY}). \\
-\hline
-\end{tabular}
-\end{center}
-\end{small}
-\caption{Algorithm mp\_mul\_2d}
-\end{figure}
-
-\textbf{Algorithm mp\_mul\_2d.}
-This algorithm multiplies $a$ by $2^b$ and stores the result in $c$.  The algorithm uses algorithm mp\_lshd and a derivative of algorithm mp\_mul\_2 to
-quickly compute the product.
-
-First the algorithm will multiply $a$ by $x^{\lfloor b / lg(\beta) \rfloor}$ which will ensure that the remainder multiplicand is less than 
-$\beta$.  For example, if $b = 37$ and $\beta = 2^{28}$ then this step will multiply by $x$ leaving a multiplication by $2^{37 - 28} = 2^{9}$ 
-left.
-
-After the digits have been shifted appropriately at most $lg(\beta) - 1$ shifts are left to perform.  Step 5 calculates the number of remaining shifts 
-required.  If it is non-zero a modified shift loop is used to calculate the remaining product.  
-Essentially the loop is a generic version of algorithm mp\_mul\_2 designed to handle any shift count in the range $1 \le x < lg(\beta)$.  The $mask$
-variable is used to extract the upper $d$ bits to form the carry for the next iteration.  
-
-This algorithm is loosely measured as a $O(2n)$ algorithm which means that if the input is $n$-digits that it takes $2n$ ``time'' to 
-complete.  It is possible to optimize this algorithm down to a $O(n)$ algorithm at a cost of making the algorithm slightly harder to follow.
-
-\index{bn\_mp\_mul\_2d.c}
-\vspace{+3mm}\begin{small}
-\hspace{-5.1mm}{\bf File}: bn\_mp\_mul\_2d.c
-\vspace{-3mm}
-\begin{alltt}
-016   
-017   /* NOTE:  This routine requires updating.  For instance the c->used = c->all
-      oc bit
-018      is wrong.  We should just shift c->used digits then set the carry as c->d
-      p[c->used] = carry
-019    
-020      To be fixed for LTM 0.18
-021    */
-022   
-023   /* shift left by a certain bit count */
-024   int
-025   mp_mul_2d (mp_int * a, int b, mp_int * c)
-026   \{
-027     mp_digit d;
-028     int      res;
-029   
-030     /* copy */
-031     if (a != c) \{
-032        if ((res = mp_copy (a, c)) != MP_OKAY) \{
-033          return res;
-034        \}
-035     \}
-036   
-037     if (c->alloc < (int)(c->used + b/DIGIT_BIT + 2)) \{
-038        if ((res = mp_grow (c, c->used + b / DIGIT_BIT + 2)) != MP_OKAY) \{
-039          return res;
-040        \}
-041     \}
-042   
-043     /* shift by as many digits in the bit count */
-044     if (b >= (int)DIGIT_BIT) \{
-045       if ((res = mp_lshd (c, b / DIGIT_BIT)) != MP_OKAY) \{
-046         return res;
-047       \}
-048     \}
-049     c->used = c->alloc;
-050   
-051     /* shift any bit count < DIGIT_BIT */
-052     d = (mp_digit) (b % DIGIT_BIT);
-053     if (d != 0) \{
-054       register mp_digit *tmpc, mask, r, rr;
-055       register int x;
-056   
-057       /* bitmask for carries */
-058       mask = (((mp_digit)1) << d) - 1;
-059   
-060       /* alias */
-061       tmpc = c->dp;
-062   
-063       /* carry */
-064       r    = 0;
-065       for (x = 0; x < c->used; x++) \{
-066         /* get the higher bits of the current word */
-067         rr = (*tmpc >> (DIGIT_BIT - d)) & mask;
-068   
-069         /* shift the current word and OR in the carry */
-070         *tmpc = ((*tmpc << d) | r) & MP_MASK;
-071         ++tmpc;
-072   
-073         /* set the carry to the carry bits of the current word */
-074         r = rr;
-075       \}
-076     \}
-077     mp_clamp (c);
-078     return MP_OKAY;
-079   \}
-\end{alltt}
-\end{small}
-
-Notes to be revised when code is updated. -- Tom
-
-\subsection{Division by Power of Two}
-
-\newpage\begin{figure}[!here]
-\begin{small}
-\begin{center}
-\begin{tabular}{l}
-\hline Algorithm \textbf{mp\_div\_2d}. \\
-\textbf{Input}.   One mp\_int $a$ and an integer $b$ \\
-\textbf{Output}.  $c \leftarrow \lfloor a / 2^b \rfloor, d \leftarrow a \mbox{ (mod }2^b\mbox{)}$. \\
-\hline \\
-1.  If $b \le 0$ then do \\
-\hspace{3mm}1.1  $c \leftarrow a$ (\textit{mp\_copy}) \\
-\hspace{3mm}1.2  $d \leftarrow 0$ (\textit{mp\_zero}) \\
-\hspace{3mm}1.3  Return(\textit{MP\_OKAY}). \\
-2.  $c \leftarrow a$ \\
-3.  $d \leftarrow a \mbox{ (mod }2^b\mbox{)}$ (\textit{mp\_mod\_2d}) \\
-4.  If $b \ge lg(\beta)$ then do \\
-\hspace{3mm}4.1  $c \leftarrow \lfloor c/\beta^{\lfloor b/lg(\beta) \rfloor} \rfloor$ (\textit{mp\_rshd}). \\
-5.  $k \leftarrow b \mbox{ (mod }lg(\beta)\mbox{)}$ \\
-6.  If $k \ne 0$ then do \\
-\hspace{3mm}6.1  $mask \leftarrow 2^k$ \\
-\hspace{3mm}6.2  $r \leftarrow 0$ \\
-\hspace{3mm}6.3  for $n$ from $c.used - 1$ to $0$ do \\
-\hspace{6mm}6.3.1  $rr \leftarrow c_n \mbox{ (mod }mask\mbox{)}$ \\
-\hspace{6mm}6.3.2  $c_n \leftarrow (c_n >> k) + (r << (lg(\beta) - k))$ \\
-\hspace{6mm}6.3.3  $r \leftarrow rr$ \\
-7.  Clamp excess digits of $c$.  (\textit{mp\_clamp}) \\
-8.  Return(\textit{MP\_OKAY}). \\
-\hline
-\end{tabular}
-\end{center}
-\end{small}
-\caption{Algorithm mp\_div\_2d}
-\end{figure}
-
-\textbf{Algorithm mp\_div\_2d.}
-This algorithm will divide an input $a$ by $2^b$ and produce the quotient and remainder.  The algorithm is designed much like algorithm 
-mp\_mul\_2d by first using whole digit shifts then single precision shifts.  This algorithm will also produce the remainder of the division
-by using algorithm mp\_mod\_2d.
-
-\index{bn\_mp\_div\_2d.c}
-\vspace{+3mm}\begin{small}
-\hspace{-5.1mm}{\bf File}: bn\_mp\_div\_2d.c
-\vspace{-3mm}
-\begin{alltt}
-016   
-017   /* shift right by a certain bit count (store quotient in c, optional remaind
-      er in d) */
-018   int
-019   mp_div_2d (mp_int * a, int b, mp_int * c, mp_int * d)
-020   \{
-021     mp_digit D, r, rr;
-022     int     x, res;
-023     mp_int  t;
-024   
-025   
-026     /* if the shift count is <= 0 then we do no work */
-027     if (b <= 0) \{
-028       res = mp_copy (a, c);
-029       if (d != NULL) \{
-030         mp_zero (d);
-031       \}
-032       return res;
-033     \}
-034   
-035     if ((res = mp_init (&t)) != MP_OKAY) \{
-036       return res;
-037     \}
-038   
-039     /* get the remainder */
-040     if (d != NULL) \{
-041       if ((res = mp_mod_2d (a, b, &t)) != MP_OKAY) \{
-042         mp_clear (&t);
-043         return res;
-044       \}
-045     \}
-046   
-047     /* copy */
-048     if ((res = mp_copy (a, c)) != MP_OKAY) \{
-049       mp_clear (&t);
-050       return res;
-051     \}
-052   
-053     /* shift by as many digits in the bit count */
-054     if (b >= (int)DIGIT_BIT) \{
-055       mp_rshd (c, b / DIGIT_BIT);
-056     \}
-057   
-058     /* shift any bit count < DIGIT_BIT */
-059     D = (mp_digit) (b % DIGIT_BIT);
-060     if (D != 0) \{
-061       register mp_digit *tmpc, mask;
-062   
-063       /* mask */
-064       mask = (((mp_digit)1) << D) - 1;
-065   
-066       /* alias */
-067       tmpc = c->dp + (c->used - 1);
-068   
-069       /* carry */
-070       r = 0;
-071       for (x = c->used - 1; x >= 0; x--) \{
-072         /* get the lower  bits of this word in a temp */
-073         rr = *tmpc & mask;
-074   
-075         /* shift the current word and mix in the carry bits from the previous 
-      word */
-076         *tmpc = (*tmpc >> D) | (r << (DIGIT_BIT - D));
-077         --tmpc;
-078   
-079         /* set the carry to the carry bits of the current word found above */
-080         r = rr;
-081       \}
-082     \}
-083     mp_clamp (c);
-084     if (d != NULL) \{
-085       mp_exch (&t, d);
-086     \}
-087     mp_clear (&t);
-088     return MP_OKAY;
-089   \}
-\end{alltt}
-\end{small}
-
-The implementation of algorithm mp\_div\_2d is slightly different than the algorithm specifies.  The remainder $d$ may be optionally 
-ignored by passing \textbf{NULL} as the pointer to the mp\_int variable.    The temporary mp\_int variable $t$ is used to hold the 
-result of the remainder operation until the end.  This allows $d$ and $a$ to represent the same mp\_int without modifying $a$ before
-the quotient is obtained.
-
-The remainder of the source code is essentially the same as the source code for mp\_mul\_2d.  (-- Fix this paragraph up later, Tom).
-
-\subsection{Remainder of Division by Power of Two}
-
-The last algorithm in the series of polynomial basis power of two algorithms is calculating the remainder of division by $2^b$.  This
-algorithm benefits from the fact that in two's complement arithmetic $a \mbox{ (mod }2^b\mbox{)}$ is the same as $a$ AND $2^b - 1$.  
-
-\begin{figure}[!here]
-\begin{small}
-\begin{center}
-\begin{tabular}{l}
-\hline Algorithm \textbf{mp\_mod\_2d}. \\
-\textbf{Input}.   One mp\_int $a$ and an integer $b$ \\
-\textbf{Output}.  $c \leftarrow a \mbox{ (mod }2^b\mbox{)}$. \\
-\hline \\
-1.  If $b \le 0$ then do \\
-\hspace{3mm}1.1  $c \leftarrow 0$ (\textit{mp\_zero}) \\
-\hspace{3mm}1.2  Return(\textit{MP\_OKAY}). \\
-2.  If $b > a.used \cdot lg(\beta)$ then do \\
-\hspace{3mm}2.1  $c \leftarrow a$ (\textit{mp\_copy}) \\
-\hspace{3mm}2.2  Return the result of step 2.1. \\
-3.  $c \leftarrow a$ \\
-4.  If step 3 failed return(\textit{MP\_MEM}). \\
-5.  for $n$ from $\lceil b / lg(\beta) \rceil$ to $c.used - 1$ do \\
-\hspace{3mm}5.1  $c_n \leftarrow 0$ \\
-6.  $k \leftarrow b \mbox{ (mod }lg(\beta)\mbox{)}$ \\
-7.  $c_{\lfloor b / lg(\beta) \rfloor} \leftarrow c_{\lfloor b / lg(\beta) \rfloor} \mbox{ (mod }2^{k}\mbox{)}$. \\
-8.  Clamp excess digits of $c$.  (\textit{mp\_clamp}) \\
-9.  Return(\textit{MP\_OKAY}). \\
-\hline
-\end{tabular}
-\end{center}
-\end{small}
-\caption{Algorithm mp\_mod\_2d}
-\end{figure}
-
-\textbf{Algorithm mp\_mod\_2d.}
-This algorithm will quickly calculate the value of $a \mbox{ (mod }2^b\mbox{)}$.  First if $b$ is less than or equal to zero the 
-result is set to zero.  If $b$ is greater than the number of bits in $a$ then it simply copies $a$ to $c$ and returns.  Otherwise, $a$ 
-is copied to $c$, leading digits are removed and the remaining leading digit is trimmed to the exact bit count.
-
-\index{bn\_mp\_mod\_2d.c}
-\vspace{+3mm}\begin{small}
-\hspace{-5.1mm}{\bf File}: bn\_mp\_mod\_2d.c
-\vspace{-3mm}
-\begin{alltt}
-016   
-017   /* calc a value mod 2^b */
-018   int
-019   mp_mod_2d (mp_int * a, int b, mp_int * c)
-020   \{
-021     int     x, res;
-022   
-023   
-024     /* if b is <= 0 then zero the int */
-025     if (b <= 0) \{
-026       mp_zero (c);
-027       return MP_OKAY;
-028     \}
-029   
-030     /* if the modulus is larger than the value than return */
-031     if (b > (int) (a->used * DIGIT_BIT)) \{
-032       res = mp_copy (a, c);
-033       return res;
-034     \}
-035   
-036     /* copy */
-037     if ((res = mp_copy (a, c)) != MP_OKAY) \{
-038       return res;
-039     \}
-040   
-041     /* zero digits above the last digit of the modulus */
-042     for (x = (b / DIGIT_BIT) + ((b % DIGIT_BIT) == 0 ? 0 : 1); x < c->used; x+
-      +) \{
-043       c->dp[x] = 0;
-044     \}
-045     /* clear the digit that is not completely outside/inside the modulus */
-046     c->dp[b / DIGIT_BIT] &=
-047       (mp_digit) ((((mp_digit) 1) << (((mp_digit) b) % DIGIT_BIT)) - ((mp_digi
-      t) 1));
-048     mp_clamp (c);
-049     return MP_OKAY;
-050   \}
-\end{alltt}
-\end{small}
-
--- Add comments later, Tom.
-
-\section*{Exercises}
-\begin{tabular}{cl}
-$\left [ 3 \right ] $ & Devise an algorithm that performs $a \cdot 2^b$ for generic values of $b$ \\
-                      & in $O(n)$ time. \\
-                      &\\
-$\left [ 3 \right ] $ & Devise an efficient algorithm to multiply by small low hamming  \\
-                      & weight values such as $3$, $5$ and $9$.  Extend it to handle all values \\
-                      & upto $64$ with a hamming weight less than three. \\
-                      &\\
-$\left [ 2 \right ] $ & Modify the preceding algorithm to handle values of the form \\
-                      & $2^k - 1$ as well. \\
-                      &\\
-$\left [ 3 \right ] $ & Using only algorithms mp\_mul\_2, mp\_div\_2 and mp\_add create an \\
-                      & algorithm to multiply two integers in roughly $O(2n^2)$ time for \\
-                      & any $n$-bit input.  Note that the time of addition is ignored in the \\
-                      & calculation.  \\
-                      & \\
-$\left [ 5 \right ] $ & Improve the previous algorithm to have a working time of at most \\
-                      & $O \left (2^{(k-1)}n + \left ({2n^2 \over k} \right ) \right )$ for an appropriate choice of $k$.  Again ignore \\
-                      & the cost of addition. \\
-                      & \\
-$\left [ 2 \right ] $ & Devise a chart to find optimal values of $k$ for the previous problem \\
-                      & for $n = 64 \ldots 1024$ in steps of $64$. \\
-                      & \\
-$\left [ 2 \right ] $ & Using only algorithms mp\_abs and mp\_sub devise another method for \\
-                      & calculating the result of a signed comparison. \\
-                      &
-\end{tabular}
-
-\chapter{Multiplication and Squaring}
-\section{The Multipliers}
-For most number theoretic problems including certain public key cryptographic algorithms, the ``multipliers'' form the most important subset of 
-algorithms of any multiple precision integer package.  The set of multiplier algorithms include integer multiplication, squaring and modular reduction 
-where in each of the algorithms single precision multiplication is the dominant operation performed.  This chapter will discuss integer multiplication 
-and squaring, leaving modular reductions for the subsequent chapter.  
-
-The importance of the multiplier algorithms is for the most part driven by the fact that certain popular public key algorithms are based on modular 
-exponentiation, that is computing $d \equiv a^b \mbox{ (mod }c\mbox{)}$ for some arbitrary choice of $a$, $b$, $c$ and $d$.  During a modular
-exponentiation the majority\footnote{Roughly speaking a modular exponentiation will spend about 40\% of the time performing modular reductions, 
-35\% of the time performing squaring and 25\% of the time performing multiplications.} of the processor time is spent performing single precision 
-multiplications.
-
-For centuries general purpose multiplication has required a lengthy $O(n^2)$ process, whereby each digit of one multiplicand has to be multiplied 
-against every digit of the other multiplicand.  Traditional long-hand multiplication is based on this process;  while the techniques can differ the 
-overall algorithm used is essentially the same.  Only ``recently'' have faster algorithms been studied.  First Karatsuba multiplication was discovered in 
-1962.  This algorithm can multiply two numbers with considerably fewer single precision multiplications when compared to the long-hand approach.  
-This technique led to the discovery of polynomial basis algorithms (\textit{good reference?}) and subsequently Fourier Transform based solutions.  
-
-\section{Multiplication}
-\subsection{The Baseline Multiplication}
-\index{baseline multiplication}
-Computing the product of two integers in software can be achieved using a trivial adaptation of the standard $O(n^2)$ long-hand multiplication
-algorithm that school children are taught.  The algorithm is considered an $O(n^2)$ algorithm since for two $n$-digit inputs $n^2$ single precision 
-multiplications are required.  More specifically for a $m$ and $n$ digit input $m \cdot n$ single precision multiplications are required.  To 
-simplify most discussions, it will be assumed that the inputs have comparable number of digits.  
-
-The ``baseline multiplication'' algorithm is designed to act as the ``catch-all'' algorithm, only to be used when the faster algorithms cannot be 
-used.  This algorithm does not use any particularly interesting optimizations and should ideally be avoided if possible.    One important 
-facet of this algorithm, is that it has been modified to only produce a certain amount of output digits as resolution.  The importance of this 
-modification will become evident during the discussion of Barrett modular reduction.  Recall that for a $n$ and $m$ digit input the product 
-will be at most $n + m$ digits.  Therefore, this algorithm can be reduced to a full multiplier by having it produce $n + m$ digits of the product.  
-
-Recall from sub-section 5.2.2 the definition of $\gamma$ as the number of bits in the type \textbf{mp\_digit}.  We shall now extend the variable set to 
-include $\alpha$ which shall represent the number of bits in the type \textbf{mp\_word}.  This implies that $2^{\alpha} > 2 \cdot \beta^2$.  The 
-constant $\delta = 2^{\alpha - 2lg(\beta)}$ will represent the maximal weight of any column in a product (\textit{see sub-section 6.2.2 for more information}).
-
-\newpage\begin{figure}[!here]
-\begin{small}
-\begin{center}
-\begin{tabular}{l}
-\hline Algorithm \textbf{s\_mp\_mul\_digs}. \\
-\textbf{Input}.   mp\_int $a$, mp\_int $b$ and an integer $digs$ \\
-\textbf{Output}.  $c \leftarrow \vert a \vert \cdot \vert b \vert \mbox{ (mod }\beta^{digs}\mbox{)}$. \\
-\hline \\
-1.  If min$(a.used, b.used) < \delta$ then do \\
-\hspace{3mm}1.1  Calculate $c = \vert a \vert \cdot \vert b \vert$ by the Comba method (\textit{see algorithm~\ref{fig:COMBAMULT}}).  \\
-\hspace{3mm}1.2  Return the result of step 1.1 \\
-\\
-Allocate and initialize a temporary mp\_int. \\
-2.  Init $t$ to be of size $digs$ \\
-3.  If step 2 failed return(\textit{MP\_MEM}). \\
-4.  $t.used \leftarrow digs$ \\
-\\
-Compute the product. \\
-5.  for $ix$ from $0$ to $a.used - 1$ do \\
-\hspace{3mm}5.1  $u \leftarrow 0$ \\
-\hspace{3mm}5.2  $pb \leftarrow \mbox{min}(b.used, digs - ix)$ \\
-\hspace{3mm}5.3  If $pb < 1$ then goto step 6. \\
-\hspace{3mm}5.4  for $iy$ from $0$ to $pb - 1$ do \\
-\hspace{6mm}5.4.1  $\hat r \leftarrow t_{iy + ix} + a_{ix} \cdot b_{iy} + u$ \\
-\hspace{6mm}5.4.2  $t_{iy + ix} \leftarrow \hat r \mbox{ (mod }\beta\mbox{)}$ \\
-\hspace{6mm}5.4.3  $u \leftarrow \lfloor \hat r / \beta \rfloor$ \\
-\hspace{3mm}5.5  if $ix + pb < digs$ then do \\
-\hspace{6mm}5.5.1  $t_{ix + pb} \leftarrow u$ \\
-6.  Clamp excess digits of $t$. \\
-7.  Swap $c$ with $t$ \\
-8.  Clear $t$ \\
-9.  Return(\textit{MP\_OKAY}). \\
-\hline
-\end{tabular}
-\end{center}
-\end{small}
-\caption{Algorithm s\_mp\_mul\_digs}
-\end{figure}
-
-\textbf{Algorithm s\_mp\_mul\_digs.}
-This algorithm computes the unsigned product of two inputs $a$ and $b$, limited to an output precision of $digs$ digits.  While it may seem
-a bit awkward to modify the function from its simple $O(n^2)$ description, the usefulness of partial multipliers will arise in a subsequent 
-algorithm.  The algorithm is loosely based on algorithm 14.12 from \cite[pp. 595]{HAC} and is similar to Algorithm M of Knuth \cite[pp. 268]{TAOCPV2}.  
-Algorithm s\_mp\_mul\_digs differs from these cited references since it can produce a variable output precision regardless of the precision of the 
-inputs.
-
-The first thing this algorithm checks for is whether a Comba multiplier can be used instead.   If the minimum digit count of either
-input is less than $\delta$, then the Comba method may be used instead.    After the Comba method is ruled out, the baseline algorithm begins.  A 
-temporary mp\_int variable $t$ is used to hold the intermediate result of the product.  This allows the algorithm to be used to 
-compute products when either $a = c$ or $b = c$ without overwriting the inputs.  
-
-All of step 5 is the infamous $O(n^2)$ multiplication loop slightly modified to only produce upto $digs$ digits of output.  The $pb$ variable
-is given the count of digits to read from $b$ inside the nested loop.  If $pb < 1$ then no more output digits can be produced and the algorithm
-will exit the loop.  The best way to think of the loops is as a series of $pb \times 1$ multiplications.    That is, in each pass of the 
-innermost loop $a_{ix}$ is multiplied against $b$ and the result is added (\textit{with an appropriate shift}) to $t$.  
-
-For example, consider multiplying $576$ by $241$.  That is equivalent to computing $10^0(1)(576) + 10^1(4)(576) + 10^2(2)(576)$ which is best
-visualized in the following table.
-
-\begin{figure}[here]
-\begin{center}
-\begin{tabular}{|c|c|c|c|c|c|l|}
-\hline   &&          & 5 & 7 & 6 & \\
-\hline   $\times$&&  & 2 & 4 & 1 & \\
-\hline &&&&&&\\
-  &&          & 5 & 7 & 6 & $10^0(1)(576)$ \\
-  &2 &   3    & 6 & 1 & 6 & $10^1(4)(576) + 10^0(1)(576)$ \\
-  1 & 3 & 8 & 8 & 1 & 6 &   $10^2(2)(576) + 10^1(4)(576) + 10^0(1)(576)$ \\
-\hline  
-\end{tabular}
-\end{center}
-\caption{Long-Hand Multiplication Diagram}
-\end{figure}
-
-Each row of the product is added to the result after being shifted to the left (\textit{multiplied by a power of the radix}) by the appropriate 
-count.  That is, in pass $ix$ of the inner loop the product is added starting at the $ix$'th digit of the result.
-
-Step 5.4.1 introduces the hat symbol (\textit{e.g. $\hat r$}) which represents a double precision variable.  The multiplication on that step
-is assumed to be a double wide output single precision multiplication.  That is, two single precision variables are multiplied to produce a
-double precision result.  The step is somewhat optimized from a long-hand multiplication algorithm because the carry from the addition in step
-5.4.1 is propagated through the nested loop.  If the carry was not propagated immediately it would overflow the single precision digit 
-$t_{ix+iy}$ and the result would be lost.  
-
-At step 5.5 the nested loop is finished and any carry that was left over should be forwarded.  The carry does not have to be added to the $ix+pb$'th
-digit since that digit is assumed to be zero at this point.  However, if $ix + pb \ge digs$ the carry is not set as it would make the result
-exceed the precision requested.
-
-\index{bn\_s\_mp\_mul\_digs.c}
-\vspace{+3mm}\begin{small}
-\hspace{-5.1mm}{\bf File}: bn\_s\_mp\_mul\_digs.c
-\vspace{-3mm}
-\begin{alltt}
-016   
-017   /* multiplies |a| * |b| and only computes upto digs digits of result
-018    * HAC pp. 595, Algorithm 14.12  Modified so you can control how 
-019    * many digits of output are created.
-020    */
-021   int
-022   s_mp_mul_digs (mp_int * a, mp_int * b, mp_int * c, int digs)
-023   \{
-024     mp_int  t;
-025     int     res, pa, pb, ix, iy;
-026     mp_digit u;
-027     mp_word r;
-028     mp_digit tmpx, *tmpt, *tmpy;
-029   
-030     /* can we use the fast multiplier? */
-031     if (((digs) < MP_WARRAY) &&
-032         MIN (a->used, b->used) < 
-033             (1 << ((CHAR_BIT * sizeof (mp_word)) - (2 * DIGIT_BIT)))) \{
-034       return fast_s_mp_mul_digs (a, b, c, digs);
-035     \}
-036   
-037     if ((res = mp_init_size (&t, digs)) != MP_OKAY) \{
-038       return res;
-039     \}
-040     t.used = digs;
-041   
-042     /* compute the digits of the product directly */
-043     pa = a->used;
-044     for (ix = 0; ix < pa; ix++) \{
-045       /* set the carry to zero */
-046       u = 0;
-047   
-048       /* limit ourselves to making digs digits of output */
-049       pb = MIN (b->used, digs - ix);
-050   
-051       /* setup some aliases */
-052       /* copy of the digit from a used within the nested loop */
-053       tmpx = a->dp[ix];
-054       
-055       /* an alias for the destination shifted ix places */
-056       tmpt = t.dp + ix;
-057       
-058       /* an alias for the digits of b */
-059       tmpy = b->dp;
-060   
-061       /* compute the columns of the output and propagate the carry */
-062       for (iy = 0; iy < pb; iy++) \{
-063         /* compute the column as a mp_word */
-064         r = ((mp_word) *tmpt) + 
-065             ((mp_word) tmpx) * ((mp_word) * tmpy++) + 
-066             ((mp_word) u);
-067   
-068         /* the new column is the lower part of the result */
-069         *tmpt++ = (mp_digit) (r & ((mp_word) MP_MASK));
-070   
-071         /* get the carry word from the result */
-072         u = (mp_digit) (r >> ((mp_word) DIGIT_BIT));
-073       \}
-074       /* set carry if it is placed below digs */
-075       if (ix + iy < digs) \{
-076         *tmpt = u;
-077       \}
-078     \}
-079   
-080     mp_clamp (&t);
-081     mp_exch (&t, c);
-082   
-083     mp_clear (&t);
-084     return MP_OKAY;
-085   \}
-\end{alltt}
-\end{small}
-
-Lines 31 to 35 determine if the Comba method can be used first.  The conditions for using the Comba routine are that min$(a.used, b.used) < \delta$ and
-the number of digits of output is less than \textbf{MP\_WARRAY}.  This new constant is used to control 
-the stack usage in the Comba routines.  By default it is set to $\delta$ but can be reduced when memory is at a premium.
-
-Of particular importance is the calculation of the $ix+iy$'th column on lines 64, 65 and 66.  Note how all of the
-variables are cast to the type \textbf{mp\_word}, which is also the type of variable $\hat r$.  That is to ensure that double precision operations 
-are used instead of single precision.  The multiplication on line 65 makes use of a specific GCC optimizer behaviour.  On the outset it looks like 
-the compiler will have to use a double precision multiplication to produce the result required.  Such an operation would be horribly slow on most 
-processors and drag this to a crawl.  However, GCC is smart enough to realize that double wide output single precision multipliers can be used.  For 
-example, the instruction ``MUL'' on the x86 processor can multiply two 32-bit values and produce a 64-bit result.  
-
-\subsection{Faster Multiplication by the ``Comba'' Method}
-
-One of the huge drawbacks of the ``baseline'' algorithms is that at the $O(n^2)$ level the carry must be computed and propagated upwards.  This
-makes the nested loop very sequential and hard to unroll and implement in parallel.  The ``Comba'' \cite{COMBA} method is named after little known 
-(\textit{in cryptographic venues}) Paul G. Comba who described a method of implementing fast multipliers that do not require nested 
-carry fixup operations.  As an interesting aside it seems that Paul Barrett describes a similar technique in
-his 1986 paper \cite{BARRETT} written five years before.
-
-At the heart of the Comba technique is once again the long-hand algorithm.  Except in this case a slight twist is placed on how
-the columns of the result are produced.  In the standard long-hand algorithm rows of products are produced then added together to form the 
-final result.  In the baseline algorithm the columns are added together after each iteration to get the result instantaneously.  
-
-In the Comba algorithm the columns of the result are produced entirely independently of each other.  That is at the $O(n^2)$ level a 
-simple multiplication and addition step is performed.  The carries of the columns are propagated after the nested loop to reduce the amount
-of work required. Succinctly, the first step of the algorithm is to compute the product vector $\vec x$ as follows. 
-
-\begin{equation}
-\vec x_n = \sum_{i+j = n} a_ib_j, \forall n \in \lbrace 0, 1, 2, \ldots, i + j \rbrace
-\end{equation}
-
-Where $\vec x_n$ is the $n'th$ column of the output vector.  Consider the following example which computes the vector $\vec x$ for the multiplication
-of $576$ and $241$.  
-
-\newpage\begin{figure}[here]
-\begin{small}
-\begin{center}
-\begin{tabular}{|c|c|c|c|c|c|}
-  \hline &          & 5 & 7 & 6 & First Input\\
-  \hline $\times$ & & 2 & 4 & 1 & Second Input\\
-\hline            &                        & $1 \cdot 5 = 5$   & $1 \cdot 7 = 7$   & $1 \cdot 6 = 6$ & First pass \\
-                  &  $4 \cdot 5 = 20$      & $4 \cdot 7+5=33$  & $4 \cdot 6+7=31$  & 6               & Second pass \\
-   $2 \cdot 5 = 10$ &  $2 \cdot 7 + 20 = 34$ & $2 \cdot 6+33=45$ & 31                & 6             & Third pass \\
-\hline 10 & 34 & 45 & 31 & 6 & Final Result \\   
-\hline   
-\end{tabular}
-\end{center}
-\end{small}
-\caption{Comba Multiplication Diagram}
-\end{figure}
-
-At this point the vector $x = \left < 10, 34, 45, 31, 6 \right >$ is the result of the first step of the Comba multiplier.  
-Now the columns must be fixed by propagating the carry upwards.  The resultant vector will have one extra dimension over the input vector which is
-congruent to adding a leading zero digit.
-
-\begin{figure}[!here]
-\begin{small}
-\begin{center}
-\begin{tabular}{l}
-\hline Algorithm \textbf{Comba Fixup}. \\
-\textbf{Input}.   Vector $\vec x$ of dimension $k$ \\
-\textbf{Output}.  Vector $\vec x$ such that the carries have been propagated. \\
-\hline \\
-1.  for $n$ from $0$ to $k - 1$ do \\
-\hspace{3mm}1.1 $\vec x_{n+1} \leftarrow \vec x_{n+1} + \lfloor \vec x_{n}/\beta \rfloor$ \\
-\hspace{3mm}1.2 $\vec x_{n} \leftarrow \vec x_{n} \mbox{ (mod }\beta\mbox{)}$ \\
-2.  Return($\vec x$). \\
-\hline
-\end{tabular}
-\end{center}
-\end{small}
-\caption{Algorithm Comba Fixup}
-\end{figure}
-
-With that algorithm and $k = 5$ and $\beta = 10$ the following vector is produced $\vec x= \left < 1, 3, 8, 8, 1, 6 \right >$.  In this case 
-$241 \cdot 576$ is in fact $138816$ and the procedure succeeded.  If the algorithm is correct and as will be demonstrated shortly more
-efficient than the baseline algorithm why not simply always use this algorithm?
-
-\subsubsection{Column Weight.}
-At the nested $O(n^2)$ level the Comba method adds the product of two single precision variables to each column of the output 
-independently.  A serious obstacle is if the carry is lost, due to lack of precision before the algorithm has a chance to fix
-the carries.  For example, in the multiplication of two three-digit numbers the third column of output will be the sum of
-three single precision multiplications.  If the precision of the accumulator for the output digits is less than $3 \cdot (\beta - 1)^2$ then
-an overflow can occur and the carry information will be lost.  For any $m$ and $n$ digit inputs the maximum weight of any column is 
-min$(m, n)$ which is fairly obvious.
-
-The maximum number of terms in any column of a product is known as the ``column weight'' and strictly governs when the algorithm can be used.  Recall
-from earlier that a double precision type has $\alpha$ bits of resolution and a single precision digit has $lg(\beta)$ bits of precision.  Given these
-two quantities we must not violate the following
-
-\begin{equation}
-k \cdot \left (\beta - 1 \right )^2 < 2^{\alpha}
-\end{equation}
-
-Which reduces to 
-
-\begin{equation}
-k \cdot \left ( \beta^2 - 2\beta + 1 \right ) < 2^{\alpha}
-\end{equation}
-
-Let $\rho = lg(\beta)$ represent the number of bits in a single precision digit.  By further re-arrangement of the equation the final solution is
-found.
-
-\begin{equation}
-k  < {{2^{\alpha}} \over {\left (2^{2\rho} - 2^{\rho + 1} + 1 \right )}}
-\end{equation}
-
-The defaults for LibTomMath are $\beta = 2^{28}$ and $\alpha = 64$ which means that $k$ is bounded by $k < 257$.  In this configuration 
-the smaller input may not have more than $256$ digits if the Comba method is to be used.  This is quite satisfactory for most applications since 
-$256$ digits would allow for numbers in the range of $0 \le x < 2^{7168}$, which is much larger than most public key cryptographic algorithms require.  
-
-\newpage\begin{figure}[!here]
-\begin{small}
-\begin{center}
-\begin{tabular}{l}
-\hline Algorithm \textbf{fast\_s\_mp\_mul\_digs}. \\
-\textbf{Input}.   mp\_int $a$, mp\_int $b$ and an integer $digs$ \\
-\textbf{Output}.  $c \leftarrow \vert a \vert \cdot \vert b \vert \mbox{ (mod }\beta^{digs}\mbox{)}$. \\
-\hline \\
-Place an array of \textbf{MP\_WARRAY} double precision digits named $\hat W$ on the stack. \\
-1.  If $c.alloc < digs$ then grow $c$ to $digs$ digits. (\textit{mp\_grow}) \\
-2.  If step 1 failed return(\textit{MP\_MEM}).\\
-\\
-Zero the temporary array $\hat W$. \\
-3.  for $n$ from $0$ to $digs - 1$ do \\
-\hspace{3mm}3.1  $\hat W_n \leftarrow 0$ \\
-\\
-Compute the columns. \\
-4.  for $ix$ from $0$ to $a.used - 1$ do \\
-\hspace{3mm}4.1  $pb \leftarrow \mbox{min}(b.used, digs - ix)$ \\
-\hspace{3mm}4.2  If $pb < 1$ then goto step 5. \\
-\hspace{3mm}4.3  for $iy$ from $0$ to $pb - 1$ do \\
-\hspace{6mm}4.3.1  $\hat W_{ix+iy} \leftarrow \hat W_{ix+iy} + a_{ix}b_{iy}$ \\
-\\
-Propagate the carries upwards. \\
-5.  $oldused \leftarrow c.used$ \\
-6.  $c.used \leftarrow digs$ \\
-7.  If $digs > 1$ then do \\
-\hspace{3mm}7.1.  for $ix$ from $1$ to $digs - 1$ do \\
-\hspace{6mm}7.1.1  $\hat W_{ix} \leftarrow \hat W_{ix} + \lfloor \hat W_{ix-1} / \beta \rfloor$ \\
-\hspace{6mm}7.1.2  $c_{ix - 1} \leftarrow \hat W_{ix - 1} \mbox{ (mod }\beta\mbox{)}$ \\
-8.  else do \\
-\hspace{3mm}8.1  $ix \leftarrow 0$ \\
-9.  $c_{ix} \leftarrow \hat W_{ix} \mbox{ (mod }\beta\mbox{)}$ \\
-\\
-Zero excess digits. \\
-10.  If $digs < oldused$ then do \\
-\hspace{3mm}10.1  for $n$ from $digs$ to $oldused - 1$ do \\
-\hspace{6mm}10.1.1  $c_n \leftarrow 0$ \\
-11.  Clamp excessive digits of $c$.  (\textit{mp\_clamp}) \\
-12.  Return(\textit{MP\_OKAY}). \\
-\hline
-\end{tabular}
-\end{center}
-\end{small}
-\caption{Algorithm fast\_s\_mp\_mul\_digs}
-\label{fig:COMBAMULT}
-\end{figure}
-
-\textbf{Algorithm fast\_s\_mp\_mul\_digs.}
-This algorithm performs the unsigned multiplication of $a$ and $b$ using the Comba method limited to $digs$ digits of precision.  The algorithm
-essentially performs the same calculation as algorithm s\_mp\_mul\_digs, just much faster.
-
-The array $\hat W$ is meant to be on the stack when the algorithm is used.  The size of the array does not change which is ideal.  Note also that 
-unlike algorithm s\_mp\_mul\_digs no temporary mp\_int is required since the result is calculated directly in $\hat W$.  
-
-The $O(n^2)$ loop on step four is where the Comba method's advantages begin to show through in comparison to the baseline algorithm.  The lack of
-a carry variable or propagation in this loop allows the loop to be performed with only single precision multiplication and additions.  Now that each
-iteration of the inner loop can be performed independent of the others the inner loop can be performed with a high level of parallelism.
-
-To measure the benefits of the Comba method over the baseline method consider the number of operations that are required.  If the 
-cost in terms of time of a multiply and addition is $p$ and the cost of a carry propagation is $q$ then a baseline multiplication would require 
-$O \left ((p + q)n^2 \right )$ time to multiply two $n$-digit numbers.  The Comba method requires only $O(pn^2 + qn)$ time, however in practice, 
-the speed increase is actually much more.  With $O(n)$ space the algorithm can be reduced to $O(pn + qn)$ time by implementing the $n$ multiply
-and addition operations in the nested loop in parallel.  
-
-\index{bn\_fast\_s\_mp\_mul\_digs.c}
-\vspace{+3mm}\begin{small}
-\hspace{-5.1mm}{\bf File}: bn\_fast\_s\_mp\_mul\_digs.c
-\vspace{-3mm}
-\begin{alltt}
-016   
-017   /* Fast (comba) multiplier
-018    *
-019    * This is the fast column-array [comba] multiplier.  It is 
-020    * designed to compute the columns of the product first 
-021    * then handle the carries afterwards.  This has the effect 
-022    * of making the nested loops that compute the columns very
-023    * simple and schedulable on super-scalar processors.
-024    *
-025    * This has been modified to produce a variable number of 
-026    * digits of output so if say only a half-product is required 
-027    * you don't have to compute the upper half (a feature 
-028    * required for fast Barrett reduction).
-029    *
-030    * Based on Algorithm 14.12 on pp.595 of HAC.
-031    *
-032    */
-033   int
-034   fast_s_mp_mul_digs (mp_int * a, mp_int * b, mp_int * c, int digs)
-035   \{
-036     int     olduse, res, pa, ix;
-037     mp_word W[MP_WARRAY];
-038   
-039     /* grow the destination as required */
-040     if (c->alloc < digs) \{
-041       if ((res = mp_grow (c, digs)) != MP_OKAY) \{
-042         return res;
-043       \}
-044     \}
-045   
-046     /* clear temp buf (the columns) */
-047     memset (W, 0, sizeof (mp_word) * digs);
-048   
-049     /* calculate the columns */
-050     pa = a->used;
-051     for (ix = 0; ix < pa; ix++) \{
-052       /* this multiplier has been modified to allow you to 
-053        * control how many digits of output are produced.  
-054        * So at most we want to make upto "digs" digits of output.
-055        *
-056        * this adds products to distinct columns (at ix+iy) of W
-057        * note that each step through the loop is not dependent on
-058        * the previous which means the compiler can easily unroll
-059        * the loop without scheduling problems
-060        */
-061       \{
-062         register mp_digit tmpx, *tmpy;
-063         register mp_word *_W;
-064         register int iy, pb;
-065   
-066         /* alias for the the word on the left e.g. A[ix] * A[iy] */
-067         tmpx = a->dp[ix];
-068   
-069         /* alias for the right side */
-070         tmpy = b->dp;
-071   
-072         /* alias for the columns, each step through the loop adds a new
-073            term to each column
-074          */
-075         _W = W + ix;
-076   
-077         /* the number of digits is limited by their placement.  E.g.
-078            we avoid multiplying digits that will end up above the # of
-079            digits of precision requested
-080          */
-081         pb = MIN (b->used, digs - ix);
-082   
-083         for (iy = 0; iy < pb; iy++) \{
-084           *_W++ += ((mp_word) tmpx) * ((mp_word) * tmpy++);
-085         \}
-086       \}
-087   
-088     \}
-089   
-090     /* setup dest */
-091     olduse = c->used;
-092     c->used = digs;
-093   
-094     \{
-095       register mp_digit *tmpc;
-096   
-097       /* At this point W[] contains the sums of each column.  To get the
-098        * correct result we must take the extra bits from each column and
-099        * carry them down
-100        *
-101        * Note that while this adds extra code to the multiplier it 
-102        * saves time since the carry propagation is removed from the 
-103        * above nested loop.This has the effect of reducing the work 
-104        * from N*(N+N*c)==N**2 + c*N**2 to N**2 + N*c where c is the 
-105        * cost of the shifting.  On very small numbers this is slower 
-106        * but on most cryptographic size numbers it is faster.
-107        */
-108       tmpc = c->dp;
-109       for (ix = 1; ix < digs; ix++) \{
-110         W[ix] += (W[ix - 1] >> ((mp_word) DIGIT_BIT));
-111         *tmpc++ = (mp_digit) (W[ix - 1] & ((mp_word) MP_MASK));
-112       \}
-113       *tmpc++ = (mp_digit) (W[digs - 1] & ((mp_word) MP_MASK));
-114   
-115       /* clear unused */
-116       for (; ix < olduse; ix++) \{
-117         *tmpc++ = 0;
-118       \}
-119     \}
-120   
-121     mp_clamp (c);
-122     return MP_OKAY;
-123   \}
-\end{alltt}
-\end{small}
-
-The memset on line 47 clears the initial $\hat W$ array to zero in a single step. Like the slower baseline multiplication
-implementation a series of aliases (\textit{lines 67, 70 and 75}) are used to simplify the inner $O(n^2)$ loop.  
-In this case a new alias $\_\hat W$ has been added which refers to the double precision columns offset by $ix$ in each pass.  
-
-The inner loop on lines 83, 84 and 85 is where the algorithm will spend the majority of the time, which is why it has been 
-stripped to the bones of any extra baggage\footnote{Hence the pointer aliases.}.  On x86 processors the multiplication and additions amount to at the 
-very least five instructions (\textit{two loads, two additions, one multiply}) while on the ARMv4 processors they amount to only three 
-(\textit{one load, one store, one multiply-add}).   For both of the x86 and ARMv4 processors the GCC compiler performs a good job at unrolling the loop 
-and scheduling the instructions so there are very few dependency stalls.
-
-In theory the difference between the baseline and comba algorithms is a mere $O(qn)$ time difference.  However, in the $O(n^2)$ nested loop of the
-baseline method there are dependency stalls as the algorithm must wait for the multiplier to finish before propagating the carry to the next 
-digit.  As a result fewer of the often multiple execution units\footnote{The AMD Athlon has three execution units and the Intel P4 has four.} can
-be simultaneously used.  
-
-\subsection{Polynomial Basis Multiplication}
-To break the $O(n^2)$ barrier in multiplication requires a completely different look at integer multiplication.  In the following algorithms
-the use of polynomial basis representation for two integers $a$ and $b$ as $f(x) = \sum_{i=0}^{n} a_i x^i$ and  
-$g(x) = \sum_{i=0}^{n} b_i x^i$ respectively, is required.  In this system both $f(x)$ and $g(x)$ have $n + 1$ terms and are of the $n$'th degree.
- 
-The product $a \cdot b \equiv f(x)g(x)$ is the polynomial $W(x) = \sum_{i=0}^{2n} w_i x^i$.  The coefficients $w_i$ will
-directly yield the desired product when $\beta$ is substituted for $x$.  The direct solution to solve for the $2n + 1$ coefficients
-requires $O(n^2)$ time and would in practice be slower than the Comba technique.
-
-However, numerical analysis theory indicates that only $2n + 1$ distinct points in $W(x)$ are required to determine the values of the $2n + 1$ unknown 
-coefficients.   This means by finding $\zeta_y = W(y)$ for $2n + 1$ small values of $y$ the coefficients of $W(x)$ can be found with 
-Gaussian elimination.  This technique is also occasionally referred to as the \textit{interpolation technique} (\textit{references please...}) since in 
-effect an interpolation based on $2n + 1$ points will yield a polynomial equivalent to $W(x)$.  
-
-The coefficients of the polynomial $W(x)$ are unknown which makes finding $W(y)$ for any value of $y$ impossible.  However, since 
-$W(x) = f(x)g(x)$ the equivalent $\zeta_y = f(y) g(y)$ can be used in its place.  The benefit of this technique stems from the 
-fact that $f(y)$ and $g(y)$ are much smaller than either $a$ or $b$ respectively.  As a result finding the $2n + 1$ relations required 
-by multiplying $f(y)g(y)$ involves multiplying integers that are much smaller than either of the inputs.
-
-When picking points to gather relations there are always three obvious points to choose, $y = 0, 1$ and $ \infty$.  The $\zeta_0$ term
-is simply the product $W(0) = w_0 = a_0 \cdot b_0$.  The $\zeta_1$ term is the product 
-$W(1) = \left (\sum_{i = 0}^{n} a_i \right ) \left (\sum_{i = 0}^{n} b_i \right )$.  The third point $\zeta_{\infty}$ is less obvious but rather
-simple to explain.  The $2n + 1$'th coefficient of $W(x)$ is numerically equivalent to the most significant column in an integer multiplication.  
-The point at $\infty$ is used symbolically to represent the most significant column, that is $W(\infty) = w_{2n} = a_nb_n$.  Note that the 
-points at $y = 0$ and $\infty$ yield the coefficients $w_0$ and $w_{2n}$ directly.
-
-If more points are required they should be of small values and powers of two such as $2^q$ and the related \textit{mirror points} 
-$\left (2^q \right )^{2n}  \cdot \zeta_{2^{-q}}$ for small values of $q$.  The term ``mirror point'' stems from the fact that 
-$\left (2^q \right )^{2n}  \cdot \zeta_{2^{-q}}$ can be calculated in the exact opposite fashion as $\zeta_{2^q}$.  For
-example, when $n = 2$ and $q = 1$ the following two equations are equivalent to the point $\zeta_{2}$ and its mirror.
-
-\begin{eqnarray}
-\zeta_{2}                  = f(2)g(2) = (4a_2 + 2a_1 + a_0)(4b_2 + 2b_1 + b_0) \nonumber \\
-16 \cdot \zeta_{1 \over 2} = 4f({1\over 2}) \cdot 4g({1 \over 2}) = (a_2 + 2a_1 + 4a_0)(b_2 + 2b_1 + 4b_0)
-\end{eqnarray}
-
-Using such points will allow the values of $f(y)$ and $g(y)$ to be independently calculated using only left shifts.  For example, when $n = 2$ the
-polynomial $f(2^q)$ is equal to $2^q((2^qa_2) + a_1) + a_0$.  This technique of polynomial representation is known as Horner's method.  
-
-As a general rule of the algorithm when the inputs are split into $n$ parts each there are $2n - 1$ multiplications.  Each multiplication is of 
-multiplicands that have $n$ times fewer digits than the inputs.  The asymptotic running time of this algorithm is 
-$O \left ( k^{lg_n(2n - 1)} \right )$ for $k$ digit inputs (\textit{assuming they have the same number of digits}).  Figure~\ref{fig:exponent}
-summarizes the exponents for various values of $n$.
-
-\begin{figure}
-\begin{center}
-\begin{tabular}{|c|c|c|}
-\hline \textbf{Split into $n$ Parts} & \textbf{Exponent}  & \textbf{Notes}\\
-\hline $2$ & $1.584962501$ & This is Karatsuba Multiplication. \\
-\hline $3$ & $1.464973520$ & This is Toom-Cook Multiplication. \\
-\hline $4$ & $1.403677461$ &\\
-\hline $5$ & $1.365212389$ &\\
-\hline $10$ & $1.278753601$ &\\
-\hline $100$ & $1.149426538$ &\\
-\hline $1000$ & $1.100270931$ &\\
-\hline $10000$ & $1.075252070$ &\\
-\hline
-\end{tabular}
-\end{center}
-\caption{Asymptotic Running Time of Polynomial Basis Multiplication}
-\label{fig:exponent}
-\end{figure}
-
-At first it may seem like a good idea to choose $n = 1000$ since the exponent is approximately $1.1$.  However, the overhead
-of solving for the $2n - 1 = 1999$ terms of $W(x)$ will certainly consume any savings the algorithm could offer for all but exceedingly large
-numbers.  
-
-\subsubsection{Cutoff Point}
-The polynomial basis multiplication algorithms all require fewer single precision multiplications than a straight Comba approach.  However, 
-the algorithms incur an overhead (\textit{at the $O(n)$ work level}) since they require a system of equations to be solved.  This makes the
-polynomial basis approach more costly to use with small inputs.
-
-Let $m$ represent the number of digits in the multiplicands (\textit{assume both multiplicands have the same number of digits}).  There exists a 
-point $y$ such that when $m < y$ the polynomial basis algorithms are more costly than Comba, when $m = y$ they are roughly the same cost and 
-when $m > y$ the Comba methods are slower than the polynomial basis algorithms.  
-
-The exact location of $y$ depends on several key architectural elements of the computer platform in question.
-
-\begin{enumerate}
-\item  The ratio of clock cycles for single precision multiplication versus other simpler operations such as addition, shifting, etc.  For example
-on the AMD Athlon the ratio is roughly $17 : 1$ while on the Intel P4 it is $29 : 1$.  The higher the ratio in favour of multiplication the lower
-the cutoff point $y$ will be.  
-
-\item  The complexity of the linear system of equations (\textit{for the coefficients of $W(x)$}).  Generally speaking as the number of splits
-grows the complexity grows substantially.  Ideally solving the system will only involve addition, subtraction and shifting of integers.  This
-directly reflects on the ratio previously mentioned.
-
-\item  To a lesser extent memory bandwidth and function call overheads.  Provided the values are in the processor cache this is less of an
-influence over the cutoff point.
-
-\end{enumerate}
-
-A clean cutoff point separation occurs when a point $y$ is found such that all of the cutoff point conditions are met.  For example, if the point
-is too low then there will be values of $m$ such that $m > y$ and the Comba method is still faster.  Finding the cutoff points is fairly simple when
-a high resolution timer is available.  
-
-\subsection{Karatsuba Multiplication}
-Karatsuba \cite{KARA} multiplication when originally proposed in 1962 was among the first set of algorithms to break the $O(n^2)$ barrier for
-general purpose multiplication.  Given two polynomial basis representations $f(x) = ax + b$ and $g(x) = cx + d$, Karatsuba proved with 
-light algebra \cite{KARAP} that the following polynomial is equivalent to multiplication of the two integers the polynomials represent.
-
-\begin{equation}
-f(x) \cdot g(x) = acx^2 + (ac + bd - (a - b)(c - d))x + bd
-\end{equation}
-
-Using the observation that $ac$ and $bd$ could be re-used only three half sized multiplications would be required to produce the product.  Applying
-this algorithm recursively, the work factor becomes $O(n^{lg(3)})$ which is substantially better than the work factor $O(n^2)$ of the Comba technique.  It turns 
-out what Karatsuba did not know or at least did not publish was that this is simply polynomial basis multiplication with the points 
-$\zeta_0$, $\zeta_{\infty}$ and $-\zeta_{-1}$.  Consider the resultant system of equations.
-
-\begin{center}
-\begin{tabular}{rcrcrcrc}
-$\zeta_{0}$ &      $=$ &  &  &  & & $w_0$ \\
-$-\zeta_{-1}$ &    $=$ & $-w_2$ & $+$ & $w_1$ & $-$ & $w_0$ \\
-$\zeta_{\infty}$ & $=$ & $w_2$ &  & &  & \\
-\end{tabular}
-\end{center}
-
-By adding the first and last equation to the equation in the middle the term $w_1$ can be isolated and all three coefficients solved for.  The simplicity
-of this system of equations has made Karatsuba fairly popular.  In fact the cutoff point is often fairly low\footnote{With LibTomMath 0.18 it is 70 and 109 digits for the Intel P4 and AMD Athlon respectively.}
-making it an ideal algorithm to speed up certain public key cryptosystems such as RSA and Diffie-Hellman.  It is worth noting that the point 
-$\zeta_1$ could be substituted for $-\zeta_{-1}$.  In this case the first and third row are subtracted instead of added to the second row.  
-
-\newpage\begin{figure}[!here]
-\begin{small}
-\begin{center}
-\begin{tabular}{l}
-\hline Algorithm \textbf{mp\_karatsuba\_mul}. \\
-\textbf{Input}.   mp\_int $a$ and mp\_int $b$ \\
-\textbf{Output}.  $c \leftarrow \vert a \vert \cdot \vert b \vert$ \\
-\hline \\
-1.  Init the following mp\_int variables: $x0$, $x1$, $y0$, $y1$, $t1$, $x0y0$, $x1y1$.\\
-2.  If step 1 failed then return(\textit{MP\_MEM}). \\
-\\
-Split the input.  e.g. $a = x1 \cdot \beta^B + x0$ \\
-3.  $B \leftarrow \mbox{min}(a.used, b.used)/2$ \\
-4.  $x0 \leftarrow a \mbox{ (mod }\beta^B\mbox{)}$ (\textit{mp\_mod\_2d}) \\
-5.  $y0 \leftarrow b \mbox{ (mod }\beta^B\mbox{)}$ \\
-6.  $x1 \leftarrow \lfloor a / \beta^B \rfloor$ (\textit{mp\_rshd}) \\
-7.  $y1 \leftarrow \lfloor b / \beta^B \rfloor$ \\
-\\
-Calculate the three products. \\
-8.  $x0y0 \leftarrow x0 \cdot y0$ (\textit{mp\_mul}) \\
-9.  $x1y1 \leftarrow x1 \cdot y1$ \\
-10.  $t1 \leftarrow x1 - x0$ (\textit{mp\_sub}) \\
-11.  $x0 \leftarrow y1 - y0$ \\
-12.  $t1 \leftarrow t1 \cdot x0$ \\
-\\
-Calculate the middle term. \\
-13.  $x0 \leftarrow x0y0 + x1y1$ \\
-14.  $t1 \leftarrow x0 - t1$ \\
-\\
-Calculate the final product. \\
-15.  $t1 \leftarrow t1 \cdot \beta^B$ (\textit{mp\_lshd}) \\
-16.  $x1y1 \leftarrow x1y1 \cdot \beta^{2B}$ \\
-17.  $t1 \leftarrow x0y0 + t1$ \\
-18.  $c \leftarrow t1 + x1y1$ \\
-19.  Clear all of the temporary variables. \\
-20.  Return(\textit{MP\_OKAY}).\\
-\hline 
-\end{tabular}
-\end{center}
-\end{small}
-\caption{Algorithm mp\_karatsuba\_mul}
-\end{figure}
-
-\textbf{Algorithm mp\_karatsuba\_mul.}
-This algorithm computes the unsigned product of two inputs using the Karatsuba multiplication algorithm.  It is loosely based on the description
-from Knuth \cite[pp. 294-295]{TAOCPV2}.  
-
-\index{radix point}
-In order to split the two inputs into their respective halves, a suitable \textit{radix point} must be chosen.  The radix point chosen must
-be used for both of the inputs meaning that it must be smaller than the smallest input.  Step 3 chooses the radix point $B$ as half of the 
-smallest input \textbf{used} count.  After the radix point is chosen the inputs are split into lower and upper halves.  Steps 4 and 5 
-compute the lower halves.  Steps 6 and 7 compute the upper halves.  
-
-After the halves have been computed the three intermediate half-size products must be computed.  Step 8 and 9 compute the trivial products
-$x0 \cdot y0$ and $x1 \cdot y1$.  The mp\_int $x0$ is used as a temporary variable after $x1 - x0$ has been computed.  By using $x0$ instead
-of an additional temporary variable, the algorithm can avoid an additional memory allocation operation.
-
-The remaining steps 13 through 18 compute the Karatsuba polynomial through a variety of digit shifting and addition operations.
-
-\index{bn\_mp\_karatsuba\_mul.c}
-\vspace{+3mm}\begin{small}
-\hspace{-5.1mm}{\bf File}: bn\_mp\_karatsuba\_mul.c
-\vspace{-3mm}
-\begin{alltt}
-016   
-017   /* c = |a| * |b| using Karatsuba Multiplication using 
-018    * three half size multiplications
-019    *
-020    * Let B represent the radix [e.g. 2**DIGIT_BIT] and 
-021    * let n represent half of the number of digits in 
-022    * the min(a,b)
-023    *
-024    * a = a1 * B**n + a0
-025    * b = b1 * B**n + b0
-026    *
-027    * Then, a * b => 
-028      a1b1 * B**2n + (a0b0 + a1b1 - (a1 - a0)(b1 - b0)) * B**n + a0b0
-029    *
-030    * Note that a1b1 and a0b0 are used twice and only need to be 
-031    * computed once.  So in total three half size (half # of 
-032    * digit) multiplications are performed, a0b0, a1b1 and 
-033    * (a1-a0)(b1-b0)
-034    *
-035    * Note that a multiplication of half the digits requires
-036    * 1/4th the number of single precision multiplications so in 
-037    * total after one call 25% of the single precision multiplications 
-038    * are saved.  Note also that the call to mp_mul can end up back 
-039    * in this function if the a0, a1, b0, or b1 are above the threshold.  
-040    * This is known as divide-and-conquer and leads to the famous 
-041    * O(N**lg(3)) or O(N**1.584) work which is asymptotically lower than 
-042    * the standard O(N**2) that the baseline/comba methods use.  
-043    * Generally though the overhead of this method doesn't pay off 
-044    * until a certain size (N ~ 80) is reached.
-045    */
-046   int
-047   mp_karatsuba_mul (mp_int * a, mp_int * b, mp_int * c)
-048   \{
-049     mp_int  x0, x1, y0, y1, t1, x0y0, x1y1;
-050     int     B, err;
-051   
-052     /* default the return code to an error */
-053     err = MP_MEM;
-054   
-055     /* min # of digits */
-056     B = MIN (a->used, b->used);
-057   
-058     /* now divide in two */
-059     B = B / 2;
-060   
-061     /* init copy all the temps */
-062     if (mp_init_size (&x0, B) != MP_OKAY)
-063       goto ERR;
-064     if (mp_init_size (&x1, a->used - B) != MP_OKAY)
-065       goto X0;
-066     if (mp_init_size (&y0, B) != MP_OKAY)
-067       goto X1;
-068     if (mp_init_size (&y1, b->used - B) != MP_OKAY)
-069       goto Y0;
-070   
-071     /* init temps */
-072     if (mp_init_size (&t1, B * 2) != MP_OKAY)
-073       goto Y1;
-074     if (mp_init_size (&x0y0, B * 2) != MP_OKAY)
-075       goto T1;
-076     if (mp_init_size (&x1y1, B * 2) != MP_OKAY)
-077       goto X0Y0;
-078   
-079     /* now shift the digits */
-080     x0.sign = x1.sign = a->sign;
-081     y0.sign = y1.sign = b->sign;
-082   
-083     x0.used = y0.used = B;
-084     x1.used = a->used - B;
-085     y1.used = b->used - B;
-086   
-087     \{
-088       register int x;
-089       register mp_digit *tmpa, *tmpb, *tmpx, *tmpy;
-090   
-091       /* we copy the digits directly instead of using higher level functions
-092        * since we also need to shift the digits
-093        */
-094       tmpa = a->dp;
-095       tmpb = b->dp;
-096   
-097       tmpx = x0.dp;
-098       tmpy = y0.dp;
-099       for (x = 0; x < B; x++) \{
-100         *tmpx++ = *tmpa++;
-101         *tmpy++ = *tmpb++;
-102       \}
-103   
-104       tmpx = x1.dp;
-105       for (x = B; x < a->used; x++) \{
-106         *tmpx++ = *tmpa++;
-107       \}
-108   
-109       tmpy = y1.dp;
-110       for (x = B; x < b->used; x++) \{
-111         *tmpy++ = *tmpb++;
-112       \}
-113     \}
-114   
-115     /* only need to clamp the lower words since by definition the 
-116      * upper words x1/y1 must have a known number of digits
-117      */
-118     mp_clamp (&x0);
-119     mp_clamp (&y0);
-120   
-121     /* now calc the products x0y0 and x1y1 */
-122     /* after this x0 is no longer required, free temp [x0==t2]! */
-123     if (mp_mul (&x0, &y0, &x0y0) != MP_OKAY)  
-124       goto X1Y1;          /* x0y0 = x0*y0 */
-125     if (mp_mul (&x1, &y1, &x1y1) != MP_OKAY)
-126       goto X1Y1;          /* x1y1 = x1*y1 */
-127   
-128     /* now calc x1-x0 and y1-y0 */
-129     if (mp_sub (&x1, &x0, &t1) != MP_OKAY)
-130       goto X1Y1;          /* t1 = x1 - x0 */
-131     if (mp_sub (&y1, &y0, &x0) != MP_OKAY)
-132       goto X1Y1;          /* t2 = y1 - y0 */
-133     if (mp_mul (&t1, &x0, &t1) != MP_OKAY)
-134       goto X1Y1;          /* t1 = (x1 - x0) * (y1 - y0) */
-135   
-136     /* add x0y0 */
-137     if (mp_add (&x0y0, &x1y1, &x0) != MP_OKAY)
-138       goto X1Y1;          /* t2 = x0y0 + x1y1 */
-139     if (mp_sub (&x0, &t1, &t1) != MP_OKAY)
-140       goto X1Y1;          /* t1 = x0y0 + x1y1 - (x1-x0)*(y1-y0) */
-141   
-142     /* shift by B */
-143     if (mp_lshd (&t1, B) != MP_OKAY)
-144       goto X1Y1;          /* t1 = (x0y0 + x1y1 - (x1-x0)*(y1-y0))<<B */
-145     if (mp_lshd (&x1y1, B * 2) != MP_OKAY)
-146       goto X1Y1;          /* x1y1 = x1y1 << 2*B */
-147   
-148     if (mp_add (&x0y0, &t1, &t1) != MP_OKAY)
-149       goto X1Y1;          /* t1 = x0y0 + t1 */
-150     if (mp_add (&t1, &x1y1, c) != MP_OKAY)
-151       goto X1Y1;          /* t1 = x0y0 + t1 + x1y1 */
-152   
-153     /* Algorithm succeeded set the return code to MP_OKAY */
-154     err = MP_OKAY;
-155   
-156   X1Y1:mp_clear (&x1y1);
-157   X0Y0:mp_clear (&x0y0);
-158   T1:mp_clear (&t1);
-159   Y1:mp_clear (&y1);
-160   Y0:mp_clear (&y0);
-161   X1:mp_clear (&x1);
-162   X0:mp_clear (&x0);
-163   ERR:
-164     return err;
-165   \}
-\end{alltt}
-\end{small}
-
-The new coding element in this routine, not  seen in previous routines, is the usage of goto statements.  The conventional
-wisdom is that goto statements should be avoided.  This is generally true, however when every single function call can fail, it makes sense
-to handle error recovery with a single piece of code.  Lines 62 to 76 handle initializing all of the temporary variables 
-required.  Note how each of the if statements goes to a different label in case of failure.  This allows the routine to correctly free only
-the temporaries that have been successfully allocated so far.
-
-The temporary variables are all initialized using the mp\_init\_size routine since they are expected to be large.  This saves the 
-additional reallocation that would have been necessary.  Also $x0$, $x1$, $y0$ and $y1$ have to be able to hold at least their respective
-number of digits for the next section of code.
-
-The first algebraic portion of the algorithm is to split the two inputs into their halves.  However, instead of using mp\_mod\_2d and mp\_rshd
-to extract the halves, the respective code has been placed inline within the body of the function.  To initialize the halves, the \textbf{used} and 
-\textbf{sign} members are copied first.  The first for loop on line 99 copies the lower halves.  Since they are both the same magnitude it 
-is simpler to calculate both lower halves in a single loop.  The for loops on lines 105 and 110 calculate the upper halves $x1$ and 
-$y1$ respectively.
-
-By inlining the calculation of the halves, the Karatsuba multiplier has a slightly lower overhead and can be used for smaller magnitude inputs.
-
-When line 154 is reached, the algorithm has completed successfully.  The ``error status'' variable $err$ is set to \textbf{MP\_OKAY} so that
-the same code that handles errors can be used to clear the temporary variables and return.  
-
-\subsection{Toom-Cook $3$-Way Multiplication}
-Toom-Cook $3$-Way \cite{TOOM} multiplication is essentially the polynomial basis algorithm for $n = 3$ except that the points  are 
-chosen such that $\zeta$ is easy to compute and the resulting system of equations easy to reduce.  Here, the points $\zeta_{0}$, 
-$16 \cdot \zeta_{1 \over 2}$, $\zeta_1$, $\zeta_2$ and $\zeta_{\infty}$ make up the five required points to solve for the coefficients 
-of $W(x)$.
-
-With the five relations that Toom-Cook specifies, the following system of equations is formed.
-
-\begin{center}
-\begin{tabular}{rcrcrcrcrcr}
-$\zeta_0$                    & $=$ & $0w_4$ & $+$ & $0w_3$ & $+$ & $0w_2$ & $+$ & $0w_1$ & $+$ & $1w_0$  \\
-$16 \cdot \zeta_{1 \over 2}$ & $=$ & $1w_4$ & $+$ & $2w_3$ & $+$ & $4w_2$ & $+$ & $8w_1$ & $+$ & $16w_0$  \\
-$\zeta_1$                    & $=$ & $1w_4$ & $+$ & $1w_3$ & $+$ & $1w_2$ & $+$ & $1w_1$ & $+$ & $1w_0$  \\
-$\zeta_2$                    & $=$ & $16w_4$ & $+$ & $8w_3$ & $+$ & $4w_2$ & $+$ & $2w_1$ & $+$ & $1w_0$  \\
-$\zeta_{\infty}$             & $=$ & $1w_4$ & $+$ & $0w_3$ & $+$ & $0w_2$ & $+$ & $0w_1$ & $+$ & $0w_0$  \\
-\end{tabular}
-\end{center}
-
-A trivial solution to this matrix requires $12$ subtractions, two multiplications by a small power of two, two divisions by a small power
-of two, two divisions by three and one multiplication by three.  All of these $19$ sub-operations require less than quadratic time, meaning that
-the algorithm can be faster than a baseline multiplication.  However, the greater complexity of this algorithm places the cutoff point
-(\textbf{TOOM\_MUL\_CUTOFF}) where Toom-Cook becomes more efficient much higher than the Karatsuba cutoff point.  
-
-\begin{figure}[!here]
-\begin{small}
-\begin{center}
-\begin{tabular}{l}
-\hline Algorithm \textbf{mp\_toom\_mul}. \\
-\textbf{Input}.   mp\_int $a$ and mp\_int $b$ \\
-\textbf{Output}.  $c \leftarrow  a  \cdot  b $ \\
-\hline \\
-Split $a$ and $b$ into three pieces.  E.g. $a = a_2 \beta^{2k} + a_1 \beta^{k} + a_0$ \\
-1.  $k \leftarrow \lfloor \mbox{min}(a.used, b.used) / 3 \rfloor$ \\
-2.  $a_0 \leftarrow a \mbox{ (mod }\beta^{k}\mbox{)}$ \\
-3.  $a_1 \leftarrow \lfloor a / \beta^k \rfloor$, $a_1 \leftarrow a_1 \mbox{ (mod }\beta^{k}\mbox{)}$ \\
-4.  $a_2 \leftarrow \lfloor a / \beta^{2k} \rfloor$, $a_2 \leftarrow a_2 \mbox{ (mod }\beta^{k}\mbox{)}$ \\
-5.  $b_0 \leftarrow b \mbox{ (mod }\beta^{k}\mbox{)}$ \\
-6.  $b_1 \leftarrow \lfloor b / \beta^k \rfloor$, $b_1 \leftarrow b_1 \mbox{ (mod }\beta^{k}\mbox{)}$ \\
-7.  $b_2 \leftarrow \lfloor b / \beta^{2k} \rfloor$, $b_2 \leftarrow b_2 \mbox{ (mod }\beta^{k}\mbox{)}$ \\
-\\
-Find the five equations for $w_0, w_1, ..., w_4$. \\
-8.  $w_0 \leftarrow a_0 \cdot b_0$ \\
-9.  $w_4 \leftarrow a_2 \cdot b_2$ \\
-10. $tmp_1 \leftarrow 2 \cdot a_0$, $tmp_1 \leftarrow a_1 + tmp_1$, $tmp_1 \leftarrow 2 \cdot tmp_1$, $tmp_1 \leftarrow tmp_1 + a_2$ \\
-11. $tmp_2 \leftarrow 2 \cdot b_0$, $tmp_2 \leftarrow b_1 + tmp_2$, $tmp_2 \leftarrow 2 \cdot tmp_2$, $tmp_2 \leftarrow tmp_2 + b_2$ \\
-12. $w_1 \leftarrow tmp_1 \cdot tmp_2$ \\
-13. $tmp_1 \leftarrow 2 \cdot a_2$, $tmp_1 \leftarrow a_1 + tmp_1$, $tmp_1 \leftarrow 2 \cdot tmp_1$, $tmp_1 \leftarrow tmp_1 + a_0$ \\
-14. $tmp_2 \leftarrow 2 \cdot b_2$, $tmp_2 \leftarrow b_1 + tmp_2$, $tmp_2 \leftarrow 2 \cdot tmp_2$, $tmp_2 \leftarrow tmp_2 + b_0$ \\
-15. $w_3 \leftarrow tmp_1 \cdot tmp_2$ \\
-16. $tmp_1 \leftarrow a_0 + a_1$, $tmp_1 \leftarrow tmp_1 + a_2$, $tmp_2 \leftarrow b_0 + b_1$, $tmp_2 \leftarrow tmp_2 + b_2$ \\
-17. $w_2 \leftarrow tmp_1 \cdot tmp_2$ \\
-\\
-Continued on the next page.\\
-\hline
-\end{tabular}
-\end{center}
-\end{small}
-\caption{Algorithm mp\_toom\_mul}
-\end{figure}
-
-\newpage\begin{figure}[!here]
-\begin{small}
-\begin{center}
-\begin{tabular}{l}
-\hline Algorithm \textbf{mp\_toom\_mul} (continued). \\
-\textbf{Input}.   mp\_int $a$ and mp\_int $b$ \\
-\textbf{Output}.  $c \leftarrow a \cdot  b $ \\
-\hline \\
-Now solve the system of equations. \\
-18. $w_1 \leftarrow w_1 - w_4$, $w_3 \leftarrow w_3 - w_0$ \\
-19. $w_1 \leftarrow \lfloor w_1 / 2 \rfloor$, $w_3 \leftarrow \lfloor w_3 / 2 \rfloor$ \\
-20. $w_2 \leftarrow w_2 - w_0$, $w_2 \leftarrow w_2 - w_4$ \\
-21. $w_1 \leftarrow w_1 - w_2$, $w_3 \leftarrow w_3 - w_2$ \\
-22. $tmp_1 \leftarrow 8 \cdot w_0$, $w_1 \leftarrow w_1 - tmp_1$, $tmp_1 \leftarrow 8 \cdot w_4$, $w_3 \leftarrow w_3 - tmp_1$ \\
-23. $w_2 \leftarrow 3 \cdot w_2$, $w_2 \leftarrow w_2 - w_1$, $w_2 \leftarrow w_2 - w_3$ \\
-24. $w_1 \leftarrow w_1 - w_2$, $w_3 \leftarrow w_3 - w_2$ \\
-25. $w_1 \leftarrow \lfloor w_1 / 3 \rfloor, w_3 \leftarrow \lfloor w_3 / 3 \rfloor$ \\
-\\
-Now substitute $\beta^k$ for $x$ by shifting $w_0, w_1, ..., w_4$. \\
-26. for $n$ from $1$ to $4$ do \\
-\hspace{3mm}26.1  $w_n \leftarrow w_n \cdot \beta^{nk}$ \\
-27. $c \leftarrow w_0 + w_1$, $c \leftarrow c + w_2$, $c \leftarrow c + w_3$, $c \leftarrow c + w_4$ \\
-28. Return(\textit{MP\_OKAY}) \\
-\hline
-\end{tabular}
-\end{center}
-\end{small}
-\caption{Algorithm mp\_toom\_mul (continued)}
-\end{figure}
-
-\textbf{Algorithm mp\_toom\_mul.}
-This algorithm computes the product of two mp\_int variables $a$ and $b$ using the Toom-Cook approach.  Compared to the Karatsuba multiplication, this 
-algorithm has a lower asymptotic running time of approximately $O(n^{1.464})$ but at an obvious cost in overhead.  In this
-description, several statements have been compounded to save space.  The intention is that the statements are executed from left to right across
-any given step.
-
-The two inputs $a$ and $b$ are first split into three $k$-digit integers $a_0, a_1, a_2$ and $b_0, b_1, b_2$ respectively.  From these smaller
-integers the coefficients of the polynomial basis representations $f(x)$ and $g(x)$ are known and can be used to find the relations required.
-
-The first two relations $w_0$ and $w_4$ are the points $\zeta_{0}$ and $\zeta_{\infty}$ respectively.  The relations $w_1, w_2$ and $w_3$ correspond
-to the points $16 \cdot \zeta_{1 \over 2}, \zeta_{1}$ and $\zeta_{2}$ respectively.  These are found using logical shifts to independently find
-$f(y)$ and $g(y)$ which significantly speeds up the algorithm.
-
-After the five relations $w_0, w_1, \ldots, w_4$ have been computed, the system they represent must be solved in order for the unknown coefficients 
-$w_1, w_2$ and $w_3$ to be isolated.  The steps 18 through 25 perform the system reduction required as previously described.  Each step of
-the reduction represents the comparable matrix operation that would be performed had this been performed by pencil.  For example, step 18 indicates
-that row $4$ must be subtracted from row $1$ and simultaneously row $0$ subtracted from row $3$.  
-
-Once the coefficients have been isolated, the polynomial $W(x) = \sum_{i=0}^{2n} w_i x^i$ is known.  By substituting $\beta^{k}$ for $x$, the integer 
-result $a \cdot b$ is produced.
-
-\index{bn\_mp\_toom\_mul.c}
-\vspace{+3mm}\begin{small}
-\hspace{-5.1mm}{\bf File}: bn\_mp\_toom\_mul.c
-\vspace{-3mm}
-\begin{alltt}
-016   
-017   /* multiplication using the Toom-Cook 3-way algorithm */
-018   int 
-019   mp_toom_mul(mp_int *a, mp_int *b, mp_int *c)
-020   \{
-021       mp_int w0, w1, w2, w3, w4, tmp1, tmp2, a0, a1, a2, b0, b1, b2;
-022       int res, B;
-023           
-024       /* init temps */
-025       if ((res = mp_init_multi(&w0, &w1, &w2, &w3, &w4, 
-026                                &a0, &a1, &a2, &b0, &b1, 
-027                                &b2, &tmp1, &tmp2, NULL)) != MP_OKAY) \{
-028          return res;
-029       \}
-030       
-031       /* B */
-032       B = MIN(a->used, b->used) / 3;
-033       
-034       /* a = a2 * B**2 + a1 * B + a0 */
-035       if ((res = mp_mod_2d(a, DIGIT_BIT * B, &a0)) != MP_OKAY) \{
-036          goto ERR;
-037       \}
-038   
-039       if ((res = mp_copy(a, &a1)) != MP_OKAY) \{
-040          goto ERR;
-041       \}
-042       mp_rshd(&a1, B);
-043       mp_mod_2d(&a1, DIGIT_BIT * B, &a1);
-044   
-045       if ((res = mp_copy(a, &a2)) != MP_OKAY) \{
-046          goto ERR;
-047       \}
-048       mp_rshd(&a2, B*2);
-049       
-050       /* b = b2 * B**2 + b1 * B + b0 */
-051       if ((res = mp_mod_2d(b, DIGIT_BIT * B, &b0)) != MP_OKAY) \{
-052          goto ERR;
-053       \}
-054   
-055       if ((res = mp_copy(b, &b1)) != MP_OKAY) \{
-056          goto ERR;
-057       \}
-058       mp_rshd(&b1, B);
-059       mp_mod_2d(&b1, DIGIT_BIT * B, &b1);
-060   
-061       if ((res = mp_copy(b, &b2)) != MP_OKAY) \{
-062          goto ERR;
-063       \}
-064       mp_rshd(&b2, B*2);
-065       
-066       /* w0 = a0*b0 */
-067       if ((res = mp_mul(&a0, &b0, &w0)) != MP_OKAY) \{
-068          goto ERR;
-069       \}
-070       
-071       /* w4 = a2 * b2 */
-072       if ((res = mp_mul(&a2, &b2, &w4)) != MP_OKAY) \{
-073          goto ERR;
-074       \}
-075       
-076       /* w1 = (a2 + 2(a1 + 2a0))(b2 + 2(b1 + 2b0)) */
-077       if ((res = mp_mul_2(&a0, &tmp1)) != MP_OKAY) \{
-078          goto ERR;
-079       \}
-080       if ((res = mp_add(&tmp1, &a1, &tmp1)) != MP_OKAY) \{
-081          goto ERR;
-082       \}
-083       if ((res = mp_mul_2(&tmp1, &tmp1)) != MP_OKAY) \{
-084          goto ERR;
-085       \}
-086       if ((res = mp_add(&tmp1, &a2, &tmp1)) != MP_OKAY) \{
-087          goto ERR;
-088       \}
-089       
-090       if ((res = mp_mul_2(&b0, &tmp2)) != MP_OKAY) \{
-091          goto ERR;
-092       \}
-093       if ((res = mp_add(&tmp2, &b1, &tmp2)) != MP_OKAY) \{
-094          goto ERR;
-095       \}
-096       if ((res = mp_mul_2(&tmp2, &tmp2)) != MP_OKAY) \{
-097          goto ERR;
-098       \}
-099       if ((res = mp_add(&tmp2, &b2, &tmp2)) != MP_OKAY) \{
-100          goto ERR;
-101       \}
-102       
-103       if ((res = mp_mul(&tmp1, &tmp2, &w1)) != MP_OKAY) \{
-104          goto ERR;
-105       \}
-106       
-107       /* w3 = (a0 + 2(a1 + 2a2))(b0 + 2(b1 + 2b2)) */
-108       if ((res = mp_mul_2(&a2, &tmp1)) != MP_OKAY) \{
-109          goto ERR;
-110       \}
-111       if ((res = mp_add(&tmp1, &a1, &tmp1)) != MP_OKAY) \{
-112          goto ERR;
-113       \}
-114       if ((res = mp_mul_2(&tmp1, &tmp1)) != MP_OKAY) \{
-115          goto ERR;
-116       \}
-117       if ((res = mp_add(&tmp1, &a0, &tmp1)) != MP_OKAY) \{
-118          goto ERR;
-119       \}
-120       
-121       if ((res = mp_mul_2(&b2, &tmp2)) != MP_OKAY) \{
-122          goto ERR;
-123       \}
-124       if ((res = mp_add(&tmp2, &b1, &tmp2)) != MP_OKAY) \{
-125          goto ERR;
-126       \}
-127       if ((res = mp_mul_2(&tmp2, &tmp2)) != MP_OKAY) \{
-128          goto ERR;
-129       \}
-130       if ((res = mp_add(&tmp2, &b0, &tmp2)) != MP_OKAY) \{
-131          goto ERR;
-132       \}
-133       
-134       if ((res = mp_mul(&tmp1, &tmp2, &w3)) != MP_OKAY) \{
-135          goto ERR;
-136       \}
-137       
-138   
-139       /* w2 = (a2 + a1 + a0)(b2 + b1 + b0) */
-140       if ((res = mp_add(&a2, &a1, &tmp1)) != MP_OKAY) \{
-141          goto ERR;
-142       \}
-143       if ((res = mp_add(&tmp1, &a0, &tmp1)) != MP_OKAY) \{
-144          goto ERR;
-145       \}
-146       if ((res = mp_add(&b2, &b1, &tmp2)) != MP_OKAY) \{
-147          goto ERR;
-148       \}
-149       if ((res = mp_add(&tmp2, &b0, &tmp2)) != MP_OKAY) \{
-150          goto ERR;
-151       \}
-152       if ((res = mp_mul(&tmp1, &tmp2, &w2)) != MP_OKAY) \{
-153          goto ERR;
-154       \}
-155       
-156       /* now solve the matrix 
-157       
-158          0  0  0  0  1
-159          1  2  4  8  16
-160          1  1  1  1  1
-161          16 8  4  2  1
-162          1  0  0  0  0
-163          
-164          using 12 subtractions, 4 shifts, 
-165                 2 small divisions and 1 small multiplication 
-166        */
-167        
-168        /* r1 - r4 */
-169        if ((res = mp_sub(&w1, &w4, &w1)) != MP_OKAY) \{
-170           goto ERR;
-171        \}
-172        /* r3 - r0 */
-173        if ((res = mp_sub(&w3, &w0, &w3)) != MP_OKAY) \{
-174           goto ERR;
-175        \}
-176        /* r1/2 */
-177        if ((res = mp_div_2(&w1, &w1)) != MP_OKAY) \{
-178           goto ERR;
-179        \}
-180        /* r3/2 */
-181        if ((res = mp_div_2(&w3, &w3)) != MP_OKAY) \{
-182           goto ERR;
-183        \}
-184        /* r2 - r0 - r4 */
-185        if ((res = mp_sub(&w2, &w0, &w2)) != MP_OKAY) \{
-186           goto ERR;
-187        \}
-188        if ((res = mp_sub(&w2, &w4, &w2)) != MP_OKAY) \{
-189           goto ERR;
-190        \}
-191        /* r1 - r2 */
-192        if ((res = mp_sub(&w1, &w2, &w1)) != MP_OKAY) \{
-193           goto ERR;
-194        \}
-195        /* r3 - r2 */
-196        if ((res = mp_sub(&w3, &w2, &w3)) != MP_OKAY) \{
-197           goto ERR;
-198        \}
-199        /* r1 - 8r0 */
-200        if ((res = mp_mul_2d(&w0, 3, &tmp1)) != MP_OKAY) \{
-201           goto ERR;
-202        \}
-203        if ((res = mp_sub(&w1, &tmp1, &w1)) != MP_OKAY) \{
-204           goto ERR;
-205        \}
-206        /* r3 - 8r4 */
-207        if ((res = mp_mul_2d(&w4, 3, &tmp1)) != MP_OKAY) \{
-208           goto ERR;
-209        \}
-210        if ((res = mp_sub(&w3, &tmp1, &w3)) != MP_OKAY) \{
-211           goto ERR;
-212        \}
-213        /* 3r2 - r1 - r3 */
-214        if ((res = mp_mul_d(&w2, 3, &w2)) != MP_OKAY) \{
-215           goto ERR;
-216        \}
-217        if ((res = mp_sub(&w2, &w1, &w2)) != MP_OKAY) \{
-218           goto ERR;
-219        \}
-220        if ((res = mp_sub(&w2, &w3, &w2)) != MP_OKAY) \{
-221           goto ERR;
-222        \}
-223        /* r1 - r2 */
-224        if ((res = mp_sub(&w1, &w2, &w1)) != MP_OKAY) \{
-225           goto ERR;
-226        \}
-227        /* r3 - r2 */
-228        if ((res = mp_sub(&w3, &w2, &w3)) != MP_OKAY) \{
-229           goto ERR;
-230        \}
-231        /* r1/3 */
-232        if ((res = mp_div_3(&w1, &w1, NULL)) != MP_OKAY) \{
-233           goto ERR;
-234        \}
-235        /* r3/3 */
-236        if ((res = mp_div_3(&w3, &w3, NULL)) != MP_OKAY) \{
-237           goto ERR;
-238        \}
-239        
-240        /* at this point shift W[n] by B*n */
-241        if ((res = mp_lshd(&w1, 1*B)) != MP_OKAY) \{
-242           goto ERR;
-243        \}
-244        if ((res = mp_lshd(&w2, 2*B)) != MP_OKAY) \{
-245           goto ERR;
-246        \}
-247        if ((res = mp_lshd(&w3, 3*B)) != MP_OKAY) \{
-248           goto ERR;
-249        \}
-250        if ((res = mp_lshd(&w4, 4*B)) != MP_OKAY) \{
-251           goto ERR;
-252        \}     
-253        
-254        if ((res = mp_add(&w0, &w1, c)) != MP_OKAY) \{
-255           goto ERR;
-256        \}
-257        if ((res = mp_add(&w2, &w3, &tmp1)) != MP_OKAY) \{
-258           goto ERR;
-259        \}
-260        if ((res = mp_add(&w4, &tmp1, &tmp1)) != MP_OKAY) \{
-261           goto ERR;
-262        \}
-263        if ((res = mp_add(&tmp1, c, c)) != MP_OKAY) \{
-264           goto ERR;
-265        \}     
-266        
-267   ERR:
-268        mp_clear_multi(&w0, &w1, &w2, &w3, &w4, 
-269                       &a0, &a1, &a2, &b0, &b1, 
-270                       &b2, &tmp1, &tmp2, NULL);
-271        return res;
-272   \}     
-273        
-\end{alltt}
-\end{small}
-
--- Comments to be added during editing phase.
-
-\subsection{Signed Multiplication}
-Now that algorithms to handle multiplications of every useful dimension have been developed, a rather simple finishing touch is required.  So far all
-of the multiplication algorithms have been unsigned multiplications which leaves only a signed multiplication algorithm to be established.  
-
-\newpage\begin{figure}[!here]
-\begin{small}
-\begin{center}
-\begin{tabular}{l}
-\hline Algorithm \textbf{mp\_mul}. \\
-\textbf{Input}.   mp\_int $a$ and mp\_int $b$ \\
-\textbf{Output}.  $c \leftarrow a \cdot b$ \\
-\hline \\
-1.  If $a.sign = b.sign$ then \\
-\hspace{3mm}1.1  $sign = MP\_ZPOS$ \\
-2.  else \\
-\hspace{3mm}2.1  $sign = MP\_NEG$ \\
-3.  If min$(a.used, b.used) \ge TOOM\_MUL\_CUTOFF$ then  \\
-\hspace{3mm}3.1  $c \leftarrow a \cdot b$ using algorithm mp\_toom\_mul \\
-4.  else if min$(a.used, b.used) \ge KARATSUBA\_MUL\_CUTOFF$ then \\
-\hspace{3mm}4.1  $c \leftarrow a \cdot b$ using algorithm mp\_karatsuba\_mul \\
-5.  else \\
-\hspace{3mm}5.1  $digs \leftarrow a.used + b.used + 1$ \\
-\hspace{3mm}5.2  If $digs < MP\_WARRAY$ and min$(a.used, b.used) \le \delta$ then \\
-\hspace{6mm}5.2.1  $c \leftarrow a \cdot b \mbox{ (mod }\beta^{digs}\mbox{)}$ using algorithm fast\_s\_mp\_mul\_digs.  \\
-\hspace{3mm}5.3  else \\
-\hspace{6mm}5.3.1  $c \leftarrow a \cdot b \mbox{ (mod }\beta^{digs}\mbox{)}$ using algorithm s\_mp\_mul\_digs.  \\
-6.  $c.sign \leftarrow sign$ \\
-7.  Return the result of the unsigned multiplication performed. \\
-\hline
-\end{tabular}
-\end{center}
-\end{small}
-\caption{Algorithm mp\_mul}
-\end{figure}
-
-\textbf{Algorithm mp\_mul.}
-This algorithm performs the signed multiplication of two inputs.  It will make use of any of the four unsigned multiplication algorithms 
-available when the input is of appropriate size.  The \textbf{sign} of the result is not set until the end of the algorithm since algorithm
-s\_mp\_mul\_digs will clear it.  
-
-\index{bn\_mp\_mul.c}
-\vspace{+3mm}\begin{small}
-\hspace{-5.1mm}{\bf File}: bn\_mp\_mul.c
-\vspace{-3mm}
-\begin{alltt}
-016   
-017   /* high level multiplication (handles sign) */
-018   int
-019   mp_mul (mp_int * a, mp_int * b, mp_int * c)
-020   \{
-021     int     res, neg;
-022     neg = (a->sign == b->sign) ? MP_ZPOS : MP_NEG;
-023     
-024     if (MIN (a->used, b->used) >= TOOM_MUL_CUTOFF) \{
-025       res = mp_toom_mul(a, b, c);
-026     \} else if (MIN (a->used, b->used) >= KARATSUBA_MUL_CUTOFF) \{
-027       res = mp_karatsuba_mul (a, b, c);
-028     \} else \{
-029   
-030       /* can we use the fast multiplier?
-031        *
-032        * The fast multiplier can be used if the output will 
-033        * have less than MP_WARRAY digits and the number of 
-034        * digits won't affect carry propagation
-035        */
-036       int     digs = a->used + b->used + 1;
-037   
-038       if ((digs < MP_WARRAY) &&
-039           MIN(a->used, b->used) <= 
-040           (1 << ((CHAR_BIT * sizeof (mp_word)) - (2 * DIGIT_BIT)))) \{
-041         res = fast_s_mp_mul_digs (a, b, c, digs);
-042       \} else \{
-043         res = s_mp_mul (a, b, c);
-044       \}
-045   
-046     \}
-047     c->sign = neg;
-048     return res;
-049   \}
-\end{alltt}
-\end{small}
-
-The implementation is rather simplistic and is not particularly noteworthy.  Line 22 computes the sign of the result using the ``?'' 
-operator from the C programming language.  Line 40 computes $\delta$ using the fact that $1 << k$ is equal to $2^k$.  
-
-\section{Squaring}
-
-Squaring is a special case of multiplication where both multiplicands are equal.  At first it may seem like there is no significant optimization
-available but in fact there is.  Consider the multiplication of $576$ against $241$.  In total there will be nine single precision multiplications
-performed which are $1\cdot 6$, $1 \cdot 7$, $1 \cdot 5$, $4 \cdot 6$, $4 \cdot 7$, $4 \cdot 5$, $2 \cdot  6$, $2 \cdot 7$ and $2 \cdot 5$.  Now consider 
-the multiplication of $123$ against $123$.  The nine products are $3 \cdot 3$, $3 \cdot 2$, $3 \cdot 1$, $2 \cdot 3$, $2 \cdot 2$, $2 \cdot 1$, 
-$1 \cdot 3$, $1 \cdot 2$ and $1 \cdot 1$.  On closer inspection some of the products are equivalent.  For example, $3 \cdot 2 = 2 \cdot 3$ 
-and $3 \cdot 1 = 1 \cdot 3$. 
-
-For any $n$-digit input, there are ${{\left (n^2 + n \right)}\over 2}$ possible unique single precision multiplications required compared to the $n^2$
-required for multiplication.  The following diagram gives an example of the operations required.
-
-\begin{figure}[here]
-\begin{center}
-\begin{tabular}{ccccc|c}
-&&1&2&3&\\
-$\times$ &&1&2&3&\\
-\hline && $3 \cdot 1$ & $3 \cdot 2$ & $3 \cdot 3$ & Row 0\\
-       & $2 \cdot 1$  & $2 \cdot 2$ & $2 \cdot 3$ && Row 1 \\
-         $1 \cdot 1$  & $1 \cdot 2$ & $1 \cdot 3$ &&& Row 2 \\
-\end{tabular}
-\end{center}
-\caption{Squaring Optimization Diagram}
-\end{figure}
-
-Starting from zero and numbering the columns from right to left a very simple pattern becomes obvious.  For the purposes of this discussion let $x$
-represent the number being squared.  The first observation is that in row $k$ the $2k$'th column of the product has a $\left (x_k \right)^2$ term in it.  
-
-The second observation is that every column $j$ in row $k$ where $j \ne 2k$ is part of a double product.  Every non-square term of a column will
-appear twice hence the name ``double product''.  Every odd column is made up entirely of double products.  In fact every column is made up of double 
-products and at most one square (\textit{see the exercise section}).  
-
-The third and final observation is that for row $k$ the first unique non-square term, that is, one that hasn't already appeared in an earlier row, 
-occurs at column $2k + 1$.  For example, on row $1$ of the previous squaring, column one is part of the double product with column one from row zero. 
-Column two of row one is a square and column three is the first unique column.
-
-\subsection{The Baseline Squaring Algorithm}
-The baseline squaring algorithm is meant to be a catch-all squaring algorithm.  It will handle any of the input sizes that the faster routines
-will not handle.  
-
-\newpage\begin{figure}[!here]
-\begin{small}
-\begin{center}
-\begin{tabular}{l}
-\hline Algorithm \textbf{s\_mp\_sqr}. \\
-\textbf{Input}.   mp\_int $a$ \\
-\textbf{Output}.  $b \leftarrow a^2$ \\
-\hline \\
-1.  Init a temporary mp\_int of at least $2 \cdot a.used +1$ digits.  (\textit{mp\_init\_size}) \\
-2.  If step 1 failed return(\textit{MP\_MEM}) \\
-3.  $t.used \leftarrow 2 \cdot a.used + 1$ \\
-4.  For $ix$ from 0 to $a.used - 1$ do \\
-\hspace{3mm}Calculate the square. \\
-\hspace{3mm}4.1  $\hat r \leftarrow t_{2ix} + \left (a_{ix} \right )^2$ \\
-\hspace{3mm}4.2  $t_{2ix} \leftarrow \hat r \mbox{ (mod }\beta\mbox{)}$ \\
-\hspace{3mm}Calculate the double products after the square. \\
-\hspace{3mm}4.3  $u \leftarrow \lfloor \hat r / \beta \rfloor$ \\
-\hspace{3mm}4.4  For $iy$ from $ix + 1$ to $a.used - 1$ do \\
-\hspace{6mm}4.4.1  $\hat r \leftarrow 2 \cdot a_{ix}a_{iy} + t_{ix + iy} + u$ \\
-\hspace{6mm}4.4.2  $t_{ix + iy} \leftarrow \hat r \mbox{ (mod }\beta\mbox{)}$ \\
-\hspace{6mm}4.4.3  $u \leftarrow \lfloor \hat r / \beta \rfloor$ \\
-\hspace{3mm}Set the last carry. \\
-\hspace{3mm}4.5  While $u > 0$ do \\
-\hspace{6mm}4.5.1  $iy \leftarrow iy + 1$ \\
-\hspace{6mm}4.5.2  $\hat r \leftarrow t_{ix + iy} + u$ \\
-\hspace{6mm}4.5.3  $t_{ix + iy} \leftarrow \hat r \mbox{ (mod }\beta\mbox{)}$ \\
-\hspace{6mm}4.5.4  $u \leftarrow \lfloor \hat r / \beta \rfloor$ \\
-5.  Clamp excess digits of $t$.  (\textit{mp\_clamp}) \\
-6.  Exchange $b$ and $t$. \\
-7.  Clear $t$ (\textit{mp\_clear}) \\
-8.  Return(\textit{MP\_OKAY}) \\
-\hline
-\end{tabular}
-\end{center}
-\end{small}
-\caption{Algorithm s\_mp\_sqr}
-\end{figure}
-
-\textbf{Algorithm s\_mp\_sqr.}
-This algorithm computes the square of an input using the three observations on squaring.  It is based fairly faithfully on  algorithm 14.16 of HAC
-\cite[pp.596-597]{HAC}.  Similar to algorithm s\_mp\_mul\_digs, a temporary mp\_int is allocated to hold the result of the squaring.  This allows the 
-destination mp\_int to be the same as the source mp\_int.
-
-The outer loop of this algorithm begins on step 4. It is best to think of the outer loop as walking down the rows of the partial results, while
-the inner loop computes the columns of the partial result.  Step 4.1 and 4.2 compute the square term for each row, and step 4.3 and 4.4 propagate
-the carry and compute the double products.  
-
-The requirement that a mp\_word be able to represent the range $0 \le x < 2 \beta^2$ arises from this
-very algorithm.  The product $a_{ix}a_{iy}$ will lie in the range $0 \le x \le \beta^2 - 2\beta + 1$ which is obviously less than $\beta^2$ meaning that
-when it is multiplied by two, it can be properly represented by a mp\_word.
-
-Similar to algorithm s\_mp\_mul\_digs, after every pass of the inner loop, the destination is correctly set to the sum of all of the partial 
-results calculated so far.  This involves expensive carry propagation which will be eliminated in the next algorithm.  
-
-\index{bn\_s\_mp\_sqr.c}
-\vspace{+3mm}\begin{small}
-\hspace{-5.1mm}{\bf File}: bn\_s\_mp\_sqr.c
-\vspace{-3mm}
-\begin{alltt}
-016   
-017   /* low level squaring, b = a*a, HAC pp.596-597, Algorithm 14.16 */
-018   int
-019   s_mp_sqr (mp_int * a, mp_int * b)
-020   \{
-021     mp_int  t;
-022     int     res, ix, iy, pa;
-023     mp_word r;
-024     mp_digit u, tmpx, *tmpt;
-025   
-026     pa = a->used;
-027     if ((res = mp_init_size (&t, 2*pa + 1)) != MP_OKAY) \{
-028       return res;
-029     \}
-030     t.used = 2*pa + 1;
-031   
-032     for (ix = 0; ix < pa; ix++) \{
-033       /* first calculate the digit at 2*ix */
-034       /* calculate double precision result */
-035       r = ((mp_word) t.dp[2*ix]) + 
-036           ((mp_word) a->dp[ix]) * ((mp_word) a->dp[ix]);
-037   
-038       /* store lower part in result */
-039       t.dp[2*ix] = (mp_digit) (r & ((mp_word) MP_MASK));
-040   
-041       /* get the carry */
-042       u = (r >> ((mp_word) DIGIT_BIT));
-043   
-044       /* left hand side of A[ix] * A[iy] */
-045       tmpx = a->dp[ix];
-046   
-047       /* alias for where to store the results */
-048       tmpt = t.dp + (2*ix + 1);
-049       
-050       for (iy = ix + 1; iy < pa; iy++) \{
-051         /* first calculate the product */
-052         r = ((mp_word) tmpx) * ((mp_word) a->dp[iy]);
-053   
-054         /* now calculate the double precision result, note we use
-055          * addition instead of *2 since it's easier to optimize
-056          */
-057         r = ((mp_word) * tmpt) + r + r + ((mp_word) u);
-058   
-059         /* store lower part */
-060         *tmpt++ = (mp_digit) (r & ((mp_word) MP_MASK));
-061   
-062         /* get carry */
-063         u = (r >> ((mp_word) DIGIT_BIT));
-064       \}
-065       /* propagate upwards */
-066       while (u != ((mp_digit) 0)) \{
-067         r = ((mp_word) * tmpt) + ((mp_word) u);
-068         *tmpt++ = (mp_digit) (r & ((mp_word) MP_MASK));
-069         u = (r >> ((mp_word) DIGIT_BIT));
-070       \}
-071     \}
-072   
-073     mp_clamp (&t);
-074     mp_exch (&t, b);
-075     mp_clear (&t);
-076     return MP_OKAY;
-077   \}
-\end{alltt}
-\end{small}
-
-Inside the outer loop (\textit{see line 32}) the square term is calculated on line 35.  Line 42 extracts the carry from the square
-term.  Aliases for $a_{ix}$ and $t_{ix+iy}$ are initialized on lines 45 and 48 respectively.  The doubling is performed using two
-additions (\textit{see line 57}) since it is usually faster than shifting, if not at least as fast.  
-
-\subsection{Faster Squaring by the ``Comba'' Method}
-A major drawback to the baseline method is the requirement for single precision shifting inside the $O(n^2)$ nested loop.  Squaring has an additional
-drawback that it must double the product inside the inner loop as well.  As for multiplication, the Comba technique can be used to eliminate these
-performance hazards.
-
-The first obvious solution is to make an array of mp\_words which will hold all of the columns.  This will indeed eliminate all of the carry
-propagation operations from the inner loop.  However, the inner product must still be doubled $O(n^2)$ times.  The solution stems from the simple fact
-that $2a + 2b + 2c = 2(a + b + c)$.  That is the sum of all of the double products is equal to double the sum of all the products.  For example,
-$ab + ba + ac + ca = 2ab + 2ac = 2(ab + ac)$.  
-
-However, we cannot simply double all of the columns, since the squares appear only once per row.  The most practical solution is to have two mp\_word
-arrays.  One array will hold the squares and the other array will hold the double products.  With both arrays the doubling and carry propagation can be 
-moved to a $O(n)$ work level outside the $O(n^2)$ level.  
-
-\newpage\begin{figure}[!here]
-\begin{small}
-\begin{center}
-\begin{tabular}{l}
-\hline Algorithm \textbf{fast\_s\_mp\_sqr}. \\
-\textbf{Input}.   mp\_int $a$ \\
-\textbf{Output}.  $b \leftarrow a^2$ \\
-\hline \\
-Place two arrays of \textbf{MP\_WARRAY} mp\_words named $\hat W$ and $\hat {X}$ on the stack. \\
-1.  If $b.alloc < 2a.used + 1$ then grow $b$ to $2a.used + 1$ digits.  (\textit{mp\_grow}). \\
-2.  If step 1 failed return(\textit{MP\_MEM}). \\
-3.  for $ix$ from $0$ to $2a.used + 1$ do \\
-\hspace{3mm}3.1  $\hat W_{ix} \leftarrow 0$ \\
-\hspace{3mm}3.2  $\hat {X}_{ix} \leftarrow 0$ \\
-4.  for $ix$ from $0$ to $a.used - 1$ do \\
-\hspace{3mm}Compute the square.\\
-\hspace{3mm}4.1  $\hat {X}_{ix+ix} \leftarrow \left ( a_{ix} \right )^2$ \\
-\\
-\hspace{3mm}Compute the double products.\\
-\hspace{3mm}4.2  for $iy$ from $ix + 1$ to $a.used - 1$ do \\
-\hspace{6mm}4.2.1  $\hat W_{ix+iy} \leftarrow \hat W_{ix+iy} + a_{ix}a_{iy}$ \\
-5.  $oldused \leftarrow b.used$ \\
-6.  $b.used \leftarrow 2a.used + 1$ \\
-\\
-Double the products and propagate the carries simultaneously. \\
-7.  $\hat W_0 \leftarrow 2 \hat W_0 + \hat {X}_0$ \\
-8.  for $ix$ from $1$ to $2a.used$ do \\
-\hspace{3mm}8.1 $\hat W_{ix} \leftarrow 2 \hat W_{ix} + \hat {X}_{ix}$ \\
-\hspace{3mm}8.2 $\hat W_{ix} \leftarrow \hat W_{ix} + \lfloor \hat W_{ix - 1} / \beta \rfloor$ \\
-\hspace{3mm}8.3 $b_{ix-1} \leftarrow \hat W_{ix-1} \mbox{ (mod }\beta\mbox{)}$ \\
-9.  $b_{2a.used} \leftarrow \hat W_{2a.used} \mbox{ (mod }\beta\mbox{)}$ \\
-10.  if $2a.used + 1 < oldused$ then do \\
-\hspace{3mm}10.1  for $ix$ from $2a.used + 1$ to $oldused$ do \\
-\hspace{6mm}10.1.1  $b_{ix} \leftarrow 0$ \\
-11.  Clamp excess digits from $b$.  (\textit{mp\_clamp}) \\
-12.  Return(\textit{MP\_OKAY}). \\ 
-\hline
-\end{tabular}
-\end{center}
-\end{small}
-\caption{Algorithm fast\_s\_mp\_sqr}
-\end{figure}
-
-\textbf{Algorithm fast\_s\_mp\_sqr.}
-This algorithm computes the square of an input using the Comba technique.  It is designed to be a replacement for algorithm s\_mp\_sqr when
-the number of input digits is less than \textbf{MP\_WARRAY} and less than $\delta \over 2$.  
-
-This routine requires two arrays of mp\_words to be placed on the stack.  The first array $\hat W$ will hold the double products and the second
-array $\hat X$ will hold the squares.  Though only at most $MP\_WARRAY \over 2$ words of $\hat X$ are used, it has proven faster on most 
-processors to simply make it a full size array.
-
-The loop on step 3 will zero the two arrays to prepare them for the squaring step.  Step 4.1 computes the squares of the product.  Note how 
-it simply assigns the value into the $\hat X$ array.  The nested loop on step 4.2 computes the doubles of the products.  This loop
-computes the sum of the products for each column.  They are not doubled until later.
-
-After the squaring loop, the products stored in $\hat W$ must be doubled and the carries propagated forwards.  It makes sense to do both
-operations at the same time.  The expression $\hat W_{ix} \leftarrow 2 \hat W_{ix} + \hat {X}_{ix}$ computes the sum of the double product and the
-squares in place.  
-
-\index{bn\_fast\_s\_mp\_sqr.c}
-\vspace{+3mm}\begin{small}
-\hspace{-5.1mm}{\bf File}: bn\_fast\_s\_mp\_sqr.c
-\vspace{-3mm}
-\begin{alltt}
-016   
-017   /* fast squaring
-018    *
-019    * This is the comba method where the columns of the product 
-020    * are computed first then the carries are computed.  This 
-021    * has the effect of making a very simple inner loop that 
-022    * is executed the most
-023    *
-024    * W2 represents the outer products and W the inner.
-025    *
-026    * A further optimizations is made because the inner 
-027    * products are of the form "A * B * 2".  The *2 part does 
-028    * not need to be computed until the end which is good 
-029    * because 64-bit shifts are slow!
-030    *
-031    * Based on Algorithm 14.16 on pp.597 of HAC.
-032    *
-033    */
-034   int
-035   fast_s_mp_sqr (mp_int * a, mp_int * b)
-036   \{
-037     int     olduse, newused, res, ix, pa;
-038     mp_word W2[MP_WARRAY], W[MP_WARRAY];
-039   
-040     /* calculate size of product and allocate as required */
-041     pa = a->used;
-042     newused = pa + pa + 1;
-043     if (b->alloc < newused) \{
-044       if ((res = mp_grow (b, newused)) != MP_OKAY) \{
-045         return res;
-046       \}
-047     \}
-048   
-049     /* zero temp buffer (columns)
-050      * Note that there are two buffers.  Since squaring requires
-051      * a outter and inner product and the inner product requires
-052      * computing a product and doubling it (a relatively expensive
-053      * op to perform n**2 times if you don't have to) the inner and
-054      * outer products are computed in different buffers.  This way
-055      * the inner product can be doubled using n doublings instead of
-056      * n**2
-057      */
-058     memset (W, 0, newused * sizeof (mp_word));
-059     memset (W2, 0, newused * sizeof (mp_word));
-060   
-061     /* This computes the inner product.  To simplify the inner N**2 loop
-062      * the multiplication by two is done afterwards in the N loop.
-063      */
-064     for (ix = 0; ix < pa; ix++) \{
-065       /* compute the outer product
-066        *
-067        * Note that every outer product is computed
-068        * for a particular column only once which means that
-069        * there is no need todo a double precision addition
-070        */
-071       W2[ix + ix] = ((mp_word) a->dp[ix]) * ((mp_word) a->dp[ix]);
-072   
-073       \{
-074         register mp_digit tmpx, *tmpy;
-075         register mp_word *_W;
-076         register int iy;
-077   
-078         /* copy of left side */
-079         tmpx = a->dp[ix];
-080   
-081         /* alias for right side */
-082         tmpy = a->dp + (ix + 1);
-083   
-084         /* the column to store the result in */
-085         _W = W + (ix + ix + 1);
-086   
-087         /* inner products */
-088         for (iy = ix + 1; iy < pa; iy++) \{
-089             *_W++ += ((mp_word) tmpx) * ((mp_word) * tmpy++);
-090         \}
-091       \}
-092     \}
-093   
-094     /* setup dest */
-095     olduse  = b->used;
-096     b->used = newused;
-097   
-098     /* now compute digits */
-099     \{
-100       register mp_digit *tmpb;
-101   
-102       /* double first value, since the inner products are 
-103        * half of what they should be 
-104        */
-105       W[0] += W[0] + W2[0];
-106   
-107       tmpb = b->dp;
-108       for (ix = 1; ix < newused; ix++) \{
-109         /* double/add next digit */
-110         W[ix] += W[ix] + W2[ix];
-111   
-112         W[ix] = W[ix] + (W[ix - 1] >> ((mp_word) DIGIT_BIT));
-113         *tmpb++ = (mp_digit) (W[ix - 1] & ((mp_word) MP_MASK));
-114       \}
-115       /* set the last value.  Note even if the carry is zero 
-116        * this is required since the next step will not zero 
-117        * it if b originally had a value at b->dp[2*a.used]
-118        */
-119       *tmpb++ = (mp_digit) (W[(newused) - 1] & ((mp_word) MP_MASK));
-120   
-121       /* clear high digits */
-122       for (; ix < olduse; ix++) \{
-123         *tmpb++ = 0;
-124       \}
-125     \}
-126   
-127     mp_clamp (b);
-128     return MP_OKAY;
-129   \}
-\end{alltt}
-\end{small}
-
--- Write something deep and insightful later, Tom.
-
-\subsection{Polynomial Basis Squaring}
-The same algorithm that performs optimal polynomial basis multiplication can be used to perform polynomial basis squaring.  The minor exception
-is that $\zeta_y = f(y)g(y)$ is actually equivalent to $\zeta_y = f(y)^2$ since $f(y) = g(y)$.  Instead of performing $2n + 1$
-multiplications to find the $\zeta$ relations, squaring operations are performed instead.  
-
-\subsection{Karatsuba Squaring}
-Let $f(x) = ax + b$ represent the polynomial basis representation of a number to square.  
-Let $h(x) = \left ( f(x) \right )^2$ represent the square of the polynomial.  The Karatsuba equation can be modified to square a 
-number with the following equation.
-
-\begin{equation}
-h(x) = a^2x^2 + \left (a^2 + b^2 - (a - b)^2 \right )x + b^2
-\end{equation}
-
-Upon closer inspection this equation only requires the calculation of three half-sized squares: $a^2$, $b^2$ and $(a - b)^2$.  As in 
-Karatsuba multiplication, this algorithm can be applied recursively on the input and will achieve an asymptotic running time of 
-$O \left ( n^{lg(3)} \right )$.
-
-You might ask yourself, if the asymptotic time of Karatsuba squaring and multiplication is the same, why not simply use the multiplication algorithm 
-instead?  The answer to this arises from the cutoff point for squaring.  As in multiplication there exists a cutoff point, at which the 
-time required for a Comba based squaring and a Karatsuba based squaring meet.  Due to the overhead inherent in the Karatsuba method, the cutoff 
-point is fairly high.  For example, on an AMD Athlon XP processor with $\beta = 2^{28}$, the cutoff point is around 127 digits.  
-
-Consider squaring a 200 digit number with this technique.  It will be split into two 100 digit halves which are subsequently squared.  
-The 100 digit halves will not be squared using Karatsuba, but instead using the faster Comba based squaring algorithm.  If Karatsuba multiplication
-were used instead, the 100 digit numbers would be squared with a slower Comba based multiplication.  
-
-\newpage\begin{figure}[!here]
-\begin{small}
-\begin{center}
-\begin{tabular}{l}
-\hline Algorithm \textbf{mp\_karatsuba\_sqr}. \\
-\textbf{Input}.   mp\_int $a$ \\
-\textbf{Output}.  $b \leftarrow a^2$ \\
-\hline \\
-1.  Initialize the following temporary mp\_ints:  $x0$, $x1$, $t1$, $t2$, $x0x0$ and $x1x1$. \\
-2.  If any of the initializations on step 1 failed return(\textit{MP\_MEM}). \\
-\\
-Split the input.  e.g. $a = x1\beta^B + x0$ \\
-3.  $B \leftarrow \lfloor a.used / 2 \rfloor$ \\
-4.  $x0 \leftarrow a \mbox{ (mod }\beta^B\mbox{)}$ (\textit{mp\_mod\_2d}) \\
-5.  $x1 \leftarrow \lfloor a / \beta^B \rfloor$ (\textit{mp\_rshd}) \\
-\\
-Calculate the three squares. \\
-6.  $x0x0 \leftarrow x0^2$ (\textit{mp\_sqr}) \\
-7.  $x1x1 \leftarrow x1^2$ \\
-8.  $t1 \leftarrow x1 - x0$ (\textit{mp\_sub}) \\
-9.  $t1 \leftarrow t1^2$ \\
-\\
-Compute the middle term. \\
-10.  $t2 \leftarrow x0x0 + x1x1$ (\textit{s\_mp\_add}) \\
-11.  $t1 \leftarrow t2 - t1$ \\
-\\
-Compute final product. \\
-12.  $t1 \leftarrow t1\beta^B$ (\textit{mp\_lshd}) \\
-13.  $x1x1 \leftarrow x1x1\beta^{2B}$ \\
-14.  $t1 \leftarrow t1 + x0x0$ \\
-15.  $b \leftarrow t1 + x1x1$ \\
-16.  Return(\textit{MP\_OKAY}). \\
-\hline
-\end{tabular}
-\end{center}
-\end{small}
-\caption{Algorithm mp\_karatsuba\_sqr}
-\end{figure}
-
-\textbf{Algorithm mp\_karatsuba\_sqr.}
-This algorithm computes the square of an input $a$ using the Karatsuba technique.  This algorithm is very similar to the Karatsuba based
-multiplication algorithm with the exception that the three half-size multiplications have been replaced with three half-size squarings.
-
-The radix point for squaring is simply placed exactly in the middle of the digits when the input has an even number of digits, otherwise it is
-placed just below the middle.  Step 3, 4 and 5 compute the two halves required using $B$
-as the radix point.  The first two squares in steps 6 and 7 are rather straightforward while the last square is of a more compact form.
-
-By expanding $\left (x1 - x0 \right )^2$, the $x1^2$ and $x0^2$ terms in the middle disappear, that is $x1^2 + x0^2 - (x1 - x0)^2 = 2 \cdot x0 \cdot x1$.
-Now if $5n$ single precision additions and a squaring of $n$-digits is faster than multiplying two $n$-digit numbers and doubling then
-this method is faster.  Assuming no further recursions occur, the difference can be estimated with the following inequality.
-
-Let $p$ represent the cost of a single precision addition and $q$ the cost of a single precision multiplication both in terms of time\footnote{Or
-machine clock cycles.}. 
-
-\begin{equation}
-5pn +{{q(n^2 + n)} \over 2} \le pn + qn^2
-\end{equation}
-
-For example, on an AMD Athlon XP processor $p = {1 \over 3}$ and $q = 6$.  This implies that the following inequality should hold.
-\begin{center}
-\begin{tabular}{rcl}
-${5n \over 3} + 3n^2 + 3n$     & $<$ & ${n \over 3} + 6n^2$ \\
-${5 \over 3} + 3n + 3$     & $<$ & ${1 \over 3} + 6n$ \\
-${13 \over 9}$     & $<$ & $n$ \\
-\end{tabular}
-\end{center}
-
-This results in a cutoff point around $n = 2$.  As a consequence it is actually faster to compute the middle term the ``long way'' on processors
-where multiplication is substantially slower\footnote{On the Athlon there is a 1:17 ratio between clock cycles for addition and multiplication.  On
-the Intel P4 processor this ratio is 1:29 making this method even more beneficial.  The only common exception is the ARMv4 processor which has a
-ratio of 1:7.  } than simpler operations such as addition.  
-
-\index{bn\_mp\_karatsuba\_sqr.c}
-\vspace{+3mm}\begin{small}
-\hspace{-5.1mm}{\bf File}: bn\_mp\_karatsuba\_sqr.c
-\vspace{-3mm}
-\begin{alltt}
-016   
-017   /* Karatsuba squaring, computes b = a*a using three 
-018    * half size squarings
-019    *
-020    * See comments of mp_karatsuba_mul for details.  It 
-021    * is essentially the same algorithm but merely 
-022    * tuned to perform recursive squarings.
-023    */
-024   int
-025   mp_karatsuba_sqr (mp_int * a, mp_int * b)
-026   \{
-027     mp_int  x0, x1, t1, t2, x0x0, x1x1;
-028     int     B, err;
-029   
-030     err = MP_MEM;
-031   
-032     /* min # of digits */
-033     B = a->used;
-034   
-035     /* now divide in two */
-036     B = B / 2;
-037   
-038     /* init copy all the temps */
-039     if (mp_init_size (&x0, B) != MP_OKAY)
-040       goto ERR;
-041     if (mp_init_size (&x1, a->used - B) != MP_OKAY)
-042       goto X0;
-043   
-044     /* init temps */
-045     if (mp_init_size (&t1, a->used * 2) != MP_OKAY)
-046       goto X1;
-047     if (mp_init_size (&t2, a->used * 2) != MP_OKAY)
-048       goto T1;
-049     if (mp_init_size (&x0x0, B * 2) != MP_OKAY)
-050       goto T2;
-051     if (mp_init_size (&x1x1, (a->used - B) * 2) != MP_OKAY)
-052       goto X0X0;
-053   
-054     \{
-055       register int x;
-056       register mp_digit *dst, *src;
-057   
-058       src = a->dp;
-059   
-060       /* now shift the digits */
-061       dst = x0.dp;
-062       for (x = 0; x < B; x++) \{
-063         *dst++ = *src++;
-064       \}
-065   
-066       dst = x1.dp;
-067       for (x = B; x < a->used; x++) \{
-068         *dst++ = *src++;
-069       \}
-070     \}
-071   
-072     x0.used = B;
-073     x1.used = a->used - B;
-074   
-075     mp_clamp (&x0);
-076   
-077     /* now calc the products x0*x0 and x1*x1 */
-078     if (mp_sqr (&x0, &x0x0) != MP_OKAY)
-079       goto X1X1;           /* x0x0 = x0*x0 */
-080     if (mp_sqr (&x1, &x1x1) != MP_OKAY)
-081       goto X1X1;           /* x1x1 = x1*x1 */
-082   
-083     /* now calc (x1-x0)**2 */
-084     if (mp_sub (&x1, &x0, &t1) != MP_OKAY)
-085       goto X1X1;           /* t1 = x1 - x0 */
-086     if (mp_sqr (&t1, &t1) != MP_OKAY)
-087       goto X1X1;           /* t1 = (x1 - x0) * (x1 - x0) */
-088   
-089     /* add x0y0 */
-090     if (s_mp_add (&x0x0, &x1x1, &t2) != MP_OKAY)
-091       goto X1X1;           /* t2 = x0x0 + x1x1 */
-092     if (mp_sub (&t2, &t1, &t1) != MP_OKAY)
-093       goto X1X1;           /* t1 = x0x0 + x1x1 - (x1-x0)*(x1-x0) */
-094   
-095     /* shift by B */
-096     if (mp_lshd (&t1, B) != MP_OKAY)
-097       goto X1X1;           /* t1 = (x0x0 + x1x1 - (x1-x0)*(x1-x0))<<B */
-098     if (mp_lshd (&x1x1, B * 2) != MP_OKAY)
-099       goto X1X1;           /* x1x1 = x1x1 << 2*B */
-100   
-101     if (mp_add (&x0x0, &t1, &t1) != MP_OKAY)
-102       goto X1X1;           /* t1 = x0x0 + t1 */
-103     if (mp_add (&t1, &x1x1, b) != MP_OKAY)
-104       goto X1X1;           /* t1 = x0x0 + t1 + x1x1 */
-105   
-106     err = MP_OKAY;
-107   
-108   X1X1:mp_clear (&x1x1);
-109   X0X0:mp_clear (&x0x0);
-110   T2:mp_clear (&t2);
-111   T1:mp_clear (&t1);
-112   X1:mp_clear (&x1);
-113   X0:mp_clear (&x0);
-114   ERR:
-115     return err;
-116   \}
-\end{alltt}
-\end{small}
-
-This implementation is largely based on the implementation of algorithm mp\_karatsuba\_mul.  It uses the same inline style to copy and 
-shift the input into the two halves.  The loop from line 54 to line 70 has been modified since only one input exists.  The \textbf{used}
-count of both $x0$ and $x1$ is fixed up and $x0$ is clamped before the calculations begin.  At this point $x1$ and $x0$ are valid equivalents
-to the respective halves as if mp\_rshd and mp\_mod\_2d had been used.  
-
-By inlining the copy and shift operations the cutoff point for Karatsuba multiplication can be lowered.  On the Athlon the cutoff point
-is exactly at the point where Comba squaring can no longer be used (\textit{128 digits}).  On slower processors such as the Intel P4
-it is actually below the Comba limit (\textit{at 110 digits}).
-
-This routine uses the same error trap coding style as mp\_karatsuba\_mul.  As the temporary variables are initialized errors are redirected to
-the error trap higher up.  If the algorithm completes without error the error code is set to \textbf{MP\_OKAY} and mp\_clears are executed normally.
-
-\textit{Last paragraph sucks.  re-write! -- Tom}
-
-\subsection{Toom-Cook Squaring}
-The Toom-Cook squaring algorithm mp\_toom\_sqr is heavily based on the algorithm mp\_toom\_mul with the exception that squarings are used
-instead of multiplication to find the five relations.  The reader is encouraged to read the description of the latter algorithm and try to 
-derive their own Toom-Cook squaring algorithm.  
-
-\subsection{High Level Squaring}
-\newpage\begin{figure}[!here]
-\begin{small}
-\begin{center}
-\begin{tabular}{l}
-\hline Algorithm \textbf{mp\_sqr}. \\
-\textbf{Input}.   mp\_int $a$ \\
-\textbf{Output}.  $b \leftarrow a^2$ \\
-\hline \\
-1.  If $a.used \ge TOOM\_SQR\_CUTOFF$ then  \\
-\hspace{3mm}1.1  $b \leftarrow a^2$ using algorithm mp\_toom\_sqr \\
-2.  else if $a.used \ge KARATSUBA\_SQR\_CUTOFF$ then \\
-\hspace{3mm}2.1  $b \leftarrow a^2$ using algorithm mp\_karatsuba\_sqr \\
-3.  else \\
-\hspace{3mm}3.1  $digs \leftarrow 2 \cdot a.used + 1$ \\
-\hspace{3mm}3.2  If $digs < MP\_WARRAY$ and $a.used < {\delta \over 2}$ then \\
-\hspace{6mm}3.2.1  $b \leftarrow a^2$ using algorithm fast\_s\_mp\_sqr.  \\
-\hspace{3mm}3.3  else \\
-\hspace{6mm}3.3.1  $b \leftarrow a^2$ using algorithm s\_mp\_sqr.  \\
-4.  $b.sign \leftarrow MP\_ZPOS$ \\
-5.  Return the result of the unsigned squaring performed. \\
-\hline
-\end{tabular}
-\end{center}
-\end{small}
-\caption{Algorithm mp\_sqr}
-\end{figure}
-
-\textbf{Algorithm mp\_sqr.}
-This algorithm computes the square of the input using one of four different algorithms.  If the input is very large and has at least
-\textbf{TOOM\_SQR\_CUTOFF} or \textbf{KARATSUBA\_SQR\_CUTOFF} digits then either the Toom-Cook or the Karatsuba Squaring algorithm is used.  If
-neither of the polynomial basis algorithms should be used then either the Comba or baseline algorithm is used.  
-
-\index{bn\_mp\_sqr.c}
-\vspace{+3mm}\begin{small}
-\hspace{-5.1mm}{\bf File}: bn\_mp\_sqr.c
-\vspace{-3mm}
-\begin{alltt}
-016   
-017   /* computes b = a*a */
-018   int
-019   mp_sqr (mp_int * a, mp_int * b)
-020   \{
-021     int     res;
-022     if (a->used >= TOOM_SQR_CUTOFF) \{
-023       res = mp_toom_sqr(a, b);
-024     \} else if (a->used >= KARATSUBA_SQR_CUTOFF) \{
-025       res = mp_karatsuba_sqr (a, b);
-026     \} else \{
-027   
-028       /* can we use the fast multiplier? */
-029       if ((a->used * 2 + 1) < MP_WARRAY && 
-030            a->used < 
-031            (1 << (sizeof(mp_word) * CHAR_BIT - 2*DIGIT_BIT - 1))) \{
-032         res = fast_s_mp_sqr (a, b);
-033       \} else \{
-034         res = s_mp_sqr (a, b);
-035       \}
-036     \}
-037     b->sign = MP_ZPOS;
-038     return res;
-039   \}
-\end{alltt}
-\end{small}
-
-\section*{Exercises}
-\begin{tabular}{cl}
-$\left [ 3 \right ] $ & Devise an efficient algorithm for selection of the radix point to handle inputs \\
-                      & that have different number of digits in Karatsuba multiplication. \\
-                      & \\
-$\left [ 3 \right ] $ & In section 6.3 the fact that every column of a squaring is made up \\
-                      & of double products and at most one square is stated.  Prove this statement. \\
-                      & \\                      
-$\left [ 2 \right ] $ & In the Comba squaring algorithm half of the $\hat X$ variables are not used. \\
-                      & Revise algorithm fast\_s\_mp\_sqr to shrink the $\hat X$ array. \\
-                      & \\
-$\left [ 3 \right ] $ & Prove the equation for Karatsuba squaring. \\
-                      & \\
-$\left [ 1 \right ] $ & Prove that Karatsuba squaring requires $O \left (n^{lg(3)} \right )$ time. \\
-                      & \\ 
-$\left [ 2 \right ] $ & Determine the minimal ratio between addition and multiplication clock cycles \\
-                      & required for equation $6.7$ to be true.  \\
-                      & \\
-\end{tabular}
-
-\chapter{Modular Reduction}
-\section{Basics of Modular Reduction}
-\index{modular residue}
-Modular reduction is an operation that arises quite often within public key cryptography algorithms and various number theoretic algorithms, 
-such as factoring.  Modular reduction algorithms are the third class of algorithms of the ``multipliers'' set.  A number $a$ is said to be reduced 
-modulo another number $b$ by finding the remainder of the division $a/b$.  
-
-Modular reduction is equivalent to solving for $r$ in the following equation.  $a = bq + r$ where $q = \lfloor a/b \rfloor$.  The result 
-$r$ is said to be ``congruent to $a$ modulo $b$'' which is also written as $r \equiv a \mbox{ (mod }b\mbox{)}$.  In other vernacular $r$ is known as the 
-``modular residue'' which leads to ``quadratic residue''\footnote{That's fancy talk for $b \equiv a^2 \mbox{ (mod }p\mbox{)}$.} and
-other forms of residues.  
-
-\index{modulus}
-Modular reductions are normally used to form finite groups such as fields and rings.  For example, in the RSA public key algorithm \cite{RSAPAPER} 
-two private primes $p$ and $q$ are chosen which when multiplied $n = pq$ forms a composite modulus.  When operations such as multiplication and
-squaring are performed on units of the ring $\Z_n$ a finite multiplicative sub-group is formed.
-
-Modular reductions have a variety of other useful properties.  For example, a number $x$ can only be a square if it is a quadratic
-residue modulo every prime.  With a finite set of primes $B = \left < p_0, p_1, \ldots, p_n \right >$ a quick test for whether $x$ is square or not can 
-be performed\footnote{Provided none of the primes from $B$ divide $x$.}.  Consider figure~\ref{fig:QR} with the candidate $x = 955621$.  A simple 
-set of modular reductions modulo $3, 5, \ldots, 11$ may reveal that $x$ is not a square.  In this case $955621 \equiv 7 \mbox{ (mod }11\mbox{)}$ 
-and since $7$ is not a quadratic residue modulo $11$ the number $955621$ is not a square.  
-
-\begin{figure}
-\begin{center}
-\begin{tabular}{|c|l|}
-\hline \textbf{Prime} & \textbf{Quadratic Residues} \\
-\hline $3$            & $1$ \\
-\hline $5$            & $1, 4$ \\
-\hline $7$            & $1, 2, 4$ \\
-\hline $11$           & $1, 3, 4, 5, 9$ \\
-\hline
-\end{tabular}
-\end{center}
-\caption{Quadratic Residues for primes less than $13$}
-\label{fig:QR}
-\end{figure}
-
-The most common usage for performance driven modular reductions is in modular exponentiation algorithms.  That is to compute 
-$d = a^b \mbox{ (mod }c\mbox{)}$ as fast as possible.  As will be discussed in the subsequent chapter there exist fast algorithms for computing
-modular exponentiations without having to perform (\textit{in this example}) $b - 1$ multiplications.  These algorithms will produce partial
-results in the range $0 \le x < c^2$ which can be taken advantage of to create several efficient algorithms.
-
-\section{The Barrett Reduction}
-The Barrett reduction algorithm \cite{BARRETT} was inspired by fast division algorithms which multiply by the reciprocal to emulate
-division.  Barrett's observation was that the residue $c$ of $a$ modulo $b$ is equal to 
-
-\begin{equation}
-c = a - b \cdot \lfloor a/b \rfloor
-\end{equation}
-
-Since algorithms such as modular exponentiation would be using the same modulus extensively, typical DSP intuition would indicate the next step 
-would be to replace $a/b$ by a multiplication by the reciprocal.  However, DSP intuition on its own will not work as these numbers are considerably
-larger than the precision of common DSP floating point data types.  It would take another common optimization to optimize the algorithm.
-
-\subsection{Fixed Point Arithmetic}
-The trick used to optimize the above equation is based on a technique of emulating floating point data types with fixed precision integers.  Fixed
-point arithmetic would vastly popularize the ``3d-shooter'' genre of games in the mid 1990s when floating point units were fairly slow.  The idea behind
-fixed point arithmetic is to take a normal $k$-bit integer data type and break it into $p$-bit integer and a $q$-bit fraction part 
-(\textit{where $p+q = k$}).  
-
-In this system a $k$-bit integer $n$ would actually represent $n/2^q$.  For example, with $q = 4$ the integer $n = 37$ would actually represent the
-value $2.3125$.  To multiply two fixed point numbers the integers are multiplied using traditional arithmetic and subsequently normalized. For example, 
-with $q = 4$ to multiply the integers $9$ and $5$ they must be converted to fixed point first by multiplying by $2^q$.  Let $a = 9(2^q)$ 
-represent the fixed point representation of $9$ and $b = 5(2^q)$ represent the fixed point representation of $5$.  The product $ab$ is equal to
-$45(2^{2q})$ which when normalized produces $45(2^q)$.  
-
-Using fixed point arithmetic division can be easily achieved by multiplying by the reciprocal.  If $2^q$ is equivalent to one then $2^q/b$ is 
-equivalent to $1/b$ using real arithmetic.  Using this fact dividing an integer $a$ by another integer $b$ can be achieved with the following
-expression.
-
-\begin{equation}
-\lfloor (a \cdot (\lfloor 2^q / b \rfloor))/2^q \rfloor
-\end{equation}
-
-The precision of the division is proportional to the value of $q$.  If the divisor $b$ is used frequently as is the case with 
-modular exponentiation pre-computing $2^q/b$ will allow a division to be performed with a multiplication and a right shift.  Both operations
-are considerably faster than division on most processors.  
-
-Consider dividing $19$ by $5$.  The correct result is $\lfloor 19/5 \rfloor = 3$.  With $q = 3$ the reciprocal is $\lfloor 2^q/5 \rfloor = 1$ which
-leads to a product of $19$ which when divided by $2^q$ produces $2$.  However, with $q = 4$ the reciprocal is $\lfloor 2^q/5 \rfloor = 3$ and
-the result of the emulated division is $\lfloor 3 \cdot 19 / 2^q \rfloor = 3$ which is correct.  
-
-Plugging this form of division into the original equation the following modular residue equation arises.
-
-\begin{equation}
-c = a - b \cdot \lfloor (a \cdot (\lfloor 2^q / b \rfloor))/2^q \rfloor
-\end{equation}
-
-Using the notation from \cite{BARRETT} the value of $\lfloor 2^q / b \rfloor$ will be represented by the $\mu$ symbol.  Using the $\mu$
-variable also helps re-inforce the idea that it is meant to be computed once and re-used.
-
-\begin{equation}
-c = a - b \cdot \lfloor (a \cdot \mu)/2^q \rfloor
-\end{equation}
-
-Provided that $2^q > b^2$ this algorithm will produce a quotient that is either exactly correct or off by a value of one.  Let $n$ represent
-the number of digits in $b$.  This algorithm requires approximately $2n^2$ single precision multiplications to produce the quotient and 
-another $n^2$ single precision multiplications to find the residue.  In total $3n^2$ single precision multiplications are required to 
-reduce the number.  
-
-For example, if $b = 1179677$ and $q = 41$ ($2^q > b^2$), then the reciprocal $\mu$ is equal to $\lfloor 2^q / b \rfloor = 1864089$.  Consider reducing
-$a = 180388626447$ modulo $b$ using the above reduction equation.  The quotient using the new formula is $\lfloor (a \cdot \mu) / 2^q \rfloor = 152913$.
-By subtracting $152913b$ from $a$ the correct residue $a \equiv 677346 \mbox{ (mod }b\mbox{)}$ is found.
-
-\subsection{Choosing a Radix Point}
-Using the fixed point representation a modular reduction can be performed with $3n^2$ single precision multiplications.  If that were the best
-that could be achieved a full division might as well be used in its place.  The key to optimizing the reduction is to reduce the precision of
-the initial multiplication that finds the quotient.  
-
-Let $a$ represent the number of which the residue is sought.  Let $b$ represent the modulus used to find the residue.  Let $m$ represent
-the number of digits in $b$.  For the purposes of this discussion we will assume that the number of digits in $a$ is $2m$.  Dividing $a$ by 
-$b$ is the same as dividing a $2m$ digit integer by a $m$ digit integer.  Digits below the $m - 1$'th digit of $a$ will contribute at most a value
-of $1$ to the quotient because $\beta^k < b$ for any $0 \le k \le m - 1$.  
-
-Since those digits do not contribute much to the quotient the observation is that they might as well be zero.  However, if the digits 
-``might as well be zero'' they might as well not be there in the first place.  Let $q_0 = \lfloor a/\beta^{m-1} \rfloor$ represent the input
-with the zeroes trimmed.  Now the modular reduction is trimmed to the almost equivalent equation
-
-\begin{equation}
-c = a - b \cdot \lfloor (q_0 \cdot \mu) / \beta^{m+1} \rfloor
-\end{equation}
-
-Note that the original divisor $2^q$ has been replaced with $\beta^{m+1}$. Also note that the exponent on the divisor when added to the amount $q_0$
-was shifted by equals $2m$.  If the optimization had not been performed the divisor would have the exponent $2m$ so in the end the exponents
-do ``add up''. Using the above equation the quotient $\lfloor (q_0 \cdot \mu) / \beta^{m+1} \rfloor$ can be off from the true quotient by at most 
-two implying that $0 \le a - b \cdot \lfloor (q_0 \cdot \mu) / \beta^{m+1} \rfloor < 3b$.  By first subtracting $b$ times the quotient and then 
-conditionally subtracting $b$ once or twice the residue is found.
-
-The quotient is now found using $(m + 1)(m) = m^2 + m$ single precision multiplications and the residue with an additional $m^2$ single
-precision multiplications.  In total $2m^2 + m$ single precision multiplications are required which is considerably faster than the original
-attempt.
-
-For example, let $\beta = 10$ represent the radix of the digits.  Let $b = 9999$ represent the modulus which implies $m = 4$. Let $a = 99929878$ 
-represent the value of which the residue is desired.  In this case $q = 8$ since $10^7 < 9999^2$ meaning that $\mu = \lfloor \beta^{q}/b \rfloor = 10001$.  
-With the new observation the multiplicand for the quotient is equal to $q_0 = \lfloor a / \beta^{m - 1} \rfloor = 99929$.  The quotient is then 
-$\lfloor (q_0 \cdot \mu) / \beta^{m+1} \rfloor = 9993$.  Subtracting $9993b$ from $a$ yields the correct residue $a \equiv 9871 \mbox{ (mod }b\mbox{)}$.  
-
-\subsection{Trimming the Quotient}
-So far the reduction algorithm has been optimized from $3m^2$ single precision multiplications down to $2m^2 + m$ single precision multiplications.  As 
-it stands now the algorithm is already fairly fast compared to a full integer division algorithm.  However, there is still room for
-optimization.  
-
-After the first multiplication inside the quotient ($q_0 \cdot \mu$) the value is shifted right by $m + 1$ places effectively nullifying the lower
-half of the product.  It would be nice to be able to remove those digits from the product to effectively cut down the number of single precision 
-multiplications.  If the number of digits in the modulus $m$ is far less than $\beta$ a full product is not required for the algorithm to work properly.  
-In fact the lower $m - 2$ digits will not affect the upper half of the product at all and do not need to be computed.  
-
-The value of $\mu$ is a $m$-digit number and $q_0$ is a $m + 1$ digit number.  Using a full multiplier $(m + 1)(m) = m^2 + m$ single precision
-multiplications would be required.  Using a multiplier that will only produce digits at and above the $m - 1$'th digit reduces the number
-of single precision multiplications to ${m^2 + m} \over 2$ single precision multiplications.  
-
-\subsection{Trimming the Residue}
-After the quotient has been calculated it is used to reduce the input.  As previously noted the algorithm is not exact and it can be off by a small
-multiple of the modulus, that is $0 \le a - b \cdot \lfloor (q_0 \cdot \mu) / \beta^{m+1} \rfloor < 3b$.  If $b$ is $m$ digits then the 
-result of the reduction equation is a value of at most $m + 1$ digits (\textit{provided $3 < \beta$}) implying that the upper $m - 1$ digits are
-implicitly zero.  
-
-The next optimization arises from this very fact.  Instead of computing $b \cdot \lfloor (q_0 \cdot \mu) / \beta^{m+1} \rfloor$ using a full
-$O(m^2)$ multiplication algorithm only the lower $m+1$ digits of the product have to be computed.  Similarly the value of $a$ can
-be reduced modulo $\beta^{m+1}$ before the multiple of $b$ is subtracted which simplifies the subtraction as well.  A multiplication that produces 
-only the lower $m+1$ digits requires ${m^2 + 3m - 2} \over 2$ single precision multiplications.  
-
-With both optimizations in place the algorithm is the algorithm Barrett proposed.  It requires $m^2 + 2m - 1$ single precision multiplications which
-is considerably faster than the straightforward $3m^2$ method.  
-
-\subsection{The Barrett Algorithm}
-\newpage\begin{figure}[!here]
-\begin{small}
-\begin{center}
-\begin{tabular}{l}
-\hline Algorithm \textbf{mp\_reduce}. \\
-\textbf{Input}.   mp\_int $a$, mp\_int $b$ and $\mu = \lfloor \beta^{2m}/b \rfloor$ $(0 \le a < b^2, b > 1)$ \\
-\textbf{Output}.  $c \leftarrow a \mbox{ (mod }b\mbox{)}$ \\
-\hline \\
-Let $m$ represent the number of digits in $b$.  \\
-1.  Make a copy of $a$ and store it in $q$.  (\textit{mp\_init\_copy}) \\
-2.  $q \leftarrow \lfloor q / \beta^{m - 1} \rfloor$ (\textit{mp\_rshd}) \\
-\\
-Produce the quotient. \\
-3.  $q \leftarrow q \cdot \mu$  (\textit{note: only produce digits at or above $m-1$}) \\
-4.  $q \leftarrow \lfloor q / \beta^{m + 1} \rfloor$ \\
-\\
-Subtract the multiple of modulus from the input. \\
-5.  $c \leftarrow a \mbox{ (mod }\beta^{m+1}\mbox{)}$ (\textit{mp\_mod\_2d}) \\
-6.  $q \leftarrow q \cdot b \mbox{ (mod }\beta^{m+1}\mbox{)}$ (\textit{s\_mp\_mul\_digs}) \\
-7.  $c \leftarrow c - q$ (\textit{mp\_sub}) \\
-\\
-Add $\beta^{m+1}$ if a carry occurred. \\
-8.  If $c < 0$ then (\textit{mp\_cmp\_d}) \\
-\hspace{3mm}8.1  $q \leftarrow 1$ (\textit{mp\_set}) \\
-\hspace{3mm}8.2  $q \leftarrow q \cdot \beta^{m+1}$ (\textit{mp\_lshd}) \\
-\hspace{3mm}8.3  $c \leftarrow c + q$ \\
-\\
-Now subtract the modulus if the residue is too large (e.g. quotient too small). \\
-9.  While $c \ge b$ do (\textit{mp\_cmp}) \\
-\hspace{3mm}9.1  $c \leftarrow c - b$ \\
-10.  Clear $q$. \\
-11.  Return(\textit{MP\_OKAY}) \\
-\hline
-\end{tabular}
-\end{center}
-\end{small}
-\caption{Algorithm mp\_reduce}
-\end{figure}
-
-\textbf{Algorithm mp\_reduce.}
-This algorithm will reduce the input $a$ modulo $b$ in place using the Barrett algorithm.  It is loosely based on algorithm 14.42 of HAC
-\cite[pp.  604]{HAC} which is based on the paper from Paul Barrett \cite{BARRETT}.  The algorithm has several restrictions and assumptions which must be adhered to
-for the algorithm to work.
-
-First the modulus $b$ is assumed to be positive and greater than one.  If the modulus were less than or equal to one then subtracting
-a multiple of it would either accomplish nothing or actually enlarge the input.  The input $a$ must be in the range $0 \le a < b^2$ in order
-for the quotient to have enough precision.  Technically the algorithm will still work if $a \ge b^2$ but it will take much longer to finish.  The
-value of $\mu$ is passed as an argument to this algorithm and is assumed to be setup before the algorithm is used.  
-
-Recall that the multiplication for the quotient on step 3 must only produce digits at or above the $m-1$'th position.  An algorithm called 
-$s\_mp\_mul\_high\_digs$ which has not been presented is used to accomplish this task.  This optimal algorithm can only be used if the number
-of digits in $b$ is very much smaller than $\beta$.  
-
-After the multiple of the modulus has been subtracted from $a$ the residue must be fixed up in case it is negative.  While it is known that 
-$a \ge b \cdot \lfloor (q_0 \cdot \mu) / \beta^{m+1} \rfloor$ only the lower $m+1$ digits are being used to compute the residue.  In this case 
-the invariant $\beta^{m+1}$ must be added to the residue to make it positive again.  
-
-The while loop at step 9 will subtract $b$ until the residue is less than $b$.  If the algorithm is performed correctly this step is only
-performed up to two times.  However, if $a \ge b^2$ then it will iterate substantially more times than it should.
-
-\index{bn\_mp\_reduce.c}
-\vspace{+3mm}\begin{small}
-\hspace{-5.1mm}{\bf File}: bn\_mp\_reduce.c
-\vspace{-3mm}
-\begin{alltt}
-016   
-017   /* reduces x mod m, assumes 0 < x < m**2, mu is 
-018    * precomputed via mp_reduce_setup.
-019    * From HAC pp.604 Algorithm 14.42
-020    */
-021   int
-022   mp_reduce (mp_int * x, mp_int * m, mp_int * mu)
-023   \{
-024     mp_int  q;
-025     int     res, um = m->used;
-026   
-027     /* q = x */
-028     if ((res = mp_init_copy (&q, x)) != MP_OKAY) \{
-029       return res;
-030     \}
-031   
-032     /* q1 = x / b**(k-1)  */
-033     mp_rshd (&q, um - 1);         
-034   
-035     /* according to HAC this is optimization is ok */
-036     if (((unsigned long) m->used) > (((mp_digit)1) << (DIGIT_BIT - 1))) \{
-037       if ((res = mp_mul (&q, mu, &q)) != MP_OKAY) \{
-038         goto CLEANUP;
-039       \}
-040     \} else \{
-041       if ((res = s_mp_mul_high_digs (&q, mu, &q, um - 1)) != MP_OKAY) \{
-042         goto CLEANUP;
-043       \}
-044     \}
-045   
-046     /* q3 = q2 / b**(k+1) */
-047     mp_rshd (&q, um + 1);         
-048   
-049     /* x = x mod b**(k+1), quick (no division) */
-050     if ((res = mp_mod_2d (x, DIGIT_BIT * (um + 1), x)) != MP_OKAY) \{
-051       goto CLEANUP;
-052     \}
-053   
-054     /* q = q * m mod b**(k+1), quick (no division) */
-055     if ((res = s_mp_mul_digs (&q, m, &q, um + 1)) != MP_OKAY) \{
-056       goto CLEANUP;
-057     \}
-058   
-059     /* x = x - q */
-060     if ((res = mp_sub (x, &q, x)) != MP_OKAY) \{
-061       goto CLEANUP;
-062     \}
-063   
-064     /* If x < 0, add b**(k+1) to it */
-065     if (mp_cmp_d (x, 0) == MP_LT) \{
-066       mp_set (&q, 1);
-067       if ((res = mp_lshd (&q, um + 1)) != MP_OKAY)
-068         goto CLEANUP;
-069       if ((res = mp_add (x, &q, x)) != MP_OKAY)
-070         goto CLEANUP;
-071     \}
-072   
-073     /* Back off if it's too big */
-074     while (mp_cmp (x, m) != MP_LT) \{
-075       if ((res = s_mp_sub (x, m, x)) != MP_OKAY) \{
-076         break;
-077       \}
-078     \}
-079     
-080   CLEANUP:
-081     mp_clear (&q);
-082   
-083     return res;
-084   \}
-\end{alltt}
-\end{small}
-
-The first multiplication that determines the quotient can be performed by only producing the digits from $m - 1$ and up.  This essentially halves
-the number of single precision multiplications required.  However, the optimization is only safe if $\beta$ is much larger than the number of digits
-in the modulus.  In the source code this is evaluated on lines 36 to 44 where algorithm s\_mp\_mul\_high\_digs is used when it is
-safe to do so.  
-
-\subsection{The Barrett Setup Algorithm}
-In order to use algorithm mp\_reduce the value of $\mu$ must be calculated in advance.  Ideally this value should be computed once and stored for
-future use so that the Barrett algorithm can be used without delay.  
-
-\begin{figure}[!here]
-\begin{small}
-\begin{center}
-\begin{tabular}{l}
-\hline Algorithm \textbf{mp\_reduce\_setup}. \\
-\textbf{Input}.   mp\_int $b$ ($b > 1$)  \\
-\textbf{Output}.  $\mu \leftarrow \lfloor \beta^{2m}/b \rfloor$ \\
-\hline \\
-1.  $\mu \leftarrow 2^{2 \cdot lg(\beta) \cdot  m}$ (\textit{mp\_2expt}) \\
-2.  $\mu \leftarrow \lfloor \mu / b \rfloor$ (\textit{mp\_div}) \\
-3.  Return(\textit{MP\_OKAY}) \\
-\hline
-\end{tabular}
-\end{center}
-\end{small}
-\caption{Algorithm mp\_reduce\_setup}
-\end{figure}
-
-\textbf{Algorithm mp\_reduce\_setup.}
-This algorithm computes the reciprocal $\mu$ required for Barrett reduction.  First $\beta^{2m}$ is calculated as $2^{2 \cdot lg(\beta) \cdot  m}$ which
-is equivalent and much faster.  The final value is computed by taking the integer quotient of $\lfloor \mu / b \rfloor$.
-
-\index{bn\_mp\_reduce\_setup.c}
-\vspace{+3mm}\begin{small}
-\hspace{-5.1mm}{\bf File}: bn\_mp\_reduce\_setup.c
-\vspace{-3mm}
-\begin{alltt}
-016   
-017   /* pre-calculate the value required for Barrett reduction
-018    * For a given modulus "b" it calulates the value required in "a"
-019    */
-020   int
-021   mp_reduce_setup (mp_int * a, mp_int * b)
-022   \{
-023     int     res;
-024     
-025     if ((res = mp_2expt (a, b->used * 2 * DIGIT_BIT)) != MP_OKAY) \{
-026       return res;
-027     \}
-028     return mp_div (a, b, a, NULL);
-029   \}
-\end{alltt}
-\end{small}
-
-This simple routine calculates the reciprocal $\mu$ required by Barrett reduction.  Note the extended usage of algorithm mp\_div where the variable
-which would receive the remainder is passed as NULL.  As will be discussed in section 9.1 the division routine allows both the quotient and the 
-remainder to be passed as NULL meaning to ignore the value.  
-
-\section{The Montgomery Reduction}
-Montgomery reduction\footnote{Thanks to Niels Ferguson for his insightful explanation of the algorithm.} \cite{MONT} is by far the most interesting 
-form of reduction in common use.  It computes a modular residue which is not actually equal to the residue of the input yet instead equal to a 
-residue times a constant.  However, as perplexing as this may sound the algorithm is relatively simple and very efficient.  
-
-Throughout this entire section the variable $n$ will represent the modulus used to form the residue.  As will be discussed shortly the value of
-$n$ must be odd.  The variable $x$ will represent the quantity of which the residue is sought.  Similar to the Barrett algorithm the input
-is restricted to $0 \le x < n^2$.  To begin the description some simple number theory facts must be established.
-
-\textbf{Fact 1.}  Adding $n$ to $x$ does not change the residue since in effect it adds one to the quotient $\lfloor x / n \rfloor$.  Another way
-to explain this is that $n$ (\textit{or multiples of $n$}) is congruent to zero modulo $n$.  Adding zero will not change the value of the residue.  
-
-\textbf{Fact 2.}  If $x$ is even then performing a division by two in $\Z$ is congruent to $x \cdot 2^{-1} \mbox{ (mod }n\mbox{)}$.  Actually
-this is an application of the fact that if $x$ is evenly divisible by any $k \in \Z$ then division in $\Z$ will be congruent to 
-multiplication by $k^{-1}$ modulo $n$.  
-
-From these two simple facts the following simple algorithm can be derived.
-
-\newpage\begin{figure}[!here]
-\begin{small}
-\begin{center}
-\begin{tabular}{l}
-\hline Algorithm \textbf{Montgomery Reduction}. \\
-\textbf{Input}.   Integer $x$, $n$ and $k$ \\
-\textbf{Output}.  $2^{-k}x \mbox{ (mod }n\mbox{)}$ \\
-\hline \\
-1.  for $t$ from $1$ to $k$ do \\
-\hspace{3mm}1.1  If $x$ is odd then \\
-\hspace{6mm}1.1.1  $x \leftarrow x + n$ \\
-\hspace{3mm}1.2  $x \leftarrow x/2$ \\
-2.  Return $x$. \\
-\hline
-\end{tabular}
-\end{center}
-\end{small}
-\caption{Algorithm Montgomery Reduction}
-\end{figure}
-
-The algorithm reduces the input one bit at a time using the two congruences stated previously.  Inside the loop $n$, which is odd, is
-added to $x$ if $x$ is odd.  This forces $x$ to be even which allows the division by two in $\Z$ to be congruent to a modular division by two.  Since
-$x$ is assumed to be initially much larger than $n$ the addition of $n$ will contribute an insignificant magnitude to $x$.  Let $r$ represent the 
-final result of the Montgomery algorithm.  If $k > lg(n)$ and $0 \le x < n^2$ then the final result is limited to 
-$0 \le r < \lfloor x/2^k \rfloor + n$.  As a result at most a single subtraction is required to get the residue desired.
-
-\begin{figure}[here]
-\begin{small}
-\begin{center}
-\begin{tabular}{|c|l|}
-\hline \textbf{Step number ($t$)} & \textbf{Result ($x$)} \\
-\hline $1$ & $x + n = 5812$, $x/2 = 2906$ \\
-\hline $2$ & $x/2 = 1453$ \\
-\hline $3$ & $x + n = 1710$, $x/2 = 855$ \\
-\hline $4$ & $x + n = 1112$, $x/2 = 556$ \\
-\hline $5$ & $x/2 = 278$ \\
-\hline $6$ & $x/2 = 139$ \\
-\hline $7$ & $x + n = 396$, $x/2 = 198$ \\
-\hline $8$ & $x/2 = 99$ \\
-\hline
-\end{tabular}
-\end{center}
-\end{small}
-\caption{Example of Montgomery Reduction (I)}
-\label{fig:MONT1}
-\end{figure}
-
-Consider the example in figure~\ref{fig:MONT1} which reduces $x = 5555$ modulo $n = 257$ when $k = 8$.  The final result $r = 99$ which is actually
-$2^{-8} \cdot 5555 \mbox{ (mod }257\mbox{)}$ can reveal the residue $x \equiv 158$ by multiplying by $2^8$ modulo $n$.  
-
-Let $k = \lfloor lg(n) \rfloor + 1$ represent the number of bits in $n$.  The current algorithm requires $2k^2$ single precision shifts
-and $k^2$ single precision additions.  At this rate the algorithm is most certainly slower than Barrett reduction and not terribly useful.  
-Fortunately there exists an alternative representation of the algorithm.
-
-\begin{figure}[!here]
-\begin{small}
-\begin{center}
-\begin{tabular}{l}
-\hline Algorithm \textbf{Montgomery Reduction} (modified I). \\
-\textbf{Input}.   Integer $x$, $n$ and $k$ \\
-\textbf{Output}.  $2^{-k}x \mbox{ (mod }n\mbox{)}$ \\
-\hline \\
-1.  for $t$ from $0$ to $k - 1$ do \\
-\hspace{3mm}1.1  If the $t$'th bit of $x$ is one then \\
-\hspace{6mm}1.1.1  $x \leftarrow x + 2^tn$ \\
-2.  Return $x/2^k$. \\
-\hline
-\end{tabular}
-\end{center}
-\end{small}
-\caption{Algorithm Montgomery Reduction (modified I)}
-\end{figure}
-
-This algorithm is equivalent since $2^tn$ is a multiple of $n$ and the lower $k$ bits of $x$ are zero by step 2.  The number of single
-precision shifts has now been reduced from $2k^2$ to $k^2 + k$ which is only a small improvement.
-
-\begin{figure}[here]
-\begin{small}
-\begin{center}
-\begin{tabular}{|c|l|}
-\hline \textbf{Step number ($t$)} & \textbf{Result ($x$)} \\
-\hline $1$ & $x + 2^{0}n = 5812$ \\
-\hline $2$ & $5812$ \\
-\hline $3$ & $x + 2^{2}n = 6840$ \\
-\hline $4$ & $x + 2^{3}n = 8896$ \\
-\hline $5$ & $8896$ \\
-\hline $6$ & $8896$ \\
-\hline $7$ & $x + 2^{6}n = 25344$ \\
-\hline $8$ & $25344$ \\
-\hline -- & $x/2^k = 99$ \\
-\hline
-\end{tabular}
-\end{center}
-\end{small}
-\caption{Example of Montgomery Reduction (II)}
-\label{fig:MONT2}
-\end{figure}
-
-Figure~\ref{fig:MONT2} demonstrates the modified algorithm reducing $x = 5555$ modulo $n = 257$ with $k = 8$. 
-With this algorithm a single shift right at the end is the only right shift required to reduce the input instead of $k$ right shifts inside the 
-loop.  Note that in the iterations $t = 2, 5, 6$ and $8$ the result $x$ is not changed.  In those iterations the $t$'th bit of $x$ is 
-zero and the appropriate multiple of $n$ does not need to be added to force the $t$'th bit of the result to zero.  
-
-\subsection{Digit Based Montgomery Reduction}
-Instead of computing the reduction on a bit-by-bit basis it is actually much faster to compute it on digit-by-digit basis.  Consider the
-previous algorithm re-written to compute the Montgomery reduction in this new fashion.
-
-\begin{figure}[!here]
-\begin{small}
-\begin{center}
-\begin{tabular}{l}
-\hline Algorithm \textbf{Montgomery Reduction} (modified II). \\
-\textbf{Input}.   Integer $x$, $n$ and $k$ \\
-\textbf{Output}.  $\beta^{-k}x \mbox{ (mod }n\mbox{)}$ \\
-\hline \\
-1.  for $t$ from $0$ to $k - 1$ do \\
-\hspace{3mm}1.1  $x \leftarrow x + \mu n \beta^t$ \\
-2.  Return $x/\beta^k$. \\
-\hline
-\end{tabular}
-\end{center}
-\end{small}
-\caption{Algorithm Montgomery Reduction (modified II)}
-\end{figure}
-
-The value $\mu n \beta^t$ is a multiple of the modulus $n$ meaning that it will not change the residue.  If the first digit of 
-the value $\mu n \beta^t$ equals the negative (modulo $\beta$) of the $t$'th digit of $x$ then the addition will result in a zero digit.  This
-problem breaks down to solving the following congruency.  
-
-\begin{center}
-\begin{tabular}{rcl}
-$x_t + \mu n_0$ & $\equiv$ & $0 \mbox{ (mod }\beta\mbox{)}$ \\
-$\mu n_0$ & $\equiv$ & $-x_t \mbox{ (mod }\beta\mbox{)}$ \\
-$\mu$ & $\equiv$ & $-x_t/n_0 \mbox{ (mod }\beta\mbox{)}$ \\
-\end{tabular}
-\end{center}
-
-In each iteration of the loop on step 1 a new value of $\mu$ must be calculated.  The value of $-1/n_0 \mbox{ (mod }\beta\mbox{)}$ is used 
-extensively in this algorithm and should be precomputed.  Let $\rho$ represent the negative of the modular inverse of $n_0$ modulo $\beta$.  
-
-For example, let $\beta = 10$ represent the radix.  Let $n = 17$ represent the modulus which implies $k = 2$ and $\rho \equiv 7$.  Let $x = 33$ 
-represent the value to reduce.
-
-\newpage\begin{figure}
-\begin{center}
-\begin{tabular}{|c|c|c|}
-\hline \textbf{Step ($t$)} & \textbf{Value of $x$} & \textbf{Value of $\mu$} \\
-\hline --                 & $33$ & --\\
-\hline $0$                 & $33 + \mu n = 50$ & $1$ \\
-\hline $1$                 & $50 + \mu n \beta = 900$ & $5$ \\
-\hline
-\end{tabular}
-\end{center}
-\caption{Example of Montgomery Reduction}
-\end{figure}
-
-The final result $900$ is then divided by $\beta^k$ to produce the final result $9$.  The first observation is that $9 \nequiv x \mbox{ (mod }n\mbox{)}$ 
-which implies the result is not the modular residue of $x$ modulo $n$.  However, recall that the residue is actually multiplied by $\beta^{-k}$ in
-the algorithm.  To get the true residue the value must be multiplied by $\beta^k$.  In this case $\beta^k \equiv 15 \mbox{ (mod }n\mbox{)}$ and
-the correct residue is $9 \cdot 15 \equiv 16 \mbox{ (mod }n\mbox{)}$.  
-
-\subsection{Baseline Montgomery Reduction}
-The baseline Montgomery reduction algorithm will produce the residue for any size input.  It is designed to be a catch-all algorithm for 
-Montgomery reductions.  
-
-\newpage\begin{figure}[!here]
-\begin{small}
-\begin{center}
-\begin{tabular}{l}
-\hline Algorithm \textbf{mp\_montgomery\_reduce}. \\
-\textbf{Input}.   mp\_int $x$, mp\_int $n$ and a digit $\rho \equiv -1/n_0 \mbox{ (mod }\beta\mbox{)}$. \\
-\hspace{11.5mm}($0 \le x < n^2, n > 1, (n, \beta) = 1, \beta^k > n$) \\
-\textbf{Output}.  $\beta^{-k}x \mbox{ (mod }n\mbox{)}$ \\
-\hline \\
-1.  $digs \leftarrow 2n.used + 1$ \\
-2.  If $digs < MP\_WARRAY$ and $n.used < \delta$ then \\
-\hspace{3mm}2.1  Use algorithm fast\_mp\_montgomery\_reduce instead. \\
-\\
-Setup $x$ for the reduction. \\
-3.  If $x.alloc < digs$ then grow $x$ to $digs$ digits. \\
-4.  $x.used \leftarrow digs$ \\
-\\
-Eliminate the lower $k$ digits. \\
-5.  For $ix$ from $0$ to $k - 1$ do \\
-\hspace{3mm}5.1  $\mu \leftarrow x_{ix} \cdot \rho \mbox{ (mod }\beta\mbox{)}$ \\
-\hspace{3mm}5.2  $u \leftarrow 0$ \\
-\hspace{3mm}5.3  For $iy$ from $0$ to $k - 1$ do \\
-\hspace{6mm}5.3.1  $\hat r \leftarrow \mu n_{iy} + x_{ix + iy} + u$ \\
-\hspace{6mm}5.3.2  $x_{ix + iy} \leftarrow \hat r \mbox{ (mod }\beta\mbox{)}$ \\
-\hspace{6mm}5.3.3  $u \leftarrow \lfloor \hat r / \beta \rfloor$ \\
-\hspace{3mm}5.4  While $u > 0$ do \\
-\hspace{6mm}5.4.1  $iy \leftarrow iy + 1$ \\
-\hspace{6mm}5.4.2  $x_{ix + iy} \leftarrow x_{ix + iy} + u$ \\
-\hspace{6mm}5.4.3  $u \leftarrow \lfloor x_{ix+iy} / \beta \rfloor$ \\
-\hspace{6mm}5.4.4  $x_{ix + iy} \leftarrow x_{ix+iy} \mbox{ (mod }\beta\mbox{)}$ \\
-\\
-Divide by $\beta^k$ and fix up as required. \\
-6.  $x \leftarrow \lfloor x / \beta^k \rfloor$ \\
-7.  If $x \ge n$ then \\
-\hspace{3mm}7.1  $x \leftarrow x - n$ \\
-8.  Return(\textit{MP\_OKAY}). \\
-\hline
-\end{tabular}
-\end{center}
-\end{small}
-\caption{Algorithm mp\_montgomery\_reduce}
-\end{figure}
-
-\textbf{Algorithm mp\_montgomery\_reduce.}
-This algorithm reduces the input $x$ modulo $n$ in place using the Montgomery reduction algorithm.  The algorithm is loosely based
-on algorithm 14.32 of \cite[pp.601]{HAC} except it merges the multiplication of $\mu n \beta^t$ with the addition in the inner loop.  The
-restrictions on this algorithm are fairly easy to adapt to.  First $0 \le x < n^2$ bounds the input to numbers in the same range as 
-for the Barrett algorithm.  Additionally $n > 1$ will ensure a modular inverse $\rho$ exists.  $\rho$ must be calculated in
-advance of this algorithm.  Finally the variable $k$ is fixed and a pseudonym for $n.used$.  
-
-Step 2 decides whether a faster Montgomery algorithm can be used.  It is based on the Comba technique meaning that there are limits on
-the size of the input.  This algorithm is discussed in sub-section 7.3.3.
-
-Step 5 is the main reduction loop of the algorithm.  The value of $\mu$ is calculated once per iteration in the outer loop.  The inner loop
-calculates $x + \mu n \beta^{ix}$ by multiplying $\mu n$ and adding the result to $x$ shifted by $ix$ digits.  Both the addition and
-multiplication are performed in the same loop to save time and memory.  Step 5.4 will handle any additional carries that escape the inner loop.
-
-Using a quick inspection this algorithm requires $n$ single precision multiplications for the outer loop and $n^2$ single precision multiplications 
-in the inner loop.  In total $n^2 + n$ single precision multiplications which compares favourably to Barrett at $n^2 + 2n - 1$ single precision
-multiplications.  
-
-\index{bn\_mp\_montgomery\_reduce.c}
-\vspace{+3mm}\begin{small}
-\hspace{-5.1mm}{\bf File}: bn\_mp\_montgomery\_reduce.c
-\vspace{-3mm}
-\begin{alltt}
-016   
-017   /* computes xR**-1 == x (mod N) via Montgomery Reduction */
-018   int
-019   mp_montgomery_reduce (mp_int * x, mp_int * n, mp_digit rho)
-020   \{
-021     int     ix, res, digs;
-022     mp_digit mu;
-023   
-024     /* can the fast reduction [comba] method be used?
-025      *
-026      * Note that unlike in mp_mul you're safely allowed *less*
-027      * than the available columns [255 per default] since carries
-028      * are fixed up in the inner loop.
-029      */
-030     digs = n->used * 2 + 1;
-031     if ((digs < MP_WARRAY) && 
-032         n->used < 
-033         (1 << ((CHAR_BIT * sizeof (mp_word)) - (2 * DIGIT_BIT)))) \{
-034       return fast_mp_montgomery_reduce (x, n, rho);
-035     \}
-036   
-037     /* grow the input as required */
-038     if (x->alloc < digs) \{
-039       if ((res = mp_grow (x, digs)) != MP_OKAY) \{
-040         return res;
-041       \}
-042     \}
-043     x->used = digs;
-044   
-045     for (ix = 0; ix < n->used; ix++) \{
-046       /* mu = ai * m' mod b */
-047       mu = (x->dp[ix] * rho) & MP_MASK;
-048   
-049       /* a = a + mu * m * b**i */
-050       \{
-051         register int iy;
-052         register mp_digit *tmpn, *tmpx, u;
-053         register mp_word r;
-054   
-055         /* aliases */
-056         tmpn = n->dp;
-057         tmpx = x->dp + ix;
-058   
-059         /* set the carry to zero */
-060         u = 0;
-061         
-062         /* Multiply and add in place */
-063         for (iy = 0; iy < n->used; iy++) \{
-064           r = ((mp_word) mu) * ((mp_word) * tmpn++) + 
-065               ((mp_word) u) + ((mp_word) * tmpx);
-066           u = (r >> ((mp_word) DIGIT_BIT));
-067           *tmpx++ = (r & ((mp_word) MP_MASK));
-068         \}
-069         /* propagate carries */
-070         while (u) \{
-071           *tmpx   += u;
-072           u        = *tmpx >> DIGIT_BIT;
-073           *tmpx++ &= MP_MASK;
-074         \}
-075       \}
-076     \}
-077   
-078     /* x = x/b**n.used */
-079     mp_clamp(x);
-080     mp_rshd (x, n->used);
-081   
-082     /* if A >= m then A = A - m */
-083     if (mp_cmp_mag (x, n) != MP_LT) \{
-084       return s_mp_sub (x, n, x);
-085     \}
-086   
-087     return MP_OKAY;
-088   \}
-\end{alltt}
-\end{small}
-
-This is the baseline implementation of the Montgomery reduction algorithm.  Lines 30 to 35 determine if the Comba based
-routine can be used instead.  Line 47 computes the value of $\mu$ for that particular iteration of the outer loop.  
-
-The multiplication $\mu n \beta^{ix}$ is performed in one step in the inner loop.  The alias $tmpx$ refers to the $ix$'th digit of $x$ and
-the alias $tmpn$ refers to the modulus $n$.  
-
-\subsection{Faster ``Comba'' Montgomery Reduction}
-
-The Montgomery reduction requires fewer single precision multiplications than a Barrett reduction, however it is much slower due to the serial
-nature of the inner loop.  The Barrett reduction algorithm requires two slightly modified multipliers which can be implemented with the Comba
-technique.  The Montgomery reduction algorithm cannot directly use the Comba technique to any significant advantage since the inner loop calculates
-a $k \times 1$ product $k$ times. 
-
-The biggest obstacle is that at the $ix$'th iteration of the outer loop the value of $x_{ix}$ is required to calculate $\mu$.  This means the 
-carries from $0$ to $ix - 1$ must have been propagated upwards to form a valid $ix$'th digit.  The solution as it turns out is very simple.  
-Perform a Comba like multiplier and inside the outer loop just after the inner loop fix up the $ix + 1$'th digit by forwarding the carry.  
-
-With this change in place the Montgomery reduction algorithm can be performed with a Comba style multiplication loop which substantially increases
-the speed of the algorithm.  
-
-\newpage\begin{figure}[!here]
-\begin{small}
-\begin{center}
-\begin{tabular}{l}
-\hline Algorithm \textbf{fast\_mp\_montgomery\_reduce}. \\
-\textbf{Input}.   mp\_int $x$, mp\_int $n$ and a digit $\rho \equiv -1/n_0 \mbox{ (mod }\beta\mbox{)}$. \\
-\hspace{11.5mm}($0 \le x < n^2, n > 1, (n, \beta) = 1, \beta^k > n$) \\
-\textbf{Output}.  $\beta^{-k}x \mbox{ (mod }n\mbox{)}$ \\
-\hline \\
-Place an array of \textbf{MP\_WARRAY} mp\_word variables called $\hat W$ on the stack. \\
-1.  if $x.alloc < n.used + 1$ then grow $x$ to $n.used + 1$ digits. \\
-Copy the digits of $x$ into the array $\hat W$ \\
-2.  For $ix$ from $0$ to $x.used - 1$ do \\
-\hspace{3mm}2.1  $\hat W_{ix} \leftarrow x_{ix}$ \\
-3.  For $ix$ from $x.used$ to $2n.used - 1$ do \\
-\hspace{3mm}3.1  $\hat W_{ix} \leftarrow 0$ \\
-Eliminate the lower $k$ digits. \\
-4.  for $ix$ from $0$ to $n.used - 1$ do \\
-\hspace{3mm}4.1  $\mu \leftarrow \hat W_{ix} \cdot \rho \mbox{ (mod }\beta\mbox{)}$ \\
-\hspace{3mm}4.2  For $iy$ from $0$ to $n.used - 1$ do \\
-\hspace{6mm}4.2.1  $\hat W_{iy + ix} \leftarrow \hat W_{iy + ix} + \mu \cdot n_{iy}$ \\
-\hspace{3mm}4.3  $\hat W_{ix + 1} \leftarrow \hat W_{ix + 1} + \lfloor \hat W_{ix} / \beta \rfloor$ \\
-Propagate carries upwards. \\
-5.  for $ix$ from $n.used$ to $2n.used + 1$ do \\
-\hspace{3mm}5.1  $\hat W_{ix + 1} \leftarrow \hat W_{ix + 1} + \lfloor \hat W_{ix} / \beta \rfloor$ \\
-Shift right and reduce modulo $\beta$ simultaneously. \\
-6.  for $ix$ from $0$ to $n.used$ do \\
-\hspace{3mm}6.1  $x_{ix} \leftarrow \hat W_{ix + n.used} \mbox{ (mod }\beta\mbox{)}$ \\
-Zero excess digits and fixup $x$. \\
-7.  if $x.used > n.used + 1$ then do \\
-\hspace{3mm}7.1  for $ix$ from $n.used + 1$ to $x.used - 1$ do \\
-\hspace{6mm}7.1.1  $x_{ix} \leftarrow 0$ \\
-8.  $x.used \leftarrow n.used + 1$ \\
-9.  Clamp excessive digits of $x$. \\
-10.  If $x \ge n$ then \\
-\hspace{3mm}10.1  $x \leftarrow x - n$ \\
-11.  Return(\textit{MP\_OKAY}). \\
-\hline
-\end{tabular}
-\end{center}
-\end{small}
-\caption{Algorithm fast\_mp\_montgomery\_reduce}
-\end{figure}
-
-\textbf{Algorithm fast\_mp\_montgomery\_reduce.}
-This algorithm will compute the Montgomery reduction of $x$ modulo $n$ using the Comba technique.  It is on most computer platforms significantly
-faster than algorithm mp\_montgomery\_reduce and algorithm mp\_reduce (\textit{Barrett reduction}).  The algorithm has the same restrictions
-on the input as the baseline reduction algorithm.  An additional two restrictions are imposed on this algorithm.  The number of digits $k$ in 
-the modulus $n$ must not violate $MP\_WARRAY > 2k +1$ and $k < \delta$.   When $\beta = 2^{28}$ this algorithm can be used to reduce modulo
-a modulus of at most $3,556$ bits in length.  
-
-As in the other Comba reduction algorithms there is a $\hat W$ array which stores the columns of the product.  It is initially filled with the
-contents of $x$ with the excess digits zeroed.  The reduction loop is very similar to the baseline loop at heart.  The multiplication on step
-4.1 can be single precision only since $ab \mbox{ (mod }\beta\mbox{)} \equiv (a \mbox{ mod }\beta)(b \mbox{ mod }\beta)$.  Some multipliers such
-as those on the ARM processors take a variable length time to complete depending on the number of bytes of result it must produce.  By performing
-a single precision multiplication instead half the amount of time is spent.
-
-Also note that digit $\hat W_{ix}$ must have the carry from the $ix - 1$'th digit propagated upwards in order for this to work.  That is what step
-4.3 will do.  In effect over the $n.used$ iterations of the outer loop the $n.used$ lower columns all have their carries propagated forwards.  Note
-how the upper bits of those same words are not reduced modulo $\beta$.  This is because those values will be discarded shortly and there is no
-point.
-
-Step 5 will propagate the remainder of the carries upwards.  On step 6 the columns are reduced modulo $\beta$ and shifted simultaneously as they are
-stored in the destination $x$.  
-
-\index{bn\_fast\_mp\_montgomery\_reduce.c}
-\vspace{+3mm}\begin{small}
-\hspace{-5.1mm}{\bf File}: bn\_fast\_mp\_montgomery\_reduce.c
-\vspace{-3mm}
-\begin{alltt}
-016   
-017   /* computes xR**-1 == x (mod N) via Montgomery Reduction 
-018    * 
-019    * This is an optimized implementation of mp_montgomery_reduce 
-020    * which uses the comba method to quickly calculate the columns of the
-021    * reduction.  
-022    *
-023    * Based on Algorithm 14.32 on pp.601 of HAC.
-024   */
-025   int
-026   fast_mp_montgomery_reduce (mp_int * x, mp_int * n, mp_digit rho)
-027   \{
-028     int     ix, res, olduse;
-029     mp_word W[MP_WARRAY];
-030   
-031     /* get old used count */
-032     olduse = x->used;
-033   
-034     /* grow a as required */
-035     if (x->alloc < n->used + 1) \{
-036       if ((res = mp_grow (x, n->used + 1)) != MP_OKAY) \{
-037         return res;
-038       \}
-039     \}
-040   
-041     \{
-042       register mp_word *_W;
-043       register mp_digit *tmpx;
-044   
-045       _W = W;
-046       tmpx = x->dp;
-047   
-048       /* copy the digits of a into W[0..a->used-1] */
-049       for (ix = 0; ix < x->used; ix++) \{
-050         *_W++ = *tmpx++;
-051       \}
-052   
-053       /* zero the high words of W[a->used..m->used*2] */
-054       for (; ix < n->used * 2 + 1; ix++) \{
-055         *_W++ = 0;
-056       \}
-057     \}
-058   
-059     for (ix = 0; ix < n->used; ix++) \{
-060       /* mu = ai * m' mod b
-061        *
-062        * We avoid a double precision multiplication (which isn't required)
-063        * by casting the value down to a mp_digit.  Note this requires 
-064        * that W[ix-1] have  the carry cleared (see after the inner loop)
-065        */
-066       register mp_digit mu;
-067       mu = (((mp_digit) (W[ix] & MP_MASK)) * rho) & MP_MASK;
-068   
-069       /* a = a + mu * m * b**i
-070        *
-071        * This is computed in place and on the fly.  The multiplication
-072        * by b**i is handled by offseting which columns the results
-073        * are added to.
-074        *
-075        * Note the comba method normally doesn't handle carries in the 
-076        * inner loop In this case we fix the carry from the previous 
-077        * column since the Montgomery reduction requires digits of the 
-078        * result (so far) [see above] to work.  This is
-079        * handled by fixing up one carry after the inner loop.  The 
-080        * carry fixups are done in order so after these loops the 
-081        * first m->used words of W[] have the carries fixed
-082        */
-083       \{
-084         register int iy;
-085         register mp_digit *tmpn;
-086         register mp_word *_W;
-087   
-088         /* alias for the digits of the modulus */
-089         tmpn = n->dp;
-090   
-091         /* Alias for the columns set by an offset of ix */
-092         _W = W + ix;
-093   
-094         /* inner loop */
-095         for (iy = 0; iy < n->used; iy++) \{
-096             *_W++ += ((mp_word) mu) * ((mp_word) * tmpn++);
-097         \}
-098       \}
-099   
-100       /* now fix carry for next digit, W[ix+1] */
-101       W[ix + 1] += W[ix] >> ((mp_word) DIGIT_BIT);
-102     \}
-103   
-104   
-105     \{
-106       register mp_digit *tmpx;
-107       register mp_word *_W, *_W1;
-108   
-109       /* nox fix rest of carries */
-110       _W1 = W + ix;
-111       _W = W + ++ix;
-112   
-113       for (; ix <= n->used * 2 + 1; ix++) \{
-114         *_W++ += *_W1++ >> ((mp_word) DIGIT_BIT);
-115       \}
-116   
-117       /* copy out, A = A/b**n
-118        *
-119        * The result is A/b**n but instead of converting from an 
-120        * array of mp_word to mp_digit than calling mp_rshd 
-121        * we just copy them in the right order
-122        */
-123       tmpx = x->dp;
-124       _W = W + n->used;
-125   
-126       for (ix = 0; ix < n->used + 1; ix++) \{
-127         *tmpx++ = *_W++ & ((mp_word) MP_MASK);
-128       \}
-129   
-130       /* zero oldused digits, if the input a was larger than
-131        * m->used+1 we'll have to clear the digits */
-132       for (; ix < olduse; ix++) \{
-133         *tmpx++ = 0;
-134       \}
-135     \}
-136   
-137     /* set the max used and clamp */
-138     x->used = n->used + 1;
-139     mp_clamp (x);
-140   
-141     /* if A >= m then A = A - m */
-142     if (mp_cmp_mag (x, n) != MP_LT) \{
-143       return s_mp_sub (x, n, x);
-144     \}
-145     return MP_OKAY;
-146   \}
-\end{alltt}
-\end{small}
-
-The $\hat W$ array is first filled with digits of $x$ on line 49 then the rest of the digits are zeroed on line 54.  Both loops share
-the same alias variables to make the code easier to read.  
-
-The value of $\mu$ is calculated in an interesting fashion.  First the value $\hat W_{ix}$ is reduced modulo $\beta$ and cast to a mp\_digit.  This
-forces the compiler to use a single precision multiplication and prevents any concerns about loss of precision.   Line 101 fixes the carry 
-for the next iteration of the loop by propagating the carry from $\hat W_{ix}$ to $\hat W_{ix+1}$.
-
-The for loop on line 113 propagates the rest of the carries upwards through the columns.  The for loop on line 126 reduces the columns
-modulo $\beta$ and shifts them $k$ places at the same time.  The alias $\_ \hat W$ actually refers to the array $\hat W$ starting at the $n.used$'th
-digit, that is $\_ \hat W_{t} = \hat W_{n.used + t}$.  
-
-\subsection{Montgomery Setup}
-To calculate the variable $\rho$ a relatively simple algorithm will be required.  
-
-\begin{figure}[!here]
-\begin{small}
-\begin{center}
-\begin{tabular}{l}
-\hline Algorithm \textbf{mp\_montgomery\_setup}. \\
-\textbf{Input}.   mp\_int $n$ ($n > 1$ and $(n, 2) = 1$) \\
-\textbf{Output}.  $\rho \equiv -1/n_0 \mbox{ (mod }\beta\mbox{)}$ \\
-\hline \\
-1.  $b \leftarrow n_0$ \\
-2.  If $b$ is even return(\textit{MP\_VAL}) \\
-3.  $x \leftarrow (((b + 2) \mbox{ AND } 4) << 1) + b$ \\
-4.  for $k$ from 0 to $3$ do \\
-\hspace{3mm}4.1  $x \leftarrow x \cdot (2 - bx)$ \\
-5.  $\rho \leftarrow \beta - x \mbox{ (mod }\beta\mbox{)}$ \\
-6.  Return(\textit{MP\_OKAY}). \\
-\hline
-\end{tabular}
-\end{center}
-\end{small}
-\caption{Algorithm mp\_montgomery\_setup} 
-\end{figure}
-
-\textbf{Algorithm mp\_montgomery\_setup.}
-This algorithm will calculate the value of $\rho$ required within the Montgomery reduction algorithms.  It uses a very interesting trick 
-to calculate $1/n_0$ when $\beta$ is a power of two.  
-
-\index{bn\_mp\_montgomery\_setup.c}
-\vspace{+3mm}\begin{small}
-\hspace{-5.1mm}{\bf File}: bn\_mp\_montgomery\_setup.c
-\vspace{-3mm}
-\begin{alltt}
-016   
-017   /* setups the montgomery reduction stuff */
-018   int
-019   mp_montgomery_setup (mp_int * n, mp_digit * rho)
-020   \{
-021     mp_digit x, b;
-022   
-023   /* fast inversion mod 2**k
-024    *
-025    * Based on the fact that
-026    *
-027    * XA = 1 (mod 2**n)  =>  (X(2-XA)) A = 1 (mod 2**2n)
-028    *                    =>  2*X*A - X*X*A*A = 1
-029    *                    =>  2*(1) - (1)     = 1
-030    */
-031     b = n->dp[0];
-032   
-033     if ((b & 1) == 0) \{
-034       return MP_VAL;
-035     \}
-036   
-037     x = (((b + 2) & 4) << 1) + b; /* here x*a==1 mod 2**4 */
-038     x *= 2 - b * x;               /* here x*a==1 mod 2**8 */
-039   #if !defined(MP_8BIT)
-040     x *= 2 - b * x;               /* here x*a==1 mod 2**16 */
-041   #endif
-042   #if defined(MP_64BIT) || !(defined(MP_8BIT) || defined(MP_16BIT))
-043     x *= 2 - b * x;               /* here x*a==1 mod 2**32 */
-044   #endif
-045   #ifdef MP_64BIT
-046     x *= 2 - b * x;               /* here x*a==1 mod 2**64 */
-047   #endif
-048   
-049     /* rho = -1/m mod b */
-050     *rho = (((mp_digit) 1 << ((mp_digit) DIGIT_BIT)) - x) & MP_MASK;
-051   
-052     return MP_OKAY;
-053   \}
-\end{alltt}
-\end{small}
-
-This source code computes the value of $\rho$ required to perform Montgomery reduction.  It has been modified to avoid performing excess
-multiplications when $\beta$ is not the default 28-bits.  
-
-\section{The Diminished Radix Algorithm}
-The Diminished Radix method of modular reduction \cite{DRMET} is a fairly clever technique which can be more efficient than either the Barrett
-or Montgomery methods for certain forms of moduli.  The technique is based on the following simple congruence.
-
-\begin{equation}
-(x \mbox{ mod } n) + k \lfloor x / n \rfloor \equiv x \mbox{ (mod }(n - k)\mbox{)}
-\end{equation}
-
-This observation was used in the MMB \cite{MMB} block cipher to create a diffusion primitive.  It used the fact that if $n = 2^{31}$ and $k=1$ 
-then an x86 multiplier could produce the 62-bit product and use the ``shrd'' instruction to perform a double-precision right shift.  The proof
-of the above equation is very simple.  First write $x$ in the product form.
-
-\begin{equation}
-x = qn + r
-\end{equation}
-
-Now reduce both sides modulo $(n - k)$.
-
-\begin{equation}
-x \equiv qk + r  \mbox{ (mod }(n-k)\mbox{)}
-\end{equation}
-
-The variable $n$ reduces modulo $n - k$ to $k$.  By putting $q = \lfloor x/n \rfloor$ and $r = x \mbox{ mod } n$ 
-into the equation the original congruence is reproduced, thus concluding the proof.  The following algorithm is based on this observation.
-
-\begin{figure}[!here]
-\begin{small}
-\begin{center}
-\begin{tabular}{l}
-\hline Algorithm \textbf{Diminished Radix Reduction}. \\
-\textbf{Input}.   Integer $x$, $n$, $k$ \\
-\textbf{Output}.  $x \mbox{ mod } (n - k)$ \\
-\hline \\
-1.  $q \leftarrow \lfloor x / n \rfloor$ \\
-2.  $q \leftarrow k \cdot q$ \\
-3.  $x \leftarrow x \mbox{ (mod }n\mbox{)}$ \\
-4.  $x \leftarrow x + q$ \\
-5.  If $x \ge (n - k)$ then \\
-\hspace{3mm}5.1  $x \leftarrow x - (n - k)$ \\
-\hspace{3mm}5.2  Goto step 1. \\
-6.  Return $x$ \\
-\hline
-\end{tabular}
-\end{center}
-\end{small}
-\caption{Algorithm Diminished Radix Reduction}
-\label{fig:DR}
-\end{figure}
-
-This algorithm will reduce $x$ modulo $n - k$ and return the residue.  If $0 \le x < (n - k)^2$ then the algorithm will loop almost always
-once or twice and occasionally three times.  For simplicity sake the value of $x$ is bounded by the following simple polynomial.
-
-\begin{equation} 
-0 \le x < n^2 + k^2 - 2nk
-\end{equation}
-
-The true bound is  $0 \le x < (n - k - 1)^2$ but this has quite a few more terms.  The value of $q$ after step 1 is bounded by the following.
-
-\begin{equation}
-q < n - 2k - k^2/n
-\end{equation}
-
-Since $k^2$ is going to be considerably smaller than $n$ that term will always be zero.  The value of $x$ after step 3 is bounded trivially as
-$0 \le x < n$.  By step four the sum $x + q$ is bounded by 
-
-\begin{equation}
-0 \le q + x < (k + 1)n - 2k^2 - 1
-\end{equation}
-
-With a second pass $q$ will be loosely bounded by $0 \le q < k^2$ after step 2 while $x$ will still be loosely bounded by $0 \le x < n$ after step 3.  After the second pass it is highly unlikely that the
-sum in step 4 will exceed $n - k$.  In practice fewer than three passes of the algorithm are required to reduce virtually every input in the 
-range $0 \le x < (n - k - 1)^2$.  
-
-\begin{figure}
-\begin{small}
-\begin{center}
-\begin{tabular}{|l|}
-\hline
-$x = 123456789, n = 256, k = 3$ \\
-\hline $q \leftarrow \lfloor x/n \rfloor = 482253$ \\
-$q \leftarrow q*k = 1446759$ \\
-$x \leftarrow x \mbox{ mod } n = 21$ \\
-$x \leftarrow x + q = 1446780$ \\
-$x \leftarrow x - (n - k) = 1446527$ \\
-\hline 
-$q \leftarrow \lfloor x/n \rfloor = 5650$ \\
-$q \leftarrow q*k = 16950$ \\
-$x \leftarrow x \mbox{ mod } n = 127$ \\
-$x \leftarrow x + q = 17077$ \\
-$x \leftarrow x - (n - k) = 16824$ \\
-\hline 
-$q \leftarrow \lfloor x/n \rfloor = 65$ \\
-$q \leftarrow q*k = 195$ \\
-$x \leftarrow x \mbox{ mod } n = 184$ \\
-$x \leftarrow x + q = 379$ \\
-$x \leftarrow x - (n - k) = 126$ \\
-\hline
-\end{tabular}
-\end{center}
-\end{small}
-\caption{Example Diminished Radix Reduction}
-\label{fig:EXDR}
-\end{figure}
-
-Figure~\ref{fig:EXDR} demonstrates the reduction of $x = 123456789$ modulo $n - k = 253$ when $n = 256$ and $k = 3$.  Note that even while $x$
-is considerably larger than $(n - k - 1)^2 = 63504$ the algorithm still converges on the modular residue exceedingly fast.  In this case only
-three passes were required to find the residue $x \equiv 126$.
-
-
-\subsection{Choice of Moduli}
-On the surface this algorithm looks like a very expensive algorithm.  It requires a couple of subtractions followed by multiplication and other
-modular reductions.  The usefulness of this algorithm becomes exceedingly clear when an appropriate modulus is chosen.
-
-Division in general is a very expensive operation to perform.  The one exception is when the division is by a power of the radix of representation used.  
-Division by ten for example is simple for pencil and paper mathematics since it amounts to shifting the decimal place to the right.  Similarly division 
-by two (\textit{or powers of two}) is very simple for binary computers to perform.  It would therefore seem logical to choose $n$ of the form $2^p$ 
-which would imply that $\lfloor x / n \rfloor$ is a simple shift of $x$ right $p$ bits.  
-
-However, there is one operation related to division by powers of two that is even faster than this.  If $n = \beta^p$ then the division may be 
-performed by moving whole digits to the right $p$ places.  In practice division by $\beta^p$ is much faster than division by $2^p$ for any $p$.  
-Also with the choice of $n = \beta^p$ reducing $x$ modulo $n$ requires zeroing the digits above the $p-1$'th digit of $x$.  
-
-Throughout the next section the term ``restricted modulus'' will refer to a modulus of the form $\beta^p - k$ whereas the term ``unrestricted
-modulus'' will refer to a modulus of the form $2^p - k$.  The word ``restricted'' in this case refers to the fact that it is based on the 
-$2^p$ logic except $p$ must be a multiple of $lg(\beta)$.  
-
-\subsection{Choice of $k$}
-Now that division and reduction (\textit{step 1 and 3 of figure~\ref{fig:DR}}) have been optimized to simple digit operations the multiplication by $k$
-in step 2 is the most expensive operation.  Fortunately the choice of $k$ is not terribly limited.  For all intents and purposes it might
-as well be a single digit.  The smaller the value of $k$ is the faster the algorithm will be.  
-
-\subsection{Restricted Diminished Radix Reduction}
-The restricted Diminished Radix algorithm can quickly reduce an input modulo a modulus of the form $n = \beta^p - k$.  This algorithm can reduce 
-an input $x$ within the range $0 \le x < n^2$ using only a couple passes of the algorithm demonstrated in figure~\ref{fig:DR}.  The implementation
-of this algorithm has been optimized to avoid additional overhead associated with a division by $\beta^p$, the multiplication by $k$ or the addition 
-of $x$ and $q$.  The resulting algorithm is very efficient and can lead to substantial improvements over Barrett and Montgomery reduction when modular 
-exponentiations are performed.
-
-\newpage\begin{figure}[!here]
-\begin{small}
-\begin{center}
-\begin{tabular}{l}
-\hline Algorithm \textbf{mp\_dr\_reduce}. \\
-\textbf{Input}.   mp\_int $x$, $n$ and a mp\_digit $k = \beta - n_0$ \\
-\hspace{11.5mm}($0 \le x < n^2$, $n > 1$, $0 < k < \beta$) \\
-\textbf{Output}.  $x \mbox{ mod } n$ \\
-\hline \\
-1.  $m \leftarrow n.used$ \\
-2.  If $x.alloc < 2m$ then grow $x$ to $2m$ digits. \\
-3.  $\mu \leftarrow 0$ \\
-4.  for $i$ from $0$ to $m - 1$ do \\
-\hspace{3mm}4.1  $\hat r \leftarrow k \cdot x_{m+i} + x_{i} + \mu$ \\
-\hspace{3mm}4.2  $x_{i} \leftarrow \hat r \mbox{ (mod }\beta\mbox{)}$ \\
-\hspace{3mm}4.3  $\mu \leftarrow \lfloor \hat r / \beta \rfloor$ \\
-5.  $x_{m} \leftarrow \mu$ \\
-6.  for $i$ from $m + 1$ to $x.used - 1$ do \\
-\hspace{3mm}6.1  $x_{i} \leftarrow 0$ \\
-7.  Clamp excess digits of $x$. \\
-8.  If $x \ge n$ then \\
-\hspace{3mm}8.1  $x \leftarrow x - n$ \\
-\hspace{3mm}8.2  Goto step 3. \\
-9.  Return(\textit{MP\_OKAY}). \\
-\hline
-\end{tabular}
-\end{center}
-\end{small}
-\caption{Algorithm mp\_dr\_reduce}
-\end{figure}
-
-\textbf{Algorithm mp\_dr\_reduce.}
-This algorithm will perform the Diminished Radix reduction of $x$ modulo $n$.  It has similar restrictions to that of the Barrett reduction
-with the addition that $n$ must be of the form $n = \beta^m - k$ where $0 < k <\beta$.  
-
-This algorithm essentially implements the pseudo-code in figure~\ref{fig:DR} except with a slight optimization.  The division by $\beta^m$, multiplication by $k$
-and addition of $x \mbox{ mod }\beta^m$ are all performed simultaneously inside the loop on step 4.  The division by $\beta^m$ is emulated by accessing
-the term at the $m+i$'th position which is subsequently multiplied by $k$ and added to the term at the $i$'th position.  After the loop the $m$'th
-digit is set to the carry and the upper digits are zeroed.  Steps 5 and 6 emulate the reduction modulo $\beta^m$ that should have happened to 
-$x$ before the addition of the multiple of the upper half.  
-
-At step 8 if $x$ is still larger than $n$ another pass of the algorithm is required.  First $n$ is subtracted from $x$ and then the algorithm resumes
-at step 3.  
-
-\index{bn\_mp\_dr\_reduce.c}
-\vspace{+3mm}\begin{small}
-\hspace{-5.1mm}{\bf File}: bn\_mp\_dr\_reduce.c
-\vspace{-3mm}
-\begin{alltt}
-016   
-017   /* reduce "x" in place modulo "n" using the Diminished Radix algorithm.
-018    *
-019    * Based on algorithm from the paper
-020    *
-021    * "Generating Efficient Primes for Discrete Log Cryptosystems"
-022    *                 Chae Hoon Lim, Pil Loong Lee,
-023    *          POSTECH Information Research Laboratories
-024    *
-025    * The modulus must be of a special format [see manual]
-026    *
-027    * Has been modified to use algorithm 7.10 from the LTM book instead
-028    */
-029   int
-030   mp_dr_reduce (mp_int * x, mp_int * n, mp_digit k)
-031   \{
-032     int      err, i, m;
-033     mp_word  r;
-034     mp_digit mu, *tmpx1, *tmpx2;
-035     
-036     /* m = digits in modulus */
-037     m = n->used;
-038     
-039     /* ensure that "x" has at least 2m digits */
-040     if (x->alloc < m + m) \{
-041       if ((err = mp_grow (x, m + m)) != MP_OKAY) \{
-042         return err;
-043       \}
-044     \}
-045   
-046   /* top of loop, this is where the code resumes if 
-047    * another reduction pass is required.
-048    */
-049   top:
-050     /* aliases for digits */
-051     /* alias for lower half of x */
-052     tmpx1 = x->dp;
-053     
-054     /* alias for upper half of x, or x/B**m */
-055     tmpx2 = x->dp + m;
-056     
-057     /* set carry to zero */
-058     mu = 0;
-059     
-060     /* compute (x mod B**m) + mp * [x/B**m] inline and inplace */
-061     for (i = 0; i < m; i++) \{
-062         r         = ((mp_word)*tmpx2++) * ((mp_word)k) + *tmpx1 + mu;
-063         *tmpx1++  = r & MP_MASK;
-064         mu        = r >> ((mp_word)DIGIT_BIT);
-065     \}
-066     
-067     /* set final carry */
-068     *tmpx1++ = mu;
-069     
-070     /* zero words above m */
-071     for (i = m + 1; i < x->used; i++) \{
-072         *tmpx1++ = 0;
-073     \}
-074   
-075     /* clamp, sub and return */
-076     mp_clamp (x);
-077   
-078     /* if x >= n then subtract and reduce again 
-079      * Each successive "recursion" makes the input smaller and smaller.
-080      */
-081     if (mp_cmp_mag (x, n) != MP_LT) \{
-082       s_mp_sub(x, n, x);
-083       goto top;
-084     \}
-085     return MP_OKAY;
-086   \}
-\end{alltt}
-\end{small}
-
-The first step is to grow $x$ as required to $2m$ digits since the reduction is performed in place on $x$.  The label on line 49 is where
-the algorithm will resume if further reduction passes are required.  In theory it could be placed at the top of the function however, the size of
-the modulus and question of whether $x$ is large enough are invariant after the first pass meaning that it would be a waste of time.  
-
-The aliases $tmpx1$ and $tmpx2$ refer to the digits of $x$ where the latter is offset by $m$ digits.  By reading digits from $x$ offset by $m$ digits
-a division by $\beta^m$ can be simulated virtually for free.  The loop on line 61 performs the bulk of the work (\textit{corresponds to step 4 of algorithm 7.11})
-in this algorithm.
-
-By line 68 the pointer $tmpx1$ points to the $m$'th digit of $x$ which is where the final carry will be placed.  Similarly by line 71 the 
-same pointer will point to the $m+1$'th digit where the zeroes will be placed.  
-
-Since the algorithm is only valid if both $x$ and $n$ are greater than zero an unsigned comparison suffices to determine if another pass is required.  
-With the same logic at line 82 the value of $x$ is known to be greater than or equal to $n$ meaning that an unsigned subtraction can be used
-as well.  Since the destination of the subtraction is the larger of the inputs the call to algorithm s\_mp\_sub cannot fail and the return code
-does not need to be checked.
-
-\subsubsection{Setup}
-To setup the restricted Diminished Radix algorithm the value $k = \beta - n_0$ is required.  This algorithm is not really complicated but provided for
-completeness.
-
-\begin{figure}[!here]
-\begin{small}
-\begin{center}
-\begin{tabular}{l}
-\hline Algorithm \textbf{mp\_dr\_setup}. \\
-\textbf{Input}.   mp\_int $n$ \\
-\textbf{Output}.  $k = \beta - n_0$ \\
-\hline \\
-1.  $k \leftarrow \beta - n_0$ \\
-\hline
-\end{tabular}
-\end{center}
-\end{small}
-\caption{Algorithm mp\_dr\_setup}
-\end{figure}
-
-\index{bn\_mp\_dr\_setup.c}
-\vspace{+3mm}\begin{small}
-\hspace{-5.1mm}{\bf File}: bn\_mp\_dr\_setup.c
-\vspace{-3mm}
-\begin{alltt}
-016   
-017   /* determines the setup value */
-018   void mp_dr_setup(mp_int *a, mp_digit *d)
-019   \{
-020      /* the casts are required if DIGIT_BIT is one less than
-021       * the number of bits in a mp_digit [e.g. DIGIT_BIT==31]
-022       */
-023      *d = (mp_digit)((((mp_word)1) << ((mp_word)DIGIT_BIT)) - 
-024           ((mp_word)a->dp[0]));
-025   \}
-026   
-\end{alltt}
-\end{small}
-
-\subsubsection{Modulus Detection}
-Another algorithm which will be useful is the ability to detect a restricted Diminished Radix modulus.  An integer is said to be
-of restricted Diminished Radix form if all of the digits are equal to $\beta - 1$ except the trailing digit which may be any value.
-
-\begin{figure}[!here]
-\begin{small}
-\begin{center}
-\begin{tabular}{l}
-\hline Algorithm \textbf{mp\_dr\_is\_modulus}. \\
-\textbf{Input}.   mp\_int $n$ \\
-\textbf{Output}.  $1$ if $n$ is in D.R form, $0$ otherwise \\
-\hline
-1.  If $n.used < 2$ then return($0$). \\
-2.  for $ix$ from $1$ to $n.used - 1$ do \\
-\hspace{3mm}2.1  If $n_{ix} \ne \beta - 1$ return($0$). \\
-3.  Return($1$). \\
-\hline
-\end{tabular}
-\end{center}
-\end{small}
-\caption{Algorithm mp\_dr\_is\_modulus}
-\end{figure}
-
-\textbf{Algorithm mp\_dr\_is\_modulus.}
-This algorithm determines if a value is in Diminished Radix form.  Step 1 rejects obvious cases where fewer than two digits are
-in the mp\_int.  Step 2 tests all but the first digit to see if they are equal to $\beta - 1$.  If the algorithm manages to get to
-step 3 then $n$ must be of Diminished Radix form.
-
-\index{bn\_mp\_dr\_is\_modulus.c}
-\vspace{+3mm}\begin{small}
-\hspace{-5.1mm}{\bf File}: bn\_mp\_dr\_is\_modulus.c
-\vspace{-3mm}
-\begin{alltt}
-016   
-017   /* determines if a number is a valid DR modulus */
-018   int mp_dr_is_modulus(mp_int *a)
-019   \{
-020      int ix;
-021   
-022      /* must be at least two digits */
-023      if (a->used < 2) \{
-024         return 0;
-025      \}
-026   
-027      for (ix = 1; ix < a->used; ix++) \{
-028          if (a->dp[ix] != MP_MASK) \{
-029             return 0;
-030          \}
-031      \}
-032      return 1;
-033   \}
-034   
-\end{alltt}
-\end{small}
-
-\subsection{Unrestricted Diminished Radix Reduction}
-The unrestricted Diminished Radix algorithm allows modular reductions to be performed when the modulus is of the form $2^p - k$.  This algorithm
-is a straightforward adaptation of algorithm~\ref{fig:DR}.
-
-In general the restricted Diminished Radix reduction algorithm is much faster since it has considerably lower overhead.  However, this new
-algorithm is much faster than either Montgomery or Barrett reduction when the moduli are of the appropriate form.
-
-\begin{figure}[!here]
-\begin{small}
-\begin{center}
-\begin{tabular}{l}
-\hline Algorithm \textbf{mp\_reduce\_2k}. \\
-\textbf{Input}.   mp\_int $a$ and $n$.  mp\_digit $k$  \\
-\hspace{11.5mm}($a \ge 0$, $n > 1$, $0 < k < \beta$, $n + k$ is a power of two) \\
-\textbf{Output}.  $a \mbox{ (mod }n\mbox{)}$ \\
-\hline
-1.  $p \leftarrow \lceil lg(n) \rceil$  (\textit{mp\_count\_bits}) \\
-2.  While $a \ge n$ do \\
-\hspace{3mm}2.1  $q \leftarrow \lfloor a / 2^p \rfloor$ (\textit{mp\_div\_2d}) \\
-\hspace{3mm}2.2  $a \leftarrow a \mbox{ (mod }2^p\mbox{)}$ (\textit{mp\_mod\_2d}) \\
-\hspace{3mm}2.3  $q \leftarrow q \cdot k$ (\textit{mp\_mul\_d}) \\
-\hspace{3mm}2.4  $a \leftarrow a + q$ (\textit{s\_mp\_add}) \\
-\hspace{3mm}2.5  If $a \ge n$ then do \\
-\hspace{6mm}2.5.1  $a \leftarrow a - n$ \\
-3.  Return(\textit{MP\_OKAY}). \\
-\hline
-\end{tabular}
-\end{center}
-\end{small}
-\caption{Algorithm mp\_reduce\_2k}
-\end{figure}
-
-\textbf{Algorithm mp\_reduce\_2k.}
-This algorithm quickly reduces an input $a$ modulo an unrestricted Diminished Radix modulus $n$.  Division by $2^p$ is emulated with a right
-shift which makes the algorithm fairly inexpensive to use.  
-
-\index{bn\_mp\_reduce\_2k.c}
-\vspace{+3mm}\begin{small}
-\hspace{-5.1mm}{\bf File}: bn\_mp\_reduce\_2k.c
-\vspace{-3mm}
-\begin{alltt}
-016   
-017   /* reduces a modulo n where n is of the form 2**p - k */
-018   int
-019   mp_reduce_2k(mp_int *a, mp_int *n, mp_digit k)
-020   \{
-021      mp_int q;
-022      int    p, res;
-023      
-024      if ((res = mp_init(&q)) != MP_OKAY) \{
-025         return res;
-026      \}
-027      
-028      p = mp_count_bits(n);    
-029   top:
-030      /* q = a/2**p, a = a mod 2**p */
-031      if ((res = mp_div_2d(a, p, &q, a)) != MP_OKAY) \{
-032         goto ERR;
-033      \}
-034      
-035      if (k != 1) \{
-036         /* q = q * k */
-037         if ((res = mp_mul_d(&q, k, &q)) != MP_OKAY) \{ 
-038            goto ERR;
-039         \}
-040      \}
-041      
-042      /* a = a + q */
-043      if ((res = s_mp_add(a, &q, a)) != MP_OKAY) \{
-044         goto ERR;
-045      \}
-046      
-047      if (mp_cmp_mag(a, n) != MP_LT) \{
-048         s_mp_sub(a, n, a);
-049         goto top;
-050      \}
-051      
-052   ERR:
-053      mp_clear(&q);
-054      return res;
-055   \}
-056   
-\end{alltt}
-\end{small}
-
-The algorithm mp\_count\_bits calculates the number of bits in an mp\_int which is used to find the initial value of $p$.  The call to mp\_div\_2d
-on line 31 calculates both the quotient $q$ and the remainder $a$ required.  By doing both in a single function call the code size
-is kept fairly small.  The multiplication by $k$ is only performed if $k > 1$. This allows reductions modulo $2^p - 1$ to be performed without
-any multiplications.  
-
-The unsigned s\_mp\_add, mp\_cmp\_mag and s\_mp\_sub are used in place of their full sign counterparts since the inputs are only valid if they are 
-positive.  By using the unsigned versions the overhead is kept to a minimum.  
-
-\subsubsection{Unrestricted Setup}
-To setup this reduction algorithm the value of $k = 2^p - n$ is required.  
-
-\begin{figure}[!here]
-\begin{small}
-\begin{center}
-\begin{tabular}{l}
-\hline Algorithm \textbf{mp\_reduce\_2k\_setup}. \\
-\textbf{Input}.   mp\_int $n$   \\
-\textbf{Output}.  $k = 2^p - n$ \\
-\hline
-1.  $p \leftarrow \lceil lg(n) \rceil$  (\textit{mp\_count\_bits}) \\
-2.  $x \leftarrow 2^p$ (\textit{mp\_2expt}) \\
-3.  $x \leftarrow x - n$ (\textit{mp\_sub}) \\
-4.  $k \leftarrow x_0$ \\
-5.  Return(\textit{MP\_OKAY}). \\
-\hline
-\end{tabular}
-\end{center}
-\end{small}
-\caption{Algorithm mp\_reduce\_2k\_setup}
-\end{figure}
-
-\textbf{Algorithm mp\_reduce\_2k\_setup.}
-This algorithm computes the value of $k$ required for the algorithm mp\_reduce\_2k.  By making a temporary variable $x$ equal to $2^p$ a subtraction
-is sufficient to solve for $k$.  Alternatively if $n$ has more than one digit the value of $k$ is simply $\beta - n_0$.  
-
-\index{bn\_mp\_reduce\_2k\_setup.c}
-\vspace{+3mm}\begin{small}
-\hspace{-5.1mm}{\bf File}: bn\_mp\_reduce\_2k\_setup.c
-\vspace{-3mm}
-\begin{alltt}
-016   
-017   /* determines the setup value */
-018   int 
-019   mp_reduce_2k_setup(mp_int *a, mp_digit *d)
-020   \{
-021      int res, p;
-022      mp_int tmp;
-023      
-024      if ((res = mp_init(&tmp)) != MP_OKAY) \{
-025         return res;
-026      \}
-027      
-028      p = mp_count_bits(a);
-029      if ((res = mp_2expt(&tmp, p)) != MP_OKAY) \{
-030         mp_clear(&tmp);
-031         return res;
-032      \}
-033      
-034      if ((res = s_mp_sub(&tmp, a, &tmp)) != MP_OKAY) \{
-035         mp_clear(&tmp);
-036         return res;
-037      \}
-038      
-039      *d = tmp.dp[0];
-040      mp_clear(&tmp);
-041      return MP_OKAY;
-042   \}
-\end{alltt}
-\end{small}
-
-\subsubsection{Unrestricted Detection}
-An integer $n$ is a valid unrestricted Diminished Radix modulus if either of the following are true.
-
-\begin{enumerate}
-\item  The number has only one digit.
-\item  The number has more than one digit and every bit from the $lg(\beta)$'th to the most significant is one.
-\end{enumerate}
-
-If either condition is true then there is a power of two, namely $2^p$, such that $0 < 2^p - n < \beta$.   If the input is only
-one digit then it will always be of the correct form.  Otherwise all of the bits above the first digit must be one.  This arises from the fact
-that there will be a value of $k$ that when added to the modulus causes a carry in the first digit which propagates all the way to the most
-significant bit.  The resulting sum will be a power of two.
-
-\begin{figure}[!here]
-\begin{small}
-\begin{center}
-\begin{tabular}{l}
-\hline Algorithm \textbf{mp\_reduce\_is\_2k}. \\
-\textbf{Input}.   mp\_int $n$   \\
-\textbf{Output}.  $1$ if of proper form, $0$ otherwise \\
-\hline
-1.  If $n.used = 0$ then return($0$). \\
-2.  If $n.used = 1$ then return($1$). \\
-3.  $p \leftarrow \lceil lg(n) \rceil$  (\textit{mp\_count\_bits}) \\
-4.  for $x$ from $lg(\beta)$ to $p - 1$ do \\
-\hspace{3mm}4.1  If the ($x \mbox{ mod }lg(\beta)$)'th bit of the $\lfloor x / lg(\beta) \rfloor$'th digit of $n$ is zero then return($0$). \\
-5.  Return($1$). \\
-\hline
-\end{tabular}
-\end{center}
-\end{small}
-\caption{Algorithm mp\_reduce\_is\_2k}
-\end{figure}
-
-\textbf{Algorithm mp\_reduce\_is\_2k.}
-This algorithm quickly determines if a modulus is of the form required for algorithm mp\_reduce\_2k to function properly.  
-
-\index{bn\_mp\_reduce\_is\_2k.c}
-\vspace{+3mm}\begin{small}
-\hspace{-5.1mm}{\bf File}: bn\_mp\_reduce\_is\_2k.c
-\vspace{-3mm}
-\begin{alltt}
-016   
-017   /* determines if mp_reduce_2k can be used */
-018   int 
-019   mp_reduce_is_2k(mp_int *a)
-020   \{
-021      int ix, iy;
-022      
-023      if (a->used == 0) \{
-024         return 0;
-025      \} else if (a->used == 1) \{
-026         return 1;
-027      \} else if (a->used > 1) \{
-028         iy = mp_count_bits(a);
-029         for (ix = DIGIT_BIT; ix < iy; ix++) \{
-030             if ((a->dp[ix/DIGIT_BIT] & 
-031                 ((mp_digit)1 << (mp_digit)(ix % DIGIT_BIT))) == 0) \{
-032                return 0;
-033             \}
-034         \}
-035      \}
-036      return 1;
-037   \}
-038   
-\end{alltt}
-\end{small}
-
-
-
-\section{Algorithm Comparison}
-So far three very different algorithms for modular reduction have been discussed.  Each of the algorithms has its own strengths and weaknesses
-that make having such a selection very useful.  The following table summarizes the three algorithms along with comparisons of work factors.  Since
-all three algorithms have the restriction that $0 \le x < n^2$ and $n > 1$ those limitations are not included in the table.  
-
-\begin{center}
-\begin{small}
-\begin{tabular}{|c|c|c|c|c|c|}
-\hline \textbf{Method} & \textbf{Work Required} & \textbf{Limitations} & \textbf{$m = 8$} & \textbf{$m = 32$} & \textbf{$m = 64$} \\
-\hline Barrett    & $m^2 + 2m - 1$ & None              & $79$ & $1087$ & $4223$ \\
-\hline Montgomery & $m^2 + m$      & $n$ must be odd   & $72$ & $1056$ & $4160$ \\
-\hline D.R.       & $2m$           & $n = \beta^m - k$ & $16$ & $64$   & $128$  \\
-\hline
-\end{tabular}
-\end{small}
-\end{center}
-
-In theory Montgomery and Barrett reductions would require roughly the same amount of time to complete.  However, in practice since Montgomery
-reduction can be written as a single function with the Comba technique it is much faster.  Barrett reduction suffers from the overhead of
-calling the half precision multipliers, addition and division by $\beta$ algorithms.
-
-For almost every cryptographic algorithm Montgomery reduction is the algorithm of choice.  The one set of algorithms where Diminished Radix reduction truly
-shines are based on the discrete logarithm problem such as Diffie-Hellman \cite{DH} and ElGamal \cite{ELGAMAL}.  In these algorithms
-primes of the form $\beta^m - k$ can be found and shared amongst users.  These primes will allow the Diminished Radix algorithm to be used in
-modular exponentiation to greatly speed up the operation.
-
-
-
-\section*{Exercises}
-\begin{tabular}{cl}
-$\left [ 3 \right ]$ & Prove that the ``trick'' in algorithm mp\_montgomery\_setup actually \\
-                     & calculates the correct value of $\rho$. \\
-                     & \\
-$\left [ 2 \right ]$ & Devise an algorithm to reduce modulo $n + k$ for small $k$ quickly.  \\
-                     & \\
-$\left [ 4 \right ]$ & Prove that the pseudo-code algorithm ``Diminished Radix Reduction'' \\
-                     & (\textit{figure~\ref{fig:DR}}) terminates.  Also prove the probability that it will \\
-                     & terminate within $1 \le k \le 10$ iterations. \\
-                     & \\
-\end{tabular}                     
-
-
-\chapter{Exponentiation}
-Exponentiation is the operation of raising one variable to the power of another, for example, $a^b$.  A variant of exponentiation, computed
-in a finite field or ring, is called modular exponentiation.  This latter style of operation is typically used in public key 
-cryptosystems such as RSA and Diffie-Hellman.  The ability to quickly compute modular exponentiations is of great benefit to any
-such cryptosystem and many methods have been sought to speed it up.
-
-\section{Exponentiation Basics}
-A trivial algorithm would simply multiply $a$ against itself $b - 1$ times to compute the exponentiation desired.  However, as $b$ grows in size
-the number of multiplications becomes prohibitive.  Imagine what would happen if $b$ $\approx$ $2^{1024}$ as is the case when computing an RSA signature
-with a $1024$-bit key.  Such a calculation could never be completed as it would take simply far too long.
-
-Fortunately there is a very simple algorithm based on the laws of exponents.  Recall that $lg_a(a^b) = b$ and that $lg_a(a^ba^c) = b + c$ which
-are two trivial relationships between the base and the exponent.  Let $b_i$ represent the $i$'th bit of $b$ starting from the least 
-significant bit.  If $b$ is a $k$-bit integer then the following equation is true.
-
-\begin{equation}
-a^b = \prod_{i=0}^{k-1} a^{2^i \cdot b_i}
-\end{equation}
-
-By taking the base $a$ logarithm of both sides of the equation the following equation is the result.
-
-\begin{equation}
-b = \sum_{i=0}^{k-1}2^i \cdot b_i
-\end{equation}
-
-The term $a^{2^i}$ can be found from the $i - 1$'th term by squaring the term since $\left ( a^{2^i} \right )^2$ is equal to
-$a^{2^{i+1}}$.  This observation forms the basis of essentially all fast exponentiation algorithms.  It requires $k$ squarings and on average
-$k \over 2$ multiplications to compute the result.  This is indeed quite an improvement over simply multiplying by $a$ a total of $b-1$ times.
-
-While this current method is a considerable speed up there are further improvements to be made.  For example, the $a^{2^i}$ term does not need to 
-be computed in an auxiliary variable.  Consider the following equivalent algorithm.
-
-\begin{figure}[!here]
-\begin{small}
-\begin{center}
-\begin{tabular}{l}
-\hline Algorithm \textbf{Left to Right Exponentiation}. \\
-\textbf{Input}.   Integer $a$, $b$ and $k$ \\
-\textbf{Output}.  $c = a^b$ \\
-\hline \\
-1.  $c \leftarrow 1$ \\
-2.  for $i$ from $k - 1$ to $0$ do \\
-\hspace{3mm}2.1  $c \leftarrow c^2$ \\
-\hspace{3mm}2.2  $c \leftarrow c \cdot a^{b_i}$ \\
-3.  Return $c$. \\
-\hline
-\end{tabular}
-\end{center}
-\end{small}
-\caption{Left to Right Exponentiation}
-\label{fig:LTOR}
-\end{figure}
-
-This algorithm starts from the most significant bit and works towards the least significant bit.  When the $i$'th bit of $b$ is set $a$ is
-multiplied against the current product.  In each iteration the product is squared which doubles the exponent of the individual terms of the
-product.  
-
-For example, let $b = 101100_2 \equiv 44_{10}$.  The following chart demonstrates the actions of the algorithm.
-
-\newpage\begin{figure}
-\begin{center}
-\begin{tabular}{|c|c|}
-\hline \textbf{Value of $i$} & \textbf{Value of $c$} \\
-\hline - & $1$ \\
-\hline $5$ & $a$ \\
-\hline $4$ & $a^2$ \\
-\hline $3$ & $a^4 \cdot a$ \\
-\hline $2$ & $a^8 \cdot a^2 \cdot a$ \\
-\hline $1$ & $a^{16} \cdot a^4 \cdot a^2$ \\
-\hline $0$ & $a^{32} \cdot a^8 \cdot a^4$ \\
-\hline
-\end{tabular}
-\end{center}
-\caption{Example of Left to Right Exponentiation}
-\end{figure}
-
-When the product $a^{32} \cdot a^8 \cdot a^4$ is simplified it is equal to $a^{44}$ which is the desired exponentiation.  This particular algorithm is 
-called ``Left to Right'' because it reads the exponent in that order.  All of the exponentiation algorithms that will be presented are of this nature.  
-
-\subsection{Single Digit Exponentiation}
-The first algorithm in the series of exponentiation algorithms will be an unbounded algorithm where the exponent is a single digit.  It is intended 
-to be used when a small power of an input is required (\textit{e.g. $a^5$}).  It is faster than simply multiplying $b - 1$ times for all values of 
-$b$ that are greater than three.  
-
-\newpage\begin{figure}[!here]
-\begin{small}
-\begin{center}
-\begin{tabular}{l}
-\hline Algorithm \textbf{mp\_expt\_d}. \\
-\textbf{Input}.   mp\_int $a$ and mp\_digit $b$ \\
-\textbf{Output}.  $c = a^b$ \\
-\hline \\
-1.  $g \leftarrow a$ (\textit{mp\_init\_copy}) \\
-2.  $c \leftarrow 1$ (\textit{mp\_set}) \\
-3.  for $x$ from 1 to $lg(\beta)$ do \\
-\hspace{3mm}3.1  $c \leftarrow c^2$ (\textit{mp\_sqr}) \\
-\hspace{3mm}3.2  If $b$ AND $2^{lg(\beta) - 1} \ne 0$ then \\
-\hspace{6mm}3.2.1  $c \leftarrow c \cdot g$ (\textit{mp\_mul}) \\
-\hspace{3mm}3.3  $b \leftarrow b << 1$ \\
-4.  Clear $g$. \\
-5.  Return(\textit{MP\_OKAY}). \\
-\hline
-\end{tabular}
-\end{center}
-\end{small}
-\caption{Algorithm mp\_expt\_d}
-\end{figure}
-
-\textbf{Algorithm mp\_expt\_d.}
-This algorithm computes the value of $a$ raised to the power of a single digit $b$.  It uses the left to right exponentiation algorithm to
-quickly compute the exponentiation.  It is loosely based on algorithm 14.79 of HAC \cite[pp. 615]{HAC} with the difference that the 
-exponent is a fixed width.  
-
-A copy of $a$ is made first to allow the destination variable $c$ to be the same as the source variable $a$.  The result is set to the initial value of 
-$1$ in the subsequent step.
-
-Inside the loop the exponent is read from the most significant bit first down to the least significant bit.  First $c$ is invariably squared
-on step 3.1.  In the following step if the most significant bit of $b$ is one the copy of $a$ is multiplied against $c$.  The value
-of $b$ is shifted left one bit to make the next bit down from the most significant bit the new most significant bit.  In effect each
-iteration of the loop moves the bits of the exponent $b$ upwards to the most significant location.
-
-\index{bn\_mp\_expt\_d.c}
-\vspace{+3mm}\begin{small}
-\hspace{-5.1mm}{\bf File}: bn\_mp\_expt\_d.c
-\vspace{-3mm}
-\begin{alltt}
-016   
-017   /* calculate c = a**b  using a square-multiply algorithm */
-018   int
-019   mp_expt_d (mp_int * a, mp_digit b, mp_int * c)
-020   \{
-021     int     res, x;
-022     mp_int  g;
-023   
-024     if ((res = mp_init_copy (&g, a)) != MP_OKAY) \{
-025       return res;
-026     \}
-027   
-028     /* set initial result */
-029     mp_set (c, 1);
-030   
-031     for (x = 0; x < (int) DIGIT_BIT; x++) \{
-032       /* square */
-033       if ((res = mp_sqr (c, c)) != MP_OKAY) \{
-034         mp_clear (&g);
-035         return res;
-036       \}
-037   
-038       /* if the bit is set multiply */
-039       if ((b & (mp_digit) (((mp_digit)1) << (DIGIT_BIT - 1))) != 0) \{
-040         if ((res = mp_mul (c, &g, c)) != MP_OKAY) \{
-041            mp_clear (&g);
-042            return res;
-043         \}
-044       \}
-045   
-046       /* shift to next bit */
-047       b <<= 1;
-048     \}
-049   
-050     mp_clear (&g);
-051     return MP_OKAY;
-052   \}
-\end{alltt}
-\end{small}
-
--- Some note later.
-
-\section{$k$-ary Exponentiation}
-When calculating an exponentiation the most time consuming bottleneck is the multiplications which are in general a small factor
-slower than squaring.  Recall from the previous algorithm that $b_{i}$ refers to the $i$'th bit of the exponent $b$.  Suppose instead it referred to
-the $i$'th $k$-bit digit of the exponent of $b$.  For $k = 1$ the definitions are synonymous and for $k > 1$ algorithm~\ref{fig:KARY}
-computes the same exponentiation.  A group of $k$ bits from the exponent is called a \textit{window}.  That is it is a small window on only a
-portion of the entire exponent.  Consider the following modification to the basic left to right exponentiation algorithm.
-
-\begin{figure}[!here]
-\begin{small}
-\begin{center}
-\begin{tabular}{l}
-\hline Algorithm \textbf{$k$-ary Exponentiation}. \\
-\textbf{Input}.   Integer $a$, $b$, $k$ and $t$ \\
-\textbf{Output}.  $c = a^b$ \\
-\hline \\
-1.  $c \leftarrow 1$ \\
-2.  for $i$ from $t - 1$ to $0$ do \\
-\hspace{3mm}2.1  $c \leftarrow c^{2^k} $ \\
-\hspace{3mm}2.2  Extract the $i$'th $k$-bit word from $b$ and store it in $g$. \\
-\hspace{3mm}2.3  $c \leftarrow c \cdot a^g$ \\
-3.  Return $c$. \\
-\hline
-\end{tabular}
-\end{center}
-\end{small}
-\caption{$k$-ary Exponentiation}
-\label{fig:KARY}
-\end{figure}
-
-The squaring on step 2.1 can be calculated by squaring the value $c$ successively $k$ times.  If the values of $a^g$ for $0 < g < 2^k$ have been
-precomputed this algorithm requires only $t$ multiplications and $tk$ squarings.  The table can be generated with $2^{k - 1} - 1$ squarings and
-$2^{k - 1} + 1$ multiplications.  This algorithm assumes that the number of bits in the exponent is evenly divisible by $k$.  
-However, when it is not the remaining $0 < x \le k - 1$ bits can be handled with algorithm~\ref{fig:LTOR}.
-
-Suppose $k = 4$ and $t = 100$.  This modified algorithm will require $109$ multiplications and $408$ squarings to compute the exponentiation.  The
-original algorithm would on average have required $200$ multiplications and $400$ squarings to compute the same value.  The total number of squarings
-has increased slightly but the number of multiplications has nearly halved.
-
-\subsection{Optimal Values of $k$}
-An optimal value of $k$ will minimize $2^{k} + \lceil n / k \rceil + n - 1$ for a fixed number of bits in the exponent $n$.  The simplest
-approach is to brute force search amongst the values $k = 2, 3, \ldots, 8$ for the lowest result.  Table~\ref{fig:OPTK} lists optimal values of $k$
-for various exponent sizes and compares the number of multiplication and squarings required against algorithm~\ref{fig:LTOR}.  
-
-\begin{figure}[here]
-\begin{center}
-\begin{small}
-\begin{tabular}{|c|c|c|c|c|c|}
-\hline \textbf{Exponent (bits)} & \textbf{Optimal $k$} & \textbf{Work at $k$} & \textbf{Work with ~\ref{fig:LTOR}} \\
-\hline $16$ & $2$ & $27$ & $24$ \\
-\hline $32$ & $3$ & $49$ & $48$ \\
-\hline $64$ & $3$ & $92$ & $96$ \\
-\hline $128$ & $4$ & $175$ & $192$ \\
-\hline $256$ & $4$ & $335$ & $384$ \\
-\hline $512$ & $5$ & $645$ & $768$ \\
-\hline $1024$ & $6$ & $1257$ & $1536$ \\
-\hline $2048$ & $6$ & $2452$ & $3072$ \\
-\hline $4096$ & $7$ & $4808$ & $6144$ \\
-\hline
-\end{tabular}
-\end{small}
-\end{center}
-\caption{Optimal Values of $k$ for $k$-ary Exponentiation}
-\label{fig:OPTK}
-\end{figure}
-
-\subsection{Sliding-Window Exponentiation}
-A simple modification to the previous algorithm is to only generate the upper half of the table in the range $2^{k-1} \le g < 2^k$.  Essentially
-this is a table for all values of $g$ where the most significant bit of $g$ is a one.  However, in order for this to be allowed in the 
-algorithm values of $g$ in the range $0 \le g < 2^{k-1}$ must be avoided.  
-
-Table~\ref{fig:OPTK2} lists optimal values of $k$ for various exponent sizes and compares the work required against algorithm~\ref{fig:KARY}.  
-
-\begin{figure}[here]
-\begin{center}
-\begin{small}
-\begin{tabular}{|c|c|c|c|c|c|}
-\hline \textbf{Exponent (bits)} & \textbf{Optimal $k$} & \textbf{Work at $k$} & \textbf{Work with ~\ref{fig:KARY}} \\
-\hline $16$ & $3$ & $24$ & $27$ \\
-\hline $32$ & $3$ & $45$ & $49$ \\
-\hline $64$ & $4$ & $87$ & $92$ \\
-\hline $128$ & $4$ & $167$ & $175$ \\
-\hline $256$ & $5$ & $322$ & $335$ \\
-\hline $512$ & $6$ & $628$ & $645$ \\
-\hline $1024$ & $6$ & $1225$ & $1257$ \\
-\hline $2048$ & $7$ & $2403$ & $2452$ \\
-\hline $4096$ & $8$ & $4735$ & $4808$ \\
-\hline
-\end{tabular}
-\end{small}
-\end{center}
-\caption{Optimal Values of $k$ for Sliding Window Exponentiation}
-\label{fig:OPTK2}
-\end{figure}
-
-\newpage\begin{figure}[!here]
-\begin{small}
-\begin{center}
-\begin{tabular}{l}
-\hline Algorithm \textbf{Sliding Window $k$-ary Exponentiation}. \\
-\textbf{Input}.   Integer $a$, $b$, $k$ and $t$ \\
-\textbf{Output}.  $c = a^b$ \\
-\hline \\
-1.  $c \leftarrow 1$ \\
-2.  for $i$ from $t - 1$ to $0$ do \\
-\hspace{3mm}2.1  If the $i$'th bit of $b$ is a zero then \\
-\hspace{6mm}2.1.1   $c \leftarrow c^2$ \\
-\hspace{3mm}2.2  else do \\
-\hspace{6mm}2.2.1  $c \leftarrow c^{2^k}$ \\
-\hspace{6mm}2.2.2  Extract the $k$ bits from $(b_{i}b_{i-1}\ldots b_{i-(k-1)})$ and store it in $g$. \\
-\hspace{6mm}2.2.3  $c \leftarrow c \cdot a^g$ \\
-\hspace{6mm}2.2.4  $i \leftarrow i - k$ \\
-3.  Return $c$. \\
-\hline
-\end{tabular}
-\end{center}
-\end{small}
-\caption{Sliding Window $k$-ary Exponentiation}
-\end{figure}
-
-Similar to the previous algorithm this algorithm must have a special handler when fewer than $k$ bits are left in the exponent.  While this
-algorithm requires the same number of squarings it can potentially have fewer multiplications.  The pre-computed table $a^g$ is also half
-the size as the previous table.  
-
-Consider the exponent $b = 111101011001000_2 \equiv 31432_{10}$ with $k = 3$ using both algorithms.  The first algorithm will divide the exponent up as 
-the following five $3$-bit words $b \equiv \left ( 111, 101, 011, 001, 000 \right )_{2}$.  The second algorithm will break the 
-exponent as $b \equiv \left ( 111, 101, 0, 110, 0, 100, 0 \right )_{2}$.  The single digit $0$ in the second representation are where
-a single squaring took place instead of a squaring and multiplication.  In total the first method requires $10$ multiplications and $18$ 
-squarings.  The second method requires $8$ multiplications and $18$ squarings.  
-
-In general the sliding window method is never slower than the generic $k$-ary method and often it is slightly faster.  
-
-\section{Modular Exponentiation}
-
-Modular exponentiation is essentially computing the power of a base within a finite field or ring.  For example, computing 
-$d \equiv a^b \mbox{ (mod }c\mbox{)}$ is a modular exponentiation.  Instead of first computing $a^b$ and then reducing it 
-modulo $c$ the intermediate result is reduced modulo $c$ after every squaring or multiplication operation.  
-
-This guarantees that any intermediate result is bounded by $0 \le d \le c^2 - 2c + 1$ and can be reduced modulo $c$ quickly using
-one of the algorithms presented in chapter seven.  
-
-Before the actual modular exponentiation algorithm can be written a wrapper algorithm must be written first.  This algorithm
-will allow the exponent $b$ to be negative which is computed as $d \equiv \left (1 / a \right )^{\vert b \vert} \mbox{ (mod }c\mbox{)}$. The
-value of $(1/a) \mbox{ mod }c$ is computed using the modular inverse (\textit{see section 10.4}).  If no inverse exists the algorithm
-terminates with an error.  
-
-\begin{figure}[!here]
-\begin{small}
-\begin{center}
-\begin{tabular}{l}
-\hline Algorithm \textbf{mp\_exptmod}. \\
-\textbf{Input}.   mp\_int $g$, $x$ and $p$ \\
-\textbf{Output}.  $y \equiv g^x \mbox{ (mod }p\mbox{)}$ \\
-\hline \\
-1.  If $p.sign = MP\_NEG$ return(\textit{MP\_VAL}). \\
-2.  If $x.sign = MP\_NEG$ then \\
-\hspace{3mm}2.1  $g' \leftarrow g^{-1} \mbox{ (mod }p\mbox{)}$ \\
-\hspace{3mm}2.2  $x' \leftarrow \vert x \vert$ \\
-\hspace{3mm}2.3  Compute $y \equiv g'^{x'} \mbox{ (mod }p\mbox{)}$ via recursion. \\
-3.  if $p$ is odd \textbf{OR} $p$ is a D.R. modulus then \\
-\hspace{3mm}3.1  Compute $y \equiv g^{x} \mbox{ (mod }p\mbox{)}$ via algorithm mp\_exptmod\_fast. \\
-4.  else \\
-\hspace{3mm}4.1  Compute $y \equiv g^{x} \mbox{ (mod }p\mbox{)}$ via algorithm s\_mp\_exptmod. \\
-\hline
-\end{tabular}
-\end{center}
-\end{small}
-\caption{Algorithm mp\_exptmod}
-\end{figure}
-
-\textbf{Algorithm mp\_exptmod.}
-The first algorithm which actually performs modular exponentiation is algorithm s\_mp\_exptmod.  It is a sliding window $k$-ary algorithm 
-which uses Barrett reduction to reduce the product modulo $p$.  The second algorithm mp\_exptmod\_fast performs the same operation 
-except it uses either Montgomery or Diminished Radix reduction.  The two latter reduction algorithms are clumped in the same exponentiation
-algorithm since their arguments are essentially the same (\textit{two mp\_ints and one mp\_digit}).  
-
-\index{bn\_mp\_exptmod.c}
-\vspace{+3mm}\begin{small}
-\hspace{-5.1mm}{\bf File}: bn\_mp\_exptmod.c
-\vspace{-3mm}
-\begin{alltt}
-016   
-017   
-018   /* this is a shell function that calls either the normal or Montgomery
-019    * exptmod functions.  Originally the call to the montgomery code was
-020    * embedded in the normal function but that wasted alot of stack space
-021    * for nothing (since 99% of the time the Montgomery code would be called)
-022    */
-023   int
-024   mp_exptmod (mp_int * G, mp_int * X, mp_int * P, mp_int * Y)
-025   \{
-026     int dr;
-027   
-028     /* modulus P must be positive */
-029     if (P->sign == MP_NEG) \{
-030        return MP_VAL;
-031     \}
-032   
-033     /* if exponent X is negative we have to recurse */
-034     if (X->sign == MP_NEG) \{
-035        mp_int tmpG, tmpX;
-036        int err;
-037   
-038        /* first compute 1/G mod P */
-039        if ((err = mp_init(&tmpG)) != MP_OKAY) \{
-040           return err;
-041        \}
-042        if ((err = mp_invmod(G, P, &tmpG)) != MP_OKAY) \{
-043           mp_clear(&tmpG);
-044           return err;
-045        \}
-046   
-047        /* now get |X| */
-048        if ((err = mp_init(&tmpX)) != MP_OKAY) \{
-049           mp_clear(&tmpG);
-050           return err;
-051        \}
-052        if ((err = mp_abs(X, &tmpX)) != MP_OKAY) \{
-053           mp_clear_multi(&tmpG, &tmpX, NULL);
-054           return err;
-055        \}
-056   
-057        /* and now compute (1/G)**|X| instead of G**X [X < 0] */
-058        err = mp_exptmod(&tmpG, &tmpX, P, Y);
-059        mp_clear_multi(&tmpG, &tmpX, NULL);
-060        return err;
-061     \}
-062   
-063     dr = mp_dr_is_modulus(P);
-064     if (dr == 0) \{
-065        dr = mp_reduce_is_2k(P) << 1;
-066     \}
-067       
-068     /* if the modulus is odd or dr != 0 use the fast method */
-069     if (mp_isodd (P) == 1 || dr !=  0) \{
-070       return mp_exptmod_fast (G, X, P, Y, dr);
-071     \} else \{
-072       return s_mp_exptmod (G, X, P, Y);
-073     \}
-074   \}
-075   
-\end{alltt}
-\end{small}
-
-In order to keep the algorithms in a known state the first step on line 29 is to reject any negative modulus as input.  If the exponent is
-negative the algorithm tries to perform a modular exponentiation with the modular inverse of the base $G$.  The temporary variable $tmpG$ is assigned
-the modular inverse of $G$ and $tmpX$ is assigned the absolute value of $X$.  The algorithm will recurse with these new values with a positive
-exponent.
-
-If the exponent is positive the algorithm resumes the exponentiation.  Line 63 determines if the modulus is of the restricted Diminished Radix 
-form.  If it is not, line 65 attempts to determine if it is of an unrestricted Diminished Radix form.  The integer $dr$ will take on one
-of three values.
-
-\begin{enumerate}
-\item $dr = 0$ means that the modulus is not of either restricted or unrestricted Diminished Radix form.
-\item $dr = 1$ means that the modulus is of restricted Diminished Radix form.
-\item $dr = 2$ means that the modulus is of unrestricted Diminished Radix form.
-\end{enumerate}
-
-Line 69 determines if the fast modular exponentiation algorithm can be used.  It is allowed if $dr \ne 0$ or if the modulus is odd.  Otherwise,
-the slower s\_mp\_exptmod algorithm is used which uses Barrett reduction.  
-
-\subsection{Barrett Modular Exponentiation}
-
-\newpage\begin{figure}[!here]
-\begin{small}
-\begin{center}
-\begin{tabular}{l}
-\hline Algorithm \textbf{s\_mp\_exptmod}. \\
-\textbf{Input}.   mp\_int $g$, $x$ and $p$ \\
-\textbf{Output}.  $y \equiv g^x \mbox{ (mod }p\mbox{)}$ \\
-\hline \\
-1.  $k \leftarrow lg(x)$ \\
-2.  $winsize \leftarrow  \left \lbrace \begin{array}{ll}
-                              2 &  \mbox{if }k \le 7 \\
-                              3 &  \mbox{if }7 < k \le 36 \\
-                              4 &  \mbox{if }36 < k \le 140 \\
-                              5 &  \mbox{if }140 < k \le 450 \\
-                              6 &  \mbox{if }450 < k \le 1303 \\
-                              7 &  \mbox{if }1303 < k \le 3529 \\
-                              8 &  \mbox{if }3529 < k \\
-                              \end{array} \right .$ \\
-3.  Initialize $2^{winsize}$ mp\_ints in an array named $M$ and one mp\_int named $\mu$ \\
-4.  Calculate the $\mu$ required for Barrett Reduction (\textit{mp\_reduce\_setup}). \\
-5.  $M_1 \leftarrow g \mbox{ (mod }p\mbox{)}$ \\
-\\
-Set up the table of small powers of $g$.  First find $g^{2^{winsize - 1}}$ and then each successive power above it up to $g^{2^{winsize} - 1}$. \\
-6.  $k \leftarrow 2^{winsize - 1}$ \\
-7.  $M_{k} \leftarrow M_1$ \\
-8.  for $ix$ from 0 to $winsize - 2$ do \\
-\hspace{3mm}8.1  $M_k \leftarrow \left ( M_k \right )^2$ (\textit{mp\_sqr})  \\
-\hspace{3mm}8.2  $M_k \leftarrow M_k \mbox{ (mod }p\mbox{)}$ (\textit{mp\_reduce}) \\
-9.  for $ix$ from $2^{winsize - 1} + 1$ to $2^{winsize} - 1$ do \\
-\hspace{3mm}9.1  $M_{ix} \leftarrow M_{ix - 1} \cdot M_{1}$ (\textit{mp\_mul}) \\
-\hspace{3mm}9.2  $M_{ix} \leftarrow M_{ix} \mbox{ (mod }p\mbox{)}$ (\textit{mp\_reduce}) \\
-10.  $res \leftarrow 1$ \\
-\\
-Start Sliding Window. \\
-11.  $mode \leftarrow 0, bitcnt \leftarrow 1, buf \leftarrow 0, digidx \leftarrow x.used - 1, bitcpy \leftarrow 0, bitbuf \leftarrow 0$ \\
-12.  Loop \\
-\hspace{3mm}12.1  $bitcnt \leftarrow bitcnt - 1$ \\
-\hspace{3mm}12.2  If $bitcnt = 0$ then do \\
-\hspace{6mm}12.2.1  If $digidx = -1$ goto step 13. \\
-\hspace{6mm}12.2.2  $buf \leftarrow x_{digidx}$ \\
-\hspace{6mm}12.2.3  $digidx \leftarrow digidx - 1$ \\
-\hspace{6mm}12.2.4  $bitcnt \leftarrow lg(\beta)$ \\
-Continued on next page. \\
-\hline
-\end{tabular}
-\end{center}
-\end{small}
-\caption{Algorithm s\_mp\_exptmod}
-\end{figure}
-
-\newpage\begin{figure}[!here]
-\begin{small}
-\begin{center}
-\begin{tabular}{l}
-\hline Algorithm \textbf{s\_mp\_exptmod} (\textit{continued}). \\
-\textbf{Input}.   mp\_int $g$, $x$ and $p$ \\
-\textbf{Output}.  $y \equiv g^x \mbox{ (mod }p\mbox{)}$ \\
-\hline \\
-\hspace{3mm}12.3  $y \leftarrow (buf >> (lg(\beta) - 1))$ AND $1$ \\
-\hspace{3mm}12.4  $buf \leftarrow buf << 1$ \\
-\hspace{3mm}12.5  if $mode = 0$ and $y = 0$ then goto step 12. \\
-\hspace{3mm}12.6  if $mode = 1$ and $y = 0$ then do \\
-\hspace{6mm}12.6.1  $res \leftarrow res^2$ \\
-\hspace{6mm}12.6.2  $res \leftarrow res \mbox{ (mod }p\mbox{)}$ \\
-\hspace{6mm}12.6.3  Goto step 12. \\
-\hspace{3mm}12.7  $bitcpy \leftarrow bitcpy + 1$ \\
-\hspace{3mm}12.8  $bitbuf \leftarrow bitbuf + (y << (winsize - bitcpy))$ \\
-\hspace{3mm}12.9  $mode \leftarrow 2$ \\
-\hspace{3mm}12.10  If $bitcpy = winsize$ then do \\
-\hspace{6mm}Window is full so perform the squarings and single multiplication. \\
-\hspace{6mm}12.10.1  for $ix$ from $0$ to $winsize -1$ do \\
-\hspace{9mm}12.10.1.1  $res \leftarrow res^2$ \\
-\hspace{9mm}12.10.1.2  $res \leftarrow res \mbox{ (mod }p\mbox{)}$ \\
-\hspace{6mm}12.10.2  $res \leftarrow res \cdot M_{bitbuf}$ \\
-\hspace{6mm}12.10.3  $res \leftarrow res \mbox{ (mod }p\mbox{)}$ \\
-\hspace{6mm}Reset the window. \\
-\hspace{6mm}12.10.4  $bitcpy \leftarrow 0, bitbuf \leftarrow 0, mode \leftarrow 1$ \\
-\\
-No more windows left.  Check for residual bits of exponent. \\
-13.  If $mode = 2$ and $bitcpy > 0$ then do \\
-\hspace{3mm}13.1  for $ix$ from $0$ to $bitcpy - 1$ do \\
-\hspace{6mm}13.1.1  $res \leftarrow res^2$ \\
-\hspace{6mm}13.1.2  $res \leftarrow res \mbox{ (mod }p\mbox{)}$ \\
-\hspace{6mm}13.1.3  $bitbuf \leftarrow bitbuf << 1$ \\
-\hspace{6mm}13.1.4  If $bitbuf$ AND $2^{winsize} \ne 0$ then do \\
-\hspace{9mm}13.1.4.1  $res \leftarrow res \cdot M_{1}$ \\
-\hspace{9mm}13.1.4.2  $res \leftarrow res \mbox{ (mod }p\mbox{)}$ \\
-14.  $y \leftarrow res$ \\
-15.  Clear $res$, $\mu$ and the $M$ array. \\
-16.  Return(\textit{MP\_OKAY}). \\
-\hline
-\end{tabular}
-\end{center}
-\end{small}
-\caption{Algorithm s\_mp\_exptmod (continued)}
-\end{figure}
-
-\textbf{Algorithm s\_mp\_exptmod.}
-This algorithm computes the $x$'th power of $g$ modulo $p$ and stores the result in $y$.  It takes advantage of the Barrett reduction
-algorithm to keep the product small throughout the algorithm.
-
-The first two steps determine the optimal window size based on the number of bits in the exponent.  The larger the exponent the 
-larger the window size becomes.  After a window size $winsize$ has been chosen an array of $2^{winsize}$ mp\_int variables is allocated.  This
-table will hold the values of $g^x \mbox{ (mod }p\mbox{)}$ for $2^{winsize - 1} \le x < 2^{winsize}$.  
-
-After the table is allocated the first power of $g$ is found.  Since $g \ge p$ is allowed it must be first reduced modulo $p$ to make
-the rest of the algorithm more efficient.  The first element of the table at $2^{winsize - 1}$ is found by squaring $M_1$ successively $winsize - 1$
-times.  The rest of the table elements are found by multiplying the previous element by $M_1$ modulo $p$.
-
-Now that the table is available the sliding window may begin.  The following list describes the functions of all the variables in the window.
-\begin{enumerate}
-\item The variable $mode$ dictates how the bits of the exponent are interpreted.  
-\begin{enumerate}
-   \item When $mode = 0$ the bits are ignored since no non-zero bit of the exponent has been seen yet.  For example, if the exponent were simply 
-         $1$ then there would be $lg(\beta) - 1$ zero bits before the first non-zero bit.  In this case bits are ignored until a non-zero bit is found.  
-   \item When $mode = 1$ a non-zero bit has been seen before and a new $winsize$-bit window has not been formed yet.  In this mode leading $0$ bits 
-         are read and a single squaring is performed.  If a non-zero bit is read a new window is created.  
-   \item When $mode = 2$ the algorithm is in the middle of forming a window and new bits are appended to the window from the most significant bit
-         downwards.
-\end{enumerate}
-\item The variable $bitcnt$ indicates how many bits of the current exponent digit are left to be read.  When it reaches zero a new digit
-      is fetched from the exponent.
-\item The variable $buf$ holds the currently read digit of the exponent. 
-\item The variable $digidx$ is an index into the exponent's digits.  It starts at the leading digit $x.used - 1$ and moves towards the trailing digit.
-\item The variable $bitcpy$ indicates how many bits are in the currently formed window.  When it reaches $winsize$ the window is flushed and
-      the appropriate operations performed.
-\item The variable $bitbuf$ holds the current bits of the window being formed.  
-\end{enumerate}
-
-All of step 12 is the window processing loop.  It will iterate while there are digits available from the exponent to read.  The first step
-inside this loop is to extract a new digit if no more bits are available in the current digit.  If there are no bits left a new digit is
-read and if there are no digits left then the loop terminates.  
-
-After a digit is made available step 12.3 will extract the most significant bit of the current digit and move all other bits in the digit
-upwards.  In effect the digit is read from most significant bit to least significant bit and since the digits are read from leading to 
-trailing edges the entire exponent is read from most significant bit to least significant bit.
-
-At step 12.5 if the $mode$ and currently extracted bit $y$ are both zero the bit is ignored and the next bit is read.  This prevents the 
-algorithm from having to perform trivial squaring and reduction operations before the first non-zero bit is read.  Step 12.6 and 12.7-10 handle
-the two cases of $mode = 1$ and $mode = 2$ respectively.  
-
-\begin{center}
-\begin{figure}[here]
-\includegraphics{pics/expt_state}
-\caption{Sliding Window State Diagram}
-\end{figure}
-\end{center}
-
-By step 13 there are no more digits left in the exponent.  However, there may be partial bits in the window left.  If $mode = 2$ then 
-a Left-to-Right algorithm is used to process the remaining few bits.  
-
-\index{bn\_s\_mp\_exptmod.c}
-\vspace{+3mm}\begin{small}
-\hspace{-5.1mm}{\bf File}: bn\_s\_mp\_exptmod.c
-\vspace{-3mm}
-\begin{alltt}
-016   
-017   int
-018   s_mp_exptmod (mp_int * G, mp_int * X, mp_int * P, mp_int * Y)
-019   \{
-020     mp_int  M[256], res, mu;
-021     mp_digit buf;
-022     int     err, bitbuf, bitcpy, bitcnt, mode, digidx, x, y, winsize;
-023   
-024     /* find window size */
-025     x = mp_count_bits (X);
-026     if (x <= 7) \{
-027       winsize = 2;
-028     \} else if (x <= 36) \{
-029       winsize = 3;
-030     \} else if (x <= 140) \{
-031       winsize = 4;
-032     \} else if (x <= 450) \{
-033       winsize = 5;
-034     \} else if (x <= 1303) \{
-035       winsize = 6;
-036     \} else if (x <= 3529) \{
-037       winsize = 7;
-038     \} else \{
-039       winsize = 8;
-040     \}
-041   
-042   #ifdef MP_LOW_MEM
-043       if (winsize > 5) \{
-044          winsize = 5;
-045       \}
-046   #endif
-047   
-048     /* init M array */
-049     for (x = 0; x < (1 << winsize); x++) \{
-050       if ((err = mp_init_size (&M[x], 1)) != MP_OKAY) \{
-051         for (y = 0; y < x; y++) \{
-052           mp_clear (&M[y]);
-053         \}
-054         return err;
-055       \}
-056     \}
-057   
-058     /* create mu, used for Barrett reduction */
-059     if ((err = mp_init (&mu)) != MP_OKAY) \{
-060       goto __M;
-061     \}
-062     if ((err = mp_reduce_setup (&mu, P)) != MP_OKAY) \{
-063       goto __MU;
-064     \}
-065   
-066     /* create M table
-067      *
-068      * The M table contains powers of the base, 
-069      * e.g. M[x] = G**x mod P
-070      *
-071      * The first half of the table is not 
-072      * computed though accept for M[0] and M[1]
-073      */
-074     if ((err = mp_mod (G, P, &M[1])) != MP_OKAY) \{
-075       goto __MU;
-076     \}
-077   
-078     /* compute the value at M[1<<(winsize-1)] by squaring 
-079      * M[1] (winsize-1) times 
-080      */
-081     if ((err = mp_copy (&M[1], &M[1 << (winsize - 1)])) != MP_OKAY) \{
-082       goto __MU;
-083     \}
-084   
-085     for (x = 0; x < (winsize - 1); x++) \{
-086       if ((err = mp_sqr (&M[1 << (winsize - 1)], 
-087                          &M[1 << (winsize - 1)])) != MP_OKAY) \{
-088         goto __MU;
-089       \}
-090       if ((err = mp_reduce (&M[1 << (winsize - 1)], P, &mu)) != MP_OKAY) \{
-091         goto __MU;
-092       \}
-093     \}
-094   
-095     /* create upper table */
-096     for (x = (1 << (winsize - 1)) + 1; x < (1 << winsize); x++) \{
-097       if ((err = mp_mul (&M[x - 1], &M[1], &M[x])) != MP_OKAY) \{
-098         goto __MU;
-099       \}
-100       if ((err = mp_reduce (&M[x], P, &mu)) != MP_OKAY) \{
-101         goto __MU;
-102       \}
-103     \}
-104   
-105     /* setup result */
-106     if ((err = mp_init (&res)) != MP_OKAY) \{
-107       goto __MU;
-108     \}
-109     mp_set (&res, 1);
-110   
-111     /* set initial mode and bit cnt */
-112     mode   = 0;
-113     bitcnt = 1;
-114     buf    = 0;
-115     digidx = X->used - 1;
-116     bitcpy = 0;
-117     bitbuf = 0;
-118   
-119     for (;;) \{
-120       /* grab next digit as required */
-121       if (--bitcnt == 0) \{
-122         if (digidx == -1) \{
-123           break;
-124         \}
-125         buf = X->dp[digidx--];
-126         bitcnt = (int) DIGIT_BIT;
-127       \}
-128   
-129       /* grab the next msb from the exponent */
-130       y = (buf >> (mp_digit)(DIGIT_BIT - 1)) & 1;
-131       buf <<= (mp_digit)1;
-132   
-133       /* if the bit is zero and mode == 0 then we ignore it
-134        * These represent the leading zero bits before the first 1 bit
-135        * in the exponent.  Technically this opt is not required but it
-136        * does lower the # of trivial squaring/reductions used
-137        */
-138       if (mode == 0 && y == 0)
-139         continue;
-140   
-141       /* if the bit is zero and mode == 1 then we square */
-142       if (mode == 1 && y == 0) \{
-143         if ((err = mp_sqr (&res, &res)) != MP_OKAY) \{
-144           goto __RES;
-145         \}
-146         if ((err = mp_reduce (&res, P, &mu)) != MP_OKAY) \{
-147           goto __RES;
-148         \}
-149         continue;
-150       \}
-151   
-152       /* else we add it to the window */
-153       bitbuf |= (y << (winsize - ++bitcpy));
-154       mode = 2;
-155   
-156       if (bitcpy == winsize) \{
-157         /* ok window is filled so square as required and multiply  */
-158         /* square first */
-159         for (x = 0; x < winsize; x++) \{
-160           if ((err = mp_sqr (&res, &res)) != MP_OKAY) \{
-161             goto __RES;
-162           \}
-163           if ((err = mp_reduce (&res, P, &mu)) != MP_OKAY) \{
-164             goto __RES;
-165           \}
-166         \}
-167   
-168         /* then multiply */
-169         if ((err = mp_mul (&res, &M[bitbuf], &res)) != MP_OKAY) \{
-170           goto __MU;
-171         \}
-172         if ((err = mp_reduce (&res, P, &mu)) != MP_OKAY) \{
-173           goto __MU;
-174         \}
-175   
-176         /* empty window and reset */
-177         bitcpy = 0;
-178         bitbuf = 0;
-179         mode = 1;
-180       \}
-181     \}
-182   
-183     /* if bits remain then square/multiply */
-184     if (mode == 2 && bitcpy > 0) \{
-185       /* square then multiply if the bit is set */
-186       for (x = 0; x < bitcpy; x++) \{
-187         if ((err = mp_sqr (&res, &res)) != MP_OKAY) \{
-188           goto __RES;
-189         \}
-190         if ((err = mp_reduce (&res, P, &mu)) != MP_OKAY) \{
-191           goto __RES;
-192         \}
-193   
-194         bitbuf <<= 1;
-195         if ((bitbuf & (1 << winsize)) != 0) \{
-196           /* then multiply */
-197           if ((err = mp_mul (&res, &M[1], &res)) != MP_OKAY) \{
-198             goto __RES;
-199           \}
-200           if ((err = mp_reduce (&res, P, &mu)) != MP_OKAY) \{
-201             goto __RES;
-202           \}
-203         \}
-204       \}
-205     \}
-206   
-207     mp_exch (&res, Y);
-208     err = MP_OKAY;
-209   __RES:mp_clear (&res);
-210   __MU:mp_clear (&mu);
-211   __M:
-212     for (x = 0; x < (1 << winsize); x++) \{
-213       mp_clear (&M[x]);
-214     \}
-215     return err;
-216   \}
-\end{alltt}
-\end{small}
-
-Lines 26 through 40 determine the optimal window size based on the length of the exponent in bits.  The window divisions are sorted
-from smallest to greatest so that in each \textbf{if} statement only one condition must be tested.  For example, by the \textbf{if} statement 
-on line 32 the value of $x$ is already known to be greater than $140$.  
-
-The conditional piece of code beginning on line 42 allows the window size to be restricted to five bits.  This logic is used to ensure
-the table of precomputed powers of $G$ remains relatively small.  
-
-The for loop on line 49 initializes the $M$ array while lines 59 and 62 compute the value of $\mu$ required for
-Barrett reduction.  
-
--- More later.
-
-\section{Quick Power of Two}
-Calculating $b = 2^a$ can be performed much quicker than with any of the previous algorithms.  Recall that a logical shift left $m << k$ is
-equivalent to $m \cdot 2^k$.  By this logic when $m = 1$ a quick power of two can be achieved.
-
-\begin{figure}[!here]
-\begin{small}
-\begin{center}
-\begin{tabular}{l}
-\hline Algorithm \textbf{mp\_2expt}. \\
-\textbf{Input}.   integer $b$ \\
-\textbf{Output}.  $a \leftarrow 2^b$ \\
-\hline \\
-1.  $a \leftarrow 0$ \\
-2.  If $a.alloc < \lfloor b / lg(\beta) \rfloor + 1$ then grow $a$ appropriately. \\
-3.  $a.used \leftarrow \lfloor b / lg(\beta) \rfloor + 1$ \\
-4.  $a_{\lfloor b / lg(\beta) \rfloor} \leftarrow 1 << (b \mbox{ mod } lg(\beta))$ \\
-5.  Return(\textit{MP\_OKAY}). \\
-\hline
-\end{tabular}
-\end{center}
-\end{small}
-\caption{Algorithm mp\_2expt}
-\end{figure}
-
-\textbf{Algorithm mp\_2expt.}
-
-\index{bn\_mp\_2expt.c}
-\vspace{+3mm}\begin{small}
-\hspace{-5.1mm}{\bf File}: bn\_mp\_2expt.c
-\vspace{-3mm}
-\begin{alltt}
-016   
-017   /* computes a = 2**b 
-018    *
-019    * Simple algorithm which zeroes the int, grows it then just sets one bit
-020    * as required.
-021    */
-022   int
-023   mp_2expt (mp_int * a, int b)
-024   \{
-025     int     res;
-026   
-027     mp_zero (a);
-028     if ((res = mp_grow (a, b / DIGIT_BIT + 1)) != MP_OKAY) \{
-029       return res;
-030     \}
-031     a->used = b / DIGIT_BIT + 1;
-032     a->dp[b / DIGIT_BIT] = 1 << (b % DIGIT_BIT);
-033   
-034     return MP_OKAY;
-035   \}
-\end{alltt}
-\end{small}
-
-\chapter{Higher Level Algorithms}
-
-This chapter discusses the various higher level algorithms that are required to complete a well rounded multiple precision integer package.  These
-routines are less performance oriented than the algorithms of chapters five, six and seven but are no less important.  
-
-The first section describes a method of integer division with remainder that is universally well known.  It provides the signed division logic
-for the package.  The subsequent section discusses a set of algorithms which allow a single digit to be the 2nd operand for a variety of operations.  
-These algorithms serve mostly to simplify other algorithms where small constants are required.  The last two sections discuss how to manipulate 
-various representations of integers.  For example, converting from an mp\_int to a string of characters.
-
-\section{Integer Division with Remainder}
-
-Integer division, aside from modular exponentiation, is the most intensive algorithm to compute.  
-
-
-\section{Single Digit Helpers}
-\subsection{Single Digit Addition}
-\subsection{Single Digit Subtraction}
-\subsection{Single Digit Multiplication}
-\subsection{Single Digit Division}
-\subsection{Single Digit Modulo}
-\subsection{Single Digit Root Extraction}
-\section{Random Number Generation}
-\section{Formatted Output}
-\subsection{Getting The Output Size}
-\subsection{Generating Radix-n Output}
-\subsection{Reading Radix-n Input}
-\section{Unformatted Output}
-\subsection{Getting The Output Size}
-\subsection{Generating Output}
-\subsection{Reading Input}
-
-\chapter{Number Theoretic Algorithms}
-\section{Greatest Common Divisor}
-\section{Least Common Multiple}
-\section{Jacobi Symbol Computation}
-\section{Modular Inverse}
-\subsection{General Case}
-\subsection{Odd Moduli}
-\section{Primality Tests}
-\subsection{Trial Division}
-\subsection{The Fermat Test}
-\subsection{The Miller-Rabin Test}
-\subsection{Primality Test in a Bottle}
-\subsection{The Next Prime}
-\section{Root Extraction}
-
-\backmatter
-\appendix
-\begin{thebibliography}{ABCDEF}
-\bibitem[1]{TAOCPV2}
-Donald Knuth, \textit{The Art of Computer Programming}, Third Edition, Volume Two, Seminumerical Algorithms, Addison-Wesley, 1998
-
-\bibitem[2]{HAC}
-A. Menezes, P. van Oorschot, S. Vanstone, \textit{Handbook of Applied Cryptography}, CRC Press, 1996
-
-\bibitem[3]{ROSE}
-Michael Rosing, \textit{Implementing Elliptic Curve Cryptography}, Manning Publications, 1999
-
-\bibitem[4]{COMBA}
-Paul G. Comba, \textit{Exponentiation Cryptosystems on the IBM PC}. IBM Systems Journal 29(4): 526-538 (1990)
-
-\bibitem[5]{KARA}
-A. Karatsuba, Doklady Akad. Nauk SSSR 145 (1962), pp.293-294
-
-\bibitem[6]{KARAP}
-Andre Weimerskirch and Christof Paar, \textit{Generalizations of the Karatsuba Algorithm for Polynomial Multiplication}, Submitted to Design, Codes and Cryptography, March 2002
-
-\bibitem[7]{BARRETT}
-Paul Barrett, \textit{Implementing the Rivest Shamir and Adleman Public Key Encryption Algorithm on a Standard Digital Signal Processor}, Advances in Cryptology, Crypto '86, Springer-Verlag.
-
-\bibitem[8]{MONT}
-P.L.Montgomery. \textit{Modular multiplication without trial division}. Mathematics of Computation, 44(170):519-521, April 1985.
-
-\bibitem[9]{DRMET}
-Chae Hoon Lim and Pil Joong Lee, \textit{Generating Efficient Primes for Discrete Log Cryptosystems}, POSTECH Information Research Laboratories
-
-\bibitem[10]{MMB}
-J. Daemen and R. Govaerts and J. Vandewalle, \textit{Block ciphers based on Modular Arithmetic}, State and {P}rogress in the {R}esearch of {C}ryptography, 1993, pp. 80-89
-
-\end{thebibliography}
-
-\input{tommath.ind}
-
-\chapter{Appendix}
-\subsection*{Appendix A -- Source Listing of tommath.h}
-
-The following is the source listing of the header file ``tommath.h'' for the LibTomMath project.  It contains many of 
-the definitions used throughout the code such as \textbf{mp\_int}, \textbf{MP\_PREC} and so on.  The header is 
-presented here for completeness.
-
-\index{tommath.h}
-\vspace{+3mm}\begin{small}
-\hspace{-5.1mm}{\bf File}: tommath.h
-\vspace{-3mm}
-\begin{alltt}
-001   /* LibTomMath, multiple-precision integer library -- Tom St Denis
-002    *
-003    * LibTomMath is library that provides for multiple-precision
-004    * integer arithmetic as well as number theoretic functionality.
-005    *
-006    * The library is designed directly after the MPI library by
-007    * Michael Fromberger but has been written from scratch with
-008    * additional optimizations in place.
-009    *
-010    * The library is free for all purposes without any express
-011    * guarantee it works.
-012    *
-013    * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org
-014    */
-015   #ifndef BN_H_
-016   #define BN_H_
-017   
-018   #include <stdio.h>
-019   #include <string.h>
-020   #include <stdlib.h>
-021   #include <ctype.h>
-022   #include <limits.h>
-023   
-024   #undef MIN
-025   #define MIN(x,y) ((x)<(y)?(x):(y))
-026   #undef MAX
-027   #define MAX(x,y) ((x)>(y)?(x):(y))
-028   
-029   #ifdef __cplusplus
-030   extern "C" \{
-031   
-032   /* C++ compilers don't like assigning void * to mp_digit * */
-033   #define  OPT_CAST  (mp_digit *)
-034   
-035   #else
-036   
-037   /* C on the other hand doesn't care */
-038   #define  OPT_CAST
-039   
-040   #endif
-041   
-042   /* some default configurations.
-043    *
-044    * A "mp_digit" must be able to hold DIGIT_BIT + 1 bits
-045    * A "mp_word" must be able to hold 2*DIGIT_BIT + 1 bits
-046    *
-047    * At the very least a mp_digit must be able to hold 7 bits
-048    * [any size beyond that is ok provided it doesn't overflow the data type]
-049    */
-050   #ifdef MP_8BIT
-051      typedef unsigned char      mp_digit;
-052      typedef unsigned short     mp_word;
-053   #elif defined(MP_16BIT)
-054      typedef unsigned short     mp_digit;
-055      typedef unsigned long      mp_word;
-056   #elif defined(MP_64BIT)
-057      /* for GCC only on supported platforms */
-058   #ifndef CRYPT
-059      typedef unsigned long long ulong64;
-060      typedef signed long long   long64;
-061   #endif
-062   
-063      typedef ulong64            mp_digit;
-064      typedef unsigned long      mp_word __attribute__ ((mode(TI)));
-065   
-066      #define DIGIT_BIT          60
-067   #else
-068      /* this is the default case, 28-bit digits */
-069      
-070      /* this is to make porting into LibTomCrypt easier :-) */
-071   #ifndef CRYPT
-072      #if defined(_MSC_VER) || defined(__BORLANDC__) 
-073         typedef unsigned __int64   ulong64;
-074         typedef signed __int64     long64;
-075      #else
-076         typedef unsigned long long ulong64;
-077         typedef signed long long   long64;
-078      #endif
-079   #endif
-080   
-081      typedef unsigned long      mp_digit;
-082      typedef ulong64            mp_word;
-083   
-084   #ifdef MP_31BIT   
-085      #define DIGIT_BIT          31
-086   #else
-087      #define DIGIT_BIT          28
-088   #endif   
-089   #endif
-090   
-091   /* otherwise the bits per digit is calculated automatically from the size of
-       a mp_digit */
-092   #ifndef DIGIT_BIT
-093      #define DIGIT_BIT     ((CHAR_BIT * sizeof(mp_digit) - 1))  /* bits per di
-      git */
-094   #endif
-095   
-096   
-097   #define MP_DIGIT_BIT     DIGIT_BIT
-098   #define MP_MASK          ((((mp_digit)1)<<((mp_digit)DIGIT_BIT))-((mp_digit)
-      1))
-099   #define MP_DIGIT_MAX     MP_MASK
-100   
-101   /* equalities */
-102   #define MP_LT        -1   /* less than */
-103   #define MP_EQ         0   /* equal to */
-104   #define MP_GT         1   /* greater than */
-105   
-106   #define MP_ZPOS       0   /* positive integer */
-107   #define MP_NEG        1   /* negative */
-108   
-109   #define MP_OKAY       0   /* ok result */
-110   #define MP_MEM        -2  /* out of mem */
-111   #define MP_VAL        -3  /* invalid input */
-112   #define MP_RANGE      MP_VAL
-113   
-114   typedef int           mp_err;
-115   
-116   /* you'll have to tune these... */
-117   extern int KARATSUBA_MUL_CUTOFF,
-118              KARATSUBA_SQR_CUTOFF,
-119              TOOM_MUL_CUTOFF,
-120              TOOM_SQR_CUTOFF;
-121   
-122   /* various build options */
-123   #define MP_PREC                 64      /* default digits of precision (must
-       be power of two) */
-124   
-125   /* define this to use lower memory usage routines (exptmods mostly) */
-126   /* #define MP_LOW_MEM */
-127   
-128   /* size of comba arrays, should be at least 2 * 2**(BITS_PER_WORD - BITS_PER
-      _DIGIT*2) */
-129   #define MP_WARRAY               (1 << (sizeof(mp_word) * CHAR_BIT - 2 * DIGI
-      T_BIT + 1))
-130   
-131   typedef struct  \{
-132       int used, alloc, sign;
-133       mp_digit *dp;
-134   \} mp_int;
-135   
-136   #define USED(m)    ((m)->used)
-137   #define DIGIT(m,k) ((m)->dp[k])
-138   #define SIGN(m)    ((m)->sign)
-139   
-140   /* ---> init and deinit bignum functions <--- */
-141   
-142   /* init a bignum */
-143   int mp_init(mp_int *a);
-144   
-145   /* free a bignum */
-146   void mp_clear(mp_int *a);
-147   
-148   /* init a null terminated series of arguments */
-149   int mp_init_multi(mp_int *mp, ...);
-150   
-151   /* clear a null terminated series of arguments */
-152   void mp_clear_multi(mp_int *mp, ...);
-153   
-154   /* exchange two ints */
-155   void mp_exch(mp_int *a, mp_int *b);
-156   
-157   /* shrink ram required for a bignum */
-158   int mp_shrink(mp_int *a);
-159   
-160   /* grow an int to a given size */
-161   int mp_grow(mp_int *a, int size);
-162   
-163   /* init to a given number of digits */
-164   int mp_init_size(mp_int *a, int size);
-165   
-166   /* ---> Basic Manipulations <--- */
-167   
-168   #define mp_iszero(a) (((a)->used == 0) ? 1 : 0)
-169   #define mp_iseven(a) (((a)->used == 0 || (((a)->dp[0] & 1) == 0)) ? 1 : 0)
-170   #define mp_isodd(a)  (((a)->used > 0 && (((a)->dp[0] & 1) == 1)) ? 1 : 0)
-171   
-172   /* set to zero */
-173   void mp_zero(mp_int *a);
-174   
-175   /* set to a digit */
-176   void mp_set(mp_int *a, mp_digit b);
-177   
-178   /* set a 32-bit const */
-179   int mp_set_int(mp_int *a, unsigned int b);
-180   
-181   /* copy, b = a */
-182   int mp_copy(mp_int *a, mp_int *b);
-183   
-184   /* inits and copies, a = b */
-185   int mp_init_copy(mp_int *a, mp_int *b);
-186   
-187   /* trim unused digits */
-188   void mp_clamp(mp_int *a);
-189   
-190   /* ---> digit manipulation <--- */
-191   
-192   /* right shift by "b" digits */
-193   void mp_rshd(mp_int *a, int b);
-194   
-195   /* left shift by "b" digits */
-196   int mp_lshd(mp_int *a, int b);
-197   
-198   /* c = a / 2**b */
-199   int mp_div_2d(mp_int *a, int b, mp_int *c, mp_int *d);
-200   
-201   /* b = a/2 */
-202   int mp_div_2(mp_int *a, mp_int *b);
-203   
-204   /* c = a * 2**b */
-205   int mp_mul_2d(mp_int *a, int b, mp_int *c);
-206   
-207   /* b = a*2 */
-208   int mp_mul_2(mp_int *a, mp_int *b);
-209   
-210   /* c = a mod 2**d */
-211   int mp_mod_2d(mp_int *a, int b, mp_int *c);
-212   
-213   /* computes a = 2**b */
-214   int mp_2expt(mp_int *a, int b);
-215   
-216   /* makes a pseudo-random int of a given size */
-217   int mp_rand(mp_int *a, int digits);
-218   
-219   /* ---> binary operations <--- */
-220   /* c = a XOR b  */
-221   int mp_xor(mp_int *a, mp_int *b, mp_int *c);
-222   
-223   /* c = a OR b */
-224   int mp_or(mp_int *a, mp_int *b, mp_int *c);
-225   
-226   /* c = a AND b */
-227   int mp_and(mp_int *a, mp_int *b, mp_int *c);
-228   
-229   /* ---> Basic arithmetic <--- */
-230   
-231   /* b = -a */
-232   int mp_neg(mp_int *a, mp_int *b);
-233   
-234   /* b = |a| */
-235   int mp_abs(mp_int *a, mp_int *b);
-236   
-237   /* compare a to b */
-238   int mp_cmp(mp_int *a, mp_int *b);
-239   
-240   /* compare |a| to |b| */
-241   int mp_cmp_mag(mp_int *a, mp_int *b);
-242   
-243   /* c = a + b */
-244   int mp_add(mp_int *a, mp_int *b, mp_int *c);
-245   
-246   /* c = a - b */
-247   int mp_sub(mp_int *a, mp_int *b, mp_int *c);
-248   
-249   /* c = a * b */
-250   int mp_mul(mp_int *a, mp_int *b, mp_int *c);
-251   
-252   /* b = a*a  */
-253   int mp_sqr(mp_int *a, mp_int *b);
-254   
-255   /* a/b => cb + d == a */
-256   int mp_div(mp_int *a, mp_int *b, mp_int *c, mp_int *d);
-257   
-258   /* c = a mod b, 0 <= c < b  */
-259   int mp_mod(mp_int *a, mp_int *b, mp_int *c);
-260   
-261   /* ---> single digit functions <--- */
-262   
-263   /* compare against a single digit */
-264   int mp_cmp_d(mp_int *a, mp_digit b);
-265   
-266   /* c = a + b */
-267   int mp_add_d(mp_int *a, mp_digit b, mp_int *c);
-268   
-269   /* c = a - b */
-270   int mp_sub_d(mp_int *a, mp_digit b, mp_int *c);
-271   
-272   /* c = a * b */
-273   int mp_mul_d(mp_int *a, mp_digit b, mp_int *c);
-274   
-275   /* a/b => cb + d == a */
-276   int mp_div_d(mp_int *a, mp_digit b, mp_int *c, mp_digit *d);
-277   
-278   /* a/3 => 3c + d == a */
-279   int mp_div_3(mp_int *a, mp_int *c, mp_digit *d);
-280   
-281   /* c = a**b */
-282   int mp_expt_d(mp_int *a, mp_digit b, mp_int *c);
-283   
-284   /* c = a mod b, 0 <= c < b  */
-285   int mp_mod_d(mp_int *a, mp_digit b, mp_digit *c);
-286   
-287   /* ---> number theory <--- */
-288   
-289   /* d = a + b (mod c) */
-290   int mp_addmod(mp_int *a, mp_int *b, mp_int *c, mp_int *d);
-291   
-292   /* d = a - b (mod c) */
-293   int mp_submod(mp_int *a, mp_int *b, mp_int *c, mp_int *d);
-294   
-295   /* d = a * b (mod c) */
-296   int mp_mulmod(mp_int *a, mp_int *b, mp_int *c, mp_int *d);
-297   
-298   /* c = a * a (mod b) */
-299   int mp_sqrmod(mp_int *a, mp_int *b, mp_int *c);
-300   
-301   /* c = 1/a (mod b) */
-302   int mp_invmod(mp_int *a, mp_int *b, mp_int *c);
-303   
-304   /* c = (a, b) */
-305   int mp_gcd(mp_int *a, mp_int *b, mp_int *c);
-306   
-307   /* c = [a, b] or (a*b)/(a, b) */
-308   int mp_lcm(mp_int *a, mp_int *b, mp_int *c);
-309   
-310   /* finds one of the b'th root of a, such that |c|**b <= |a|
-311    *
-312    * returns error if a < 0 and b is even
-313    */
-314   int mp_n_root(mp_int *a, mp_digit b, mp_int *c);
-315   
-316   /* shortcut for square root */
-317   #define mp_sqrt(a, b) mp_n_root(a, 2, b)
-318   
-319   /* computes the jacobi c = (a | n) (or Legendre if b is prime)  */
-320   int mp_jacobi(mp_int *a, mp_int *n, int *c);
-321   
-322   /* used to setup the Barrett reduction for a given modulus b */
-323   int mp_reduce_setup(mp_int *a, mp_int *b);
-324   
-325   /* Barrett Reduction, computes a (mod b) with a precomputed value c
-326    *
-327    * Assumes that 0 < a <= b*b, note if 0 > a > -(b*b) then you can merely
-328    * compute the reduction as -1 * mp_reduce(mp_abs(a)) [pseudo code].
-329    */
-330   int mp_reduce(mp_int *a, mp_int *b, mp_int *c);
-331   
-332   /* setups the montgomery reduction */
-333   int mp_montgomery_setup(mp_int *a, mp_digit *mp);
-334   
-335   /* computes a = B**n mod b without division or multiplication useful for
-336    * normalizing numbers in a Montgomery system.
-337    */
-338   int mp_montgomery_calc_normalization(mp_int *a, mp_int *b);
-339   
-340   /* computes x/R == x (mod N) via Montgomery Reduction */
-341   int mp_montgomery_reduce(mp_int *a, mp_int *m, mp_digit mp);
-342   
-343   /* returns 1 if a is a valid DR modulus */
-344   int mp_dr_is_modulus(mp_int *a);
-345   
-346   /* sets the value of "d" required for mp_dr_reduce */
-347   void mp_dr_setup(mp_int *a, mp_digit *d);
-348   
-349   /* reduces a modulo b using the Diminished Radix method */
-350   int mp_dr_reduce(mp_int *a, mp_int *b, mp_digit mp);
-351   
-352   /* returns true if a can be reduced with mp_reduce_2k */
-353   int mp_reduce_is_2k(mp_int *a);
-354   
-355   /* determines k value for 2k reduction */
-356   int mp_reduce_2k_setup(mp_int *a, mp_digit *d);
-357   
-358   /* reduces a modulo b where b is of the form 2**p - k [0 <= a] */
-359   int mp_reduce_2k(mp_int *a, mp_int *n, mp_digit k);
-360   
-361   /* d = a**b (mod c) */
-362   int mp_exptmod(mp_int *a, mp_int *b, mp_int *c, mp_int *d);
-363   
-364   /* ---> Primes <--- */
-365   
-366   /* number of primes */
-367   #ifdef MP_8BIT
-368      #define PRIME_SIZE      31
-369   #else
-370      #define PRIME_SIZE      256
-371   #endif
-372   
-373   /* table of first PRIME_SIZE primes */
-374   extern const mp_digit __prime_tab[];
-375   
-376   /* result=1 if a is divisible by one of the first PRIME_SIZE primes */
-377   int mp_prime_is_divisible(mp_int *a, int *result);
-378   
-379   /* performs one Fermat test of "a" using base "b".
-380    * Sets result to 0 if composite or 1 if probable prime
-381    */
-382   int mp_prime_fermat(mp_int *a, mp_int *b, int *result);
-383   
-384   /* performs one Miller-Rabin test of "a" using base "b".
-385    * Sets result to 0 if composite or 1 if probable prime
-386    */
-387   int mp_prime_miller_rabin(mp_int *a, mp_int *b, int *result);
-388   
-389   /* performs t rounds of Miller-Rabin on "a" using the first
-390    * t prime bases.  Also performs an initial sieve of trial
-391    * division.  Determines if "a" is prime with probability
-392    * of error no more than (1/4)**t.
-393    *
-394    * Sets result to 1 if probably prime, 0 otherwise
-395    */
-396   int mp_prime_is_prime(mp_int *a, int t, int *result);
-397   
-398   /* finds the next prime after the number "a" using "t" trials
-399    * of Miller-Rabin.
-400    */
-401   int mp_prime_next_prime(mp_int *a, int t);
-402   
-403   
-404   /* ---> radix conversion <--- */
-405   int mp_count_bits(mp_int *a);
-406   
-407   int mp_unsigned_bin_size(mp_int *a);
-408   int mp_read_unsigned_bin(mp_int *a, unsigned char *b, int c);
-409   int mp_to_unsigned_bin(mp_int *a, unsigned char *b);
-410   
-411   int mp_signed_bin_size(mp_int *a);
-412   int mp_read_signed_bin(mp_int *a, unsigned char *b, int c);
-413   int mp_to_signed_bin(mp_int *a, unsigned char *b);
-414   
-415   int mp_read_radix(mp_int *a, char *str, int radix);
-416   int mp_toradix(mp_int *a, char *str, int radix);
-417   int mp_radix_size(mp_int *a, int radix);
-418   
-419   int mp_fread(mp_int *a, int radix, FILE *stream);
-420   int mp_fwrite(mp_int *a, int radix, FILE *stream);
-421   
-422   #define mp_read_raw(mp, str, len) mp_read_signed_bin((mp), (str), (len))
-423   #define mp_raw_size(mp)           mp_signed_bin_size(mp)
-424   #define mp_toraw(mp, str)         mp_to_signed_bin((mp), (str))
-425   #define mp_read_mag(mp, str, len) mp_read_unsigned_bin((mp), (str), (len))
-426   #define mp_mag_size(mp)           mp_unsigned_bin_size(mp)
-427   #define mp_tomag(mp, str)         mp_to_unsigned_bin((mp), (str))
-428   
-429   #define mp_tobinary(M, S)  mp_toradix((M), (S), 2)
-430   #define mp_tooctal(M, S)   mp_toradix((M), (S), 8)
-431   #define mp_todecimal(M, S) mp_toradix((M), (S), 10)
-432   #define mp_tohex(M, S)     mp_toradix((M), (S), 16)
-433   
-434   /* lowlevel functions, do not call! */
-435   int s_mp_add(mp_int *a, mp_int *b, mp_int *c);
-436   int s_mp_sub(mp_int *a, mp_int *b, mp_int *c);
-437   #define s_mp_mul(a, b, c) s_mp_mul_digs(a, b, c, (a)->used + (b)->used + 1)
-438   int fast_s_mp_mul_digs(mp_int *a, mp_int *b, mp_int *c, int digs);
-439   int s_mp_mul_digs(mp_int *a, mp_int *b, mp_int *c, int digs);
-440   int fast_s_mp_mul_high_digs(mp_int *a, mp_int *b, mp_int *c, int digs);
-441   int s_mp_mul_high_digs(mp_int *a, mp_int *b, mp_int *c, int digs);
-442   int fast_s_mp_sqr(mp_int *a, mp_int *b);
-443   int s_mp_sqr(mp_int *a, mp_int *b);
-444   int mp_karatsuba_mul(mp_int *a, mp_int *b, mp_int *c);
-445   int mp_toom_mul(mp_int *a, mp_int *b, mp_int *c);
-446   int mp_karatsuba_sqr(mp_int *a, mp_int *b);
-447   int mp_toom_sqr(mp_int *a, mp_int *b);
-448   int fast_mp_invmod(mp_int *a, mp_int *b, mp_int *c);
-449   int fast_mp_montgomery_reduce(mp_int *a, mp_int *m, mp_digit mp);
-450   int mp_exptmod_fast(mp_int *G, mp_int *X, mp_int *P, mp_int *Y, int mode);
-451   int s_mp_exptmod (mp_int * G, mp_int * X, mp_int * P, mp_int * Y);
-452   void bn_reverse(unsigned char *s, int len);
-453   
-454   #ifdef __cplusplus
-455      \}
-456   #endif
-457   
-458   #endif
-459   
-\end{alltt}
-\end{small}
-
-\end{document}
\ No newline at end of file