diff --git a/bn.pdf b/bn.pdf index 37866c8..bdbc819 100644 Binary files a/bn.pdf and b/bn.pdf differ diff --git a/bn.tex b/bn.tex index a69c13c..3c07991 100644 --- a/bn.tex +++ b/bn.tex @@ -1,7 +1,7 @@ \documentclass[]{article} \begin{document} -\title{LibTomMath v0.19 \\ A Free Multiple Precision Integer Library \\ http://math.libtomcrypt.org } +\title{LibTomMath v0.20 \\ A Free Multiple Precision Integer Library \\ http://math.libtomcrypt.org } \author{Tom St Denis \\ tomstdenis@iahu.ca} \maketitle \newpage diff --git a/bn_mp_exptmod_fast.c b/bn_mp_exptmod_fast.c index de42ff8..54de53d 100644 --- a/bn_mp_exptmod_fast.c +++ b/bn_mp_exptmod_fast.c @@ -80,7 +80,6 @@ mp_exptmod_fast (mp_int * G, mp_int * X, mp_int * P, mp_int * Y, int redmode) if (((P->used * 2 + 1) < MP_WARRAY) && P->used < (1 << ((CHAR_BIT * sizeof (mp_word)) - (2 * DIGIT_BIT)))) { redux = fast_mp_montgomery_reduce; - } else { /* use slower baselien method */ redux = mp_montgomery_reduce; diff --git a/changes.txt b/changes.txt index f874a2b..5756d6a 100644 --- a/changes.txt +++ b/changes.txt @@ -1,3 +1,7 @@ +June 8th, 2003 +v0.20 -- Removed the book from the package. Added the TDCAL license document. + -- This release is officially pure-bred TDCAL again [last officially TDCAL based release was v0.16] + June 6th, 2003 v0.19 -- Fixed a bug in mp_montgomery_reduce() which was introduced when I tweaked mp_rshd() in the previous release. Essentially the digits were not trimmed before the compare which cause a subtraction to occur all the time. diff --git a/etc/2kprime.1 b/etc/2kprime.1 index eb12565..e1384db 100644 --- a/etc/2kprime.1 +++ b/etc/2kprime.1 @@ -1,2 +1 @@ -256-bits (k = 36113) = 115792089237316195423570985008687907853269984665640564039457584007913129603823 -512-bits (k = 38117) = 13407807929942597099574024998205846127479365820592393377723561443721764030073546976801874298166903427690031858186486050853753882811946569946433649006045979 +259-bits (k = 17745) = 926336713898529563388567880069503262826159877325124512315660672063305037101743 diff --git a/etc/2kprime.c b/etc/2kprime.c index 47b0e1d..4f1d4bb 100644 --- a/etc/2kprime.c +++ b/etc/2kprime.c @@ -7,7 +7,7 @@ int sizes[] = {256, 512, 768, 1024, 1536, 2048, 3072, 4096}; int main(void) { char buf[2000]; - int x, y, t; + int x, y; mp_int q, p; FILE *out; clock_t t1; diff --git a/makefile b/makefile index 15972af..a835a2e 100644 --- a/makefile +++ b/makefile @@ -1,6 +1,6 @@ CFLAGS += -I./ -Wall -W -Wshadow -O3 -fomit-frame-pointer -funroll-loops -VERSION=0.19 +VERSION=0.20 default: libtommath.a @@ -103,5 +103,6 @@ clean: zipup: clean manual poster perl gen.pl ; mv mpi.c pre_gen/ ; \ cd .. ; rm -rf ltm* libtommath-$(VERSION) ; mkdir libtommath-$(VERSION) ; \ - cp -R ./libtommath/* ./libtommath-$(VERSION)/ ; tar -c libtommath-$(VERSION)/* > ltm-$(VERSION).tar ; \ + cp -R ./libtommath/* ./libtommath-$(VERSION)/ ; cp tdcal.pdf ./libtommath-$(VERSION)/ ; cd ./libtommath-$(VERSION) ; rm -f tommath.src tommath.tex tommath.out ; cd pics ; rm -f * ; cd .. ; cd .. ; ls ; \ + tar -c libtommath-$(VERSION)/* > ltm-$(VERSION).tar ; \ bzip2 -9vv ltm-$(VERSION).tar ; zip -9 -r ltm-$(VERSION).zip libtommath-$(VERSION)/* diff --git a/pics/expt_state.sxd b/pics/expt_state.sxd deleted file mode 100644 index 6518404..0000000 Binary files a/pics/expt_state.sxd and /dev/null differ diff --git a/pics/expt_state.tif b/pics/expt_state.tif deleted file mode 100644 index cb06e8e..0000000 Binary files a/pics/expt_state.tif and /dev/null differ diff --git a/pics/makefile b/pics/makefile deleted file mode 100644 index 302adec..0000000 --- a/pics/makefile +++ /dev/null @@ -1,23 +0,0 @@ -# makes the images... yeah - -default: pses - - -sliding_window.ps: sliding_window.tif - tiff2ps -c -e sliding_window.tif > sliding_window.ps - -expt_state.ps: expt_state.tif - tiff2ps -c -e expt_state.tif > expt_state.ps - -sliding_window.pdf: sliding_window.ps - epstopdf sliding_window.ps - -expt_state.pdf: expt_state.ps - epstopdf expt_state.ps - -pses: sliding_window.ps expt_state.ps -pdfes: sliding_window.pdf expt_state.pdf - -clean: - rm -rf *.ps *.pdf .xvpics - \ No newline at end of file diff --git a/pics/sliding_window.TIF b/pics/sliding_window.TIF deleted file mode 100644 index bb4cb96..0000000 Binary files a/pics/sliding_window.TIF and /dev/null differ diff --git a/pics/sliding_window.sxd b/pics/sliding_window.sxd deleted file mode 100644 index 91e7c0d..0000000 Binary files a/pics/sliding_window.sxd and /dev/null differ diff --git a/poster.pdf b/poster.pdf index a1ecd06..629336c 100644 Binary files a/poster.pdf and b/poster.pdf differ diff --git a/pre_gen/mpi.c b/pre_gen/mpi.c index e5b9347..c728269 100644 --- a/pre_gen/mpi.c +++ b/pre_gen/mpi.c @@ -2155,7 +2155,6 @@ mp_exptmod_fast (mp_int * G, mp_int * X, mp_int * P, mp_int * Y, int redmode) if (((P->used * 2 + 1) < MP_WARRAY) && P->used < (1 << ((CHAR_BIT * sizeof (mp_word)) - (2 * DIGIT_BIT)))) { redux = fast_mp_montgomery_reduce; - } else { /* use slower baselien method */ redux = mp_montgomery_reduce; diff --git a/tdcal.pdf b/tdcal.pdf new file mode 100644 index 0000000..1566a9d Binary files /dev/null and b/tdcal.pdf differ diff --git a/tommath.out b/tommath.out deleted file mode 100644 index fb54c12..0000000 --- a/tommath.out +++ /dev/null @@ -1,143 +0,0 @@ -\BOOKMARK [0][-]{chapter.1}{Introduction}{} -\BOOKMARK [1][-]{section.1.1}{Multiple Precision Arithmetic}{chapter.1} -\BOOKMARK [2][-]{subsection.1.1.1}{The Need for Multiple Precision Arithmetic}{section.1.1} -\BOOKMARK [2][-]{subsection.1.1.2}{Multiple Precision Arithmetic}{section.1.1} -\BOOKMARK [2][-]{subsection.1.1.3}{Benefits of Multiple Precision Arithmetic}{section.1.1} -\BOOKMARK [2][-]{subsection.1.1.4}{Basis of Operations}{section.1.1} -\BOOKMARK [1][-]{section.1.2}{Purpose of This Text}{chapter.1} -\BOOKMARK [1][-]{section.1.3}{Discussion and Notation}{chapter.1} -\BOOKMARK [2][-]{subsection.1.3.1}{Notation}{section.1.3} -\BOOKMARK [2][-]{subsection.1.3.2}{Work Effort}{section.1.3} -\BOOKMARK [1][-]{section.1.4}{Exercises}{chapter.1} -\BOOKMARK [0][-]{chapter.2}{Introduction to LibTomMath}{} -\BOOKMARK [1][-]{section.2.1}{What is LibTomMath?}{chapter.2} -\BOOKMARK [1][-]{section.2.2}{Goals of LibTomMath}{chapter.2} -\BOOKMARK [1][-]{section.2.3}{Choice of LibTomMath}{chapter.2} -\BOOKMARK [2][-]{subsection.2.3.1}{Code Base}{section.2.3} -\BOOKMARK [2][-]{subsection.2.3.2}{API Simplicity}{section.2.3} -\BOOKMARK [2][-]{subsection.2.3.3}{Optimizations}{section.2.3} -\BOOKMARK [2][-]{subsection.2.3.4}{Portability and Stability}{section.2.3} -\BOOKMARK [2][-]{subsection.2.3.5}{Choice}{section.2.3} -\BOOKMARK [0][-]{chapter.3}{Getting Started}{} -\BOOKMARK [1][-]{section.3.1}{Library Basics}{chapter.3} -\BOOKMARK [1][-]{section.3.2}{What is a Multiple Precision Integer?}{chapter.3} -\BOOKMARK [2][-]{subsection.3.2.1}{The mp\137int structure}{section.3.2} -\BOOKMARK [1][-]{section.3.3}{Argument Passing}{chapter.3} -\BOOKMARK [1][-]{section.3.4}{Return Values}{chapter.3} -\BOOKMARK [1][-]{section.3.5}{Initialization and Clearing}{chapter.3} -\BOOKMARK [2][-]{subsection.3.5.1}{Initializing an mp\137int}{section.3.5} -\BOOKMARK [2][-]{subsection.3.5.2}{Clearing an mp\137int}{section.3.5} -\BOOKMARK [1][-]{section.3.6}{Other Initialization Routines}{chapter.3} -\BOOKMARK [2][-]{subsection.3.6.1}{Initializing Variable Sized mp\137int Structures}{section.3.6} -\BOOKMARK [2][-]{subsection.3.6.2}{Creating a Clone}{section.3.6} -\BOOKMARK [2][-]{subsection.3.6.3}{Multiple Integer Initializations And Clearings}{section.3.6} -\BOOKMARK [1][-]{section.3.7}{Maintenance}{chapter.3} -\BOOKMARK [2][-]{subsection.3.7.1}{Augmenting Integer Precision}{section.3.7} -\BOOKMARK [2][-]{subsection.3.7.2}{Clamping Excess Digits}{section.3.7} -\BOOKMARK [0][-]{chapter.4}{Basic Operations}{} -\BOOKMARK [1][-]{section.4.1}{Copying an Integer}{chapter.4} -\BOOKMARK [1][-]{section.4.2}{Zeroing an Integer}{chapter.4} -\BOOKMARK [1][-]{section.4.3}{Sign Manipulation}{chapter.4} -\BOOKMARK [2][-]{subsection.4.3.1}{Absolute Value}{section.4.3} -\BOOKMARK [2][-]{subsection.4.3.2}{Integer Negation}{section.4.3} -\BOOKMARK [1][-]{section.4.4}{Small Constants}{chapter.4} -\BOOKMARK [2][-]{subsection.4.4.1}{Setting Small Constants}{section.4.4} -\BOOKMARK [2][-]{subsection.4.4.2}{Setting Large Constants}{section.4.4} -\BOOKMARK [1][-]{section.4.5}{Comparisons}{chapter.4} -\BOOKMARK [2][-]{subsection.4.5.1}{Unsigned Comparisions}{section.4.5} -\BOOKMARK [2][-]{subsection.4.5.2}{Signed Comparisons}{section.4.5} -\BOOKMARK [0][-]{chapter.5}{Basic Arithmetic}{} -\BOOKMARK [1][-]{section.5.1}{Building Blocks}{chapter.5} -\BOOKMARK [1][-]{section.5.2}{Addition and Subtraction}{chapter.5} -\BOOKMARK [2][-]{subsection.5.2.1}{Low Level Addition}{section.5.2} -\BOOKMARK [2][-]{subsection.5.2.2}{Low Level Subtraction}{section.5.2} -\BOOKMARK [2][-]{subsection.5.2.3}{High Level Addition}{section.5.2} -\BOOKMARK [2][-]{subsection.5.2.4}{High Level Subtraction}{section.5.2} -\BOOKMARK [1][-]{section.5.3}{Bit and Digit Shifting}{chapter.5} -\BOOKMARK [2][-]{subsection.5.3.1}{Multiplication by Two}{section.5.3} -\BOOKMARK [2][-]{subsection.5.3.2}{Division by Two}{section.5.3} -\BOOKMARK [1][-]{section.5.4}{Polynomial Basis Operations}{chapter.5} -\BOOKMARK [2][-]{subsection.5.4.1}{Multiplication by x}{section.5.4} -\BOOKMARK [2][-]{subsection.5.4.2}{Division by x}{section.5.4} -\BOOKMARK [1][-]{section.5.5}{Powers of Two}{chapter.5} -\BOOKMARK [2][-]{subsection.5.5.1}{Multiplication by Power of Two}{section.5.5} -\BOOKMARK [2][-]{subsection.5.5.2}{Division by Power of Two}{section.5.5} -\BOOKMARK [2][-]{subsection.5.5.3}{Remainder of Division by Power of Two}{section.5.5} -\BOOKMARK [0][-]{chapter.6}{Multiplication and Squaring}{} -\BOOKMARK [1][-]{section.6.1}{The Multipliers}{chapter.6} -\BOOKMARK [1][-]{section.6.2}{Multiplication}{chapter.6} -\BOOKMARK [2][-]{subsection.6.2.1}{The Baseline Multiplication}{section.6.2} -\BOOKMARK [2][-]{subsection.6.2.2}{Faster Multiplication by the ``Comba'' Method}{section.6.2} -\BOOKMARK [2][-]{subsection.6.2.3}{Polynomial Basis Multiplication}{section.6.2} -\BOOKMARK [2][-]{subsection.6.2.4}{Karatsuba Multiplication}{section.6.2} -\BOOKMARK [2][-]{subsection.6.2.5}{Toom-Cook 3-Way Multiplication}{section.6.2} -\BOOKMARK [2][-]{subsection.6.2.6}{Signed Multiplication}{section.6.2} -\BOOKMARK [1][-]{section.6.3}{Squaring}{chapter.6} -\BOOKMARK [2][-]{subsection.6.3.1}{The Baseline Squaring Algorithm}{section.6.3} -\BOOKMARK [2][-]{subsection.6.3.2}{Faster Squaring by the ``Comba'' Method}{section.6.3} -\BOOKMARK [2][-]{subsection.6.3.3}{Polynomial Basis Squaring}{section.6.3} -\BOOKMARK [2][-]{subsection.6.3.4}{Karatsuba Squaring}{section.6.3} -\BOOKMARK [2][-]{subsection.6.3.5}{Toom-Cook Squaring}{section.6.3} -\BOOKMARK [2][-]{subsection.6.3.6}{High Level Squaring}{section.6.3} -\BOOKMARK [0][-]{chapter.7}{Modular Reduction}{} -\BOOKMARK [1][-]{section.7.1}{Basics of Modular Reduction}{chapter.7} -\BOOKMARK [1][-]{section.7.2}{The Barrett Reduction}{chapter.7} -\BOOKMARK [2][-]{subsection.7.2.1}{Fixed Point Arithmetic}{section.7.2} -\BOOKMARK [2][-]{subsection.7.2.2}{Choosing a Radix Point}{section.7.2} -\BOOKMARK [2][-]{subsection.7.2.3}{Trimming the Quotient}{section.7.2} -\BOOKMARK [2][-]{subsection.7.2.4}{Trimming the Residue}{section.7.2} -\BOOKMARK [2][-]{subsection.7.2.5}{The Barrett Algorithm}{section.7.2} -\BOOKMARK [2][-]{subsection.7.2.6}{The Barrett Setup Algorithm}{section.7.2} -\BOOKMARK [1][-]{section.7.3}{The Montgomery Reduction}{chapter.7} -\BOOKMARK [2][-]{subsection.7.3.1}{Digit Based Montgomery Reduction}{section.7.3} -\BOOKMARK [2][-]{subsection.7.3.2}{Baseline Montgomery Reduction}{section.7.3} -\BOOKMARK [2][-]{subsection.7.3.3}{Faster ``Comba'' Montgomery Reduction}{section.7.3} -\BOOKMARK [2][-]{subsection.7.3.4}{Montgomery Setup}{section.7.3} -\BOOKMARK [1][-]{section.7.4}{The Diminished Radix Algorithm}{chapter.7} -\BOOKMARK [2][-]{subsection.7.4.1}{Choice of Moduli}{section.7.4} -\BOOKMARK [2][-]{subsection.7.4.2}{Choice of k}{section.7.4} -\BOOKMARK [2][-]{subsection.7.4.3}{Restricted Diminished Radix Reduction}{section.7.4} -\BOOKMARK [2][-]{subsection.7.4.4}{Unrestricted Diminished Radix Reduction}{section.7.4} -\BOOKMARK [1][-]{section.7.5}{Algorithm Comparison}{chapter.7} -\BOOKMARK [0][-]{chapter.8}{Exponentiation}{} -\BOOKMARK [1][-]{section.8.1}{Exponentiation Basics}{chapter.8} -\BOOKMARK [2][-]{subsection.8.1.1}{Single Digit Exponentiation}{section.8.1} -\BOOKMARK [1][-]{section.8.2}{k-ary Exponentiation}{chapter.8} -\BOOKMARK [2][-]{subsection.8.2.1}{Optimal Values of k}{section.8.2} -\BOOKMARK [2][-]{subsection.8.2.2}{Sliding-Window Exponentiation}{section.8.2} -\BOOKMARK [1][-]{section.8.3}{Modular Exponentiation}{chapter.8} -\BOOKMARK [2][-]{subsection.8.3.1}{Barrett Modular Exponentiation}{section.8.3} -\BOOKMARK [1][-]{section.8.4}{Quick Power of Two}{chapter.8} -\BOOKMARK [0][-]{chapter.9}{Higher Level Algorithms}{} -\BOOKMARK [1][-]{section.9.1}{Integer Division with Remainder}{chapter.9} -\BOOKMARK [1][-]{section.9.2}{Single Digit Helpers}{chapter.9} -\BOOKMARK [2][-]{subsection.9.2.1}{Single Digit Addition}{section.9.2} -\BOOKMARK [2][-]{subsection.9.2.2}{Single Digit Subtraction}{section.9.2} -\BOOKMARK [2][-]{subsection.9.2.3}{Single Digit Multiplication}{section.9.2} -\BOOKMARK [2][-]{subsection.9.2.4}{Single Digit Division}{section.9.2} -\BOOKMARK [2][-]{subsection.9.2.5}{Single Digit Modulo}{section.9.2} -\BOOKMARK [2][-]{subsection.9.2.6}{Single Digit Root Extraction}{section.9.2} -\BOOKMARK [1][-]{section.9.3}{Random Number Generation}{chapter.9} -\BOOKMARK [1][-]{section.9.4}{Formatted Output}{chapter.9} -\BOOKMARK [2][-]{subsection.9.4.1}{Getting The Output Size}{section.9.4} -\BOOKMARK [2][-]{subsection.9.4.2}{Generating Radix-n Output}{section.9.4} -\BOOKMARK [2][-]{subsection.9.4.3}{Reading Radix-n Input}{section.9.4} -\BOOKMARK [1][-]{section.9.5}{Unformatted Output}{chapter.9} -\BOOKMARK [2][-]{subsection.9.5.1}{Getting The Output Size}{section.9.5} -\BOOKMARK [2][-]{subsection.9.5.2}{Generating Output}{section.9.5} -\BOOKMARK [2][-]{subsection.9.5.3}{Reading Input}{section.9.5} -\BOOKMARK [0][-]{chapter.10}{Number Theoretic Algorithms}{} -\BOOKMARK [1][-]{section.10.1}{Greatest Common Divisor}{chapter.10} -\BOOKMARK [1][-]{section.10.2}{Least Common Multiple}{chapter.10} -\BOOKMARK [1][-]{section.10.3}{Jacobi Symbol Computation}{chapter.10} -\BOOKMARK [1][-]{section.10.4}{Modular Inverse}{chapter.10} -\BOOKMARK [2][-]{subsection.10.4.1}{General Case}{section.10.4} -\BOOKMARK [2][-]{subsection.10.4.2}{Odd Moduli}{section.10.4} -\BOOKMARK [1][-]{section.10.5}{Primality Tests}{chapter.10} -\BOOKMARK [2][-]{subsection.10.5.1}{Trial Division}{section.10.5} -\BOOKMARK [2][-]{subsection.10.5.2}{The Fermat Test}{section.10.5} -\BOOKMARK [2][-]{subsection.10.5.3}{The Miller-Rabin Test}{section.10.5} -\BOOKMARK [2][-]{subsection.10.5.4}{Primality Test in a Bottle}{section.10.5} -\BOOKMARK [2][-]{subsection.10.5.5}{The Next Prime}{section.10.5} -\BOOKMARK [1][-]{section.10.6}{Root Extraction}{chapter.10} -\BOOKMARK [0][-]{appendix*.16}{Appendix}{} diff --git a/tommath.src b/tommath.src deleted file mode 100644 index fd11871..0000000 --- a/tommath.src +++ /dev/null @@ -1,4675 +0,0 @@ -\documentclass[b5paper]{book} -\usepackage{hyperref} -\usepackage{makeidx} -\usepackage{amssymb} -\usepackage{color} -\usepackage{alltt} -\usepackage{graphicx} -\usepackage{layout} -\def\union{\cup} -\def\intersect{\cap} -\def\getsrandom{\stackrel{\rm R}{\gets}} -\def\cross{\times} -\def\cat{\hspace{0.5em} \| \hspace{0.5em}} -\def\catn{$\|$} -\def\divides{\hspace{0.3em} | \hspace{0.3em}} -\def\nequiv{\not\equiv} -\def\approx{\raisebox{0.2ex}{\mbox{\small $\sim$}}} -\def\lcm{{\rm lcm}} -\def\gcd{{\rm gcd}} -\def\log{{\rm log}} -\def\ord{{\rm ord}} -\def\abs{{\mathit abs}} -\def\rep{{\mathit rep}} -\def\mod{{\mathit\ mod\ }} -\renewcommand{\pmod}[1]{\ ({\rm mod\ }{#1})} -\newcommand{\floor}[1]{\left\lfloor{#1}\right\rfloor} -\newcommand{\ceil}[1]{\left\lceil{#1}\right\rceil} -\def\Or{{\rm\ or\ }} -\def\And{{\rm\ and\ }} -\def\iff{\hspace{1em}\Longleftrightarrow\hspace{1em}} -\def\implies{\Rightarrow} -\def\undefined{{\rm ``undefined"}} -\def\Proof{\vspace{1ex}\noindent {\bf Proof:}\hspace{1em}} -\let\oldphi\phi -\def\phi{\varphi} -\def\Pr{{\rm Pr}} -\newcommand{\str}[1]{{\mathbf{#1}}} -\def\F{{\mathbb F}} -\def\N{{\mathbb N}} -\def\Z{{\mathbb Z}} -\def\R{{\mathbb R}} -\def\C{{\mathbb C}} -\def\Q{{\mathbb Q}} -\definecolor{DGray}{gray}{0.5} -\newcommand{\emailaddr}[1]{\mbox{$<${#1}$>$}} -\def\twiddle{\raisebox{0.3ex}{\mbox{\tiny $\sim$}}} -\def\gap{\vspace{0.5ex}} -\makeindex -\begin{document} -\frontmatter -\pagestyle{empty} -\title{Multiple-Precision Integer Arithmetic, \\ A Case Study Involving the LibTomMath Project \\ - DRAFT - } -\author{\mbox{ -%\begin{small} -\begin{tabular}{c} -Tom St Denis \\ -Algonquin College \\ -\\ -Mads Rasmussen \\ -Open Communications Security \\ -\\ -Greg Rose \\ -QUALCOMM Australia \\ -\end{tabular} -%\end{small} -} -} -\maketitle -This text in its entirety is copyright \copyright{}2003 by Tom St Denis. It may not be redistributed -electronically or otherwise without the sole permission of the author. The text is freely redistributable as long as -it is packaged along with the LibTomMath library in a non-commercial project. Contact the -author for other redistribution rights. - -This text corresponds to the v0.17 release of the LibTomMath project. - -\begin{alltt} -Tom St Denis -111 Banning Rd -Ottawa, Ontario -K2L 1C3 -Canada - -Phone: 1-613-836-3160 -Email: tomstdenis@iahu.ca -\end{alltt} - -This text is formatted to the international B5 paper size of 176mm wide by 250mm tall using the \LaTeX{} -{\em book} macro package and the Perl {\em booker} package. - -\tableofcontents -\listoffigures -\chapter*{Preface} -Blah. - -\mainmatter -\pagestyle{headings} -\chapter{Introduction} -\section{Multiple Precision Arithmetic} -\subsection{The Need for Multiple Precision Arithmetic} -The most prevalent use for multiple precision arithmetic (\textit{often referred to as bignum math}) is within public -key cryptography. Algorithms such as RSA, Diffie-Hellman and Elliptic Curve Cryptography require large integers in order to -resist known cryptanalytic attacks. Typical modern programming languages such as C and Java only provide small -single-precision data types which are incapable of precisely representing integers which are often hundreds of bits long. - -For example, consider multiplying $1,234,567$ by $9,876,543$ in C with an ``unsigned long'' data type. With an -x86 machine the result is $4,136,875,833$ while the true result is $12,193,254,061,881$. The original inputs -were approximately $21$ and $24$ bits respectively. If the C language cannot multiply two relatively small values -together precisely how does anyone expect it to multiply two values that are considerably larger? - -Most advancements in fast multiple precision arithmetic stem from the desire for faster cryptographic primitives. However, cryptography -is not the only field of study that can benefit from fast large integer routines. Another auxiliary use for multiple precision integers is -high precision floating point data types. The basic IEEE standard floating point type is made up of an integer mantissa $q$ and an exponent $e$. -Numbers are given in the form $n = q \cdot b^e$ where $b = 2$ is specified. Since IEEE is meant to be implemented in -hardware the precision of the mantissa is often fairly small (\textit{23, 48 and 64 bits}). Since the mantissa is merely an -integer a large multiple precision integer could be used. In effect very high precision floating point arithmetic -could be performed. This would be useful where scientific applications must minimize the total output error over long simulations. - -\subsection{Multiple Precision Arithmetic} -\index{multiple precision} -Multiple precision arithmetic attempts to the solve the shortcomings of single precision data types such as those from -the C and Java programming languages. In essence multiple precision arithmetic is a set of operations that can be -performed on members of an algebraic group whose precision is not fixed. The algorithms when implemented to be multiple -precision can allow a developer to work with any practical precision required. - -Typically the arithmetic over the ring of integers denoted by $\Z$ is performed by routines that are collectively and -casually referred to as ``bignum'' routines. However, it is possible to have rings of polynomials as well typically -denoted by $\Z/p\Z \left [ X \right ]$ which could have variable precision (\textit{or degree}). This text will -discuss implementation of the former, however implementing polynomial basis routines should be relatively easy after reading this text. - -\subsection{Benefits of Multiple Precision Arithmetic} -\index{precision} \index{accuracy} -Precision of the real value to a given precision is defined loosely as the proximity of the real value to a given representation. -Accuracy is defined as the reproducibility of the result. For example, the calculation $1/3 = 0.25$ is imprecise but can be accurate provided -it is reproducible. - -The benefit of multiple precision representations over single precision representations is that -often no precision is lost while representing the result of an operation which requires excess precision. For example, -the multiplication of two $n$-bit integers requires at least $2n$ bits to represent the result. A multiple precision -system would augment the precision of the destination to accomodate the result while a single precision system would -truncate excess bits to maintain a fixed level of precision. - -Multiple precision representations allow for the precision to be very high (\textit{if not exacting}) but at a cost of -modest computer resources. The only reasonable case where a multiple precision system will lose precision is when -emulating a floating point data type. However, with multiple precision integer arithmetic no precision is lost. - -\subsection{Basis of Operations} -At the heart of all multiple precision integer operations are the ``long-hand'' algorithms we all learned as children -in grade school. For example, to multiply $1,234$ by $981$ the student is not taught to memorize the times table for -$1,234$, instead they are taught how to long-multiply. That is to multiply each column using simple single digit -multiplications, line up the partial results, and add the resulting products by column. The representation that most -are familiar with is known as decimal or formally as radix-10. A radix-$n$ representation simply means there are -$n$ possible values per digit. For example, binary would be a radix-2 representation. - -In essence computer based multiple precision arithmetic is very much the same. The most notable difference is the usage -of a binary friendly radix. That is to use a radix of the form $2^k$ where $k$ is typically the size of a machine -register. Also occasionally more optimal algorithms are used to perform certain operations such as multiplication and -squaring instead of traditional long-hand algorithms. - -\section{Purpose of This Text} -The purpose of this text is to instruct the reader regarding how to implement multiple precision algorithms. That is -to not only explain the core theoretical algorithms but also the various ``house keeping'' tasks that are neglected by -authors of other texts on the subject. Texts such as \cite[HAC]{HAC} and \cite{TAOCPV2} give considerably detailed -explanations of the theoretical aspects of the algorithms and very little regarding the practical aspects. - -How an algorithm is explained and how it is actually implemented are two very different -realities. For example, algorithm 14.7 on page 594 of HAC lists a relatively simple algorithm for performing multiple -precision integer addition. However, what the description lacks is any discussion concerning the fact that the two -integer inputs may be of differing magnitudes. Similarly the division routine (\textit{Algorithm 14.20, pp. 598}) -does not discuss how to handle sign or handle the dividend's decreasing magnitude in the main loop (\textit{Step \#3}). - -As well as the numerous practical oversights both of the texts do not discuss several key optimal algorithms required -such as ``Comba'' and Karatsuba multipliers and fast modular inversion. These optimal algorithms are vital to achieve -any form of useful performance in non-trivial applications. - -To solve this problem the focus of this text is on the practical aspects of implementing the algorithms that -constitute a multiple precision integer package with light discussions on the theoretical aspects. As a case -study the ``LibTomMath''\footnote{Available freely at http://math.libtomcrypt.org} package is used to demonstrate -algorithms with implementations that have been field tested and work very well. - -\section{Discussion and Notation} -\subsection{Notation} -A multiple precision integer of $n$-digits shall be denoted as $x = (x_n ... x_1 x_0)_{ \beta }$ to be the -multiple precision notation for the integer $x \equiv \sum_{i=0}^{n} x_i\beta^i$. The elements of the array $x$ are -said to be the radix $\beta$ digits of the integer. For example, $x = (1,2,3)_{10}$ would represent the -integer $1\cdot 10^2 + 2\cdot10^1 + 3\cdot10^0 = 123$. - -A ``mp\_int'' shall refer to a composite structure which contains the digits of the integer as well as auxilary data -required to manipulate the data. These additional members are discussed in ~BASICOP~. For the purposes of this text -a ``multiple precision integer'' and a ``mp\_int'' are synonymous. - -\index{single-precision} \index{double-precision} \index{mp\_digit} \index{mp\_word} -For the purposes of this text a single-precision variable must be able to represent integers in the range $0 \le x < 2 \beta$ while -a double-precision variable must be able to represent integers in the range $0 \le x < 2 \beta^2$. Within the source code that will be -presented the data type \textbf{mp\_digit} will represent a single-precision type while \textbf{mp\_word} will represent a -double-precision type. In several algorithms (\textit{notably the Comba routines}) temporary results -will be stored in a double-precision arrays. For the purposes of this text $x_j$ will refer to the -$j$'th digit of a single-precision array and $\hat x_j$ will refer to the $j$'th digit of a double-precision -array. - -The $\lfloor \mbox{ } \rfloor$ brackets represent a value truncated and rounded down to the nearest integer. The $\lceil \mbox{ } \rceil$ brackets -represent a value truncated and rounded up to the nearest integer. Typically when the $/$ division symbol is used the intention is to perform an integer -division. For example, $5/2 = 2$ which will often be written as $\lfloor 5/2 \rfloor = 2$ for clarity. When a value is presented as a fraction -such as $5 \over 2$ a real value division is implied. - -\subsection{Work Effort} -\index{big-O} -To measure the efficiency of various algorithms a modified big-O notation is used. In this system all -single precision operations are considered to have the same cost\footnote{Except where explicitly noted.}. -That is a single precision addition, multiplication and division are assumed to take the same time to -complete. While this is generally not true in practice it will simplify the discussions considerably. - -Some algorithms have slight advantages over others which is why some constants will not be removed in -the notation. For example, a normal multiplication requires $O(n^2)$ work while a squaring requires -$O({{n^2 + n}\over 2})$ work. In standard big-O notation these would be said to be equivalent. However, in the -context of the this text the magnitude of the inputs will not approach an infinite size. This means the conventional limit -notation wisdom does not apply to the cancellation of constants. - -Throughout the discussions various ``work levels'' will be discussed. These levels are the $O(1)$, -$O(n)$, $O(n^2)$, ..., $O(n^k)$ work efforts. For example, operations at the $O(n^k)$ ``level'' are said to be -executed more frequently than operations at the $O(n^m)$ ``level'' when $k > m$. Obviously most optimizations will pay -off the most at the higher levels since they represent the bulk of the effort required. - -\section{Exercises} -Within the more advanced chapters a section will be set aside to give the reader some challenging exercises. These exercises are not -designed to be prize winning problems, but to be thought provoking. Wherever possible the problems are forward minded stating -problems that will be answered in subsequent chapters. The reader is encouraged to finish the exercises as they appear to get a -better understanding of the subject material. - -Similar to the exercises of \cite{TAOCPV2} as explained on pp.\textit{ix} these exercises are given a scoring system. However, unlike -\cite{TAOCPV2} the problems do not get nearly as hard as often. The scoring of these exercises ranges from one (\textit{the easiest}) to -five (\textit{the hardest}). The following table sumarizes the scoring. - -\vspace{5mm} -\begin{tabular}{cl} -$\left [ 1 \right ]$ & An easy problem that should only take the reader a manner of \\ - & minutes to solve. Usually does not involve much computer time. \\ - & \\ -$\left [ 2 \right ]$ & An easy problem that involves a marginal amount of computer \\ - & time usage. Usually requires a program to be written to \\ - & solve the problem. \\ - & \\ -$\left [ 3 \right ]$ & A moderately hard problem that requires a non-trivial amount \\ - & of work. Usually involves trivial research and development of \\ - & new theory from the perspective of a student. \\ - & \\ -$\left [ 4 \right ]$ & A moderately hard problem that involves a non-trivial amount \\ - & of work and research. The solution to which will demonstrate \\ - & a higher mastery of the subject matter. \\ - & \\ -$\left [ 5 \right ]$ & A hard problem that involves concepts that are non-trivial. \\ - & Solutions to these problems will demonstrate a complete mastery \\ - & of the given subject. \\ - & \\ -\end{tabular} - -Essentially problems at the first level are meant to be simple questions that the reader can answer quickly without programming a solution or -devising new theory. These problems are quick tests to see if the material is understood. Problems at the second level are also -designed to be easy but will require a program or algorithm to be implemented to arrive at the answer. - -Problems at the third level are meant to be a bit more difficult. Often the answer is fairly obvious but arriving at an exacting solution -requires some thought and skill. These problems will almost always involve devising a new algorithm or implementing a variation of -another algorithm. - -Problems at the fourth level are meant to be even more difficult as well as involve some research. The reader will most likely not know -the answer right away nor will this text provide the exact details of the answer (\textit{or at least not until a subsequent chapter}). Problems -at the fifth level are meant to be the hardest problems relative to all the other problems in the chapter. People who can correctly -answer fifth level problems have a mastery of the subject matter at hand. - -Often problems will be tied together. The purpose of this is to start a chain of thought that will be discussed in future chapters. The reader -is encouraged to answer the follow-up problems and try to draw the relevence of problems. - -\chapter{Introduction to LibTomMath} - -\section{What is LibTomMath?} -LibTomMath is a free and open source multiple precision library written in portable ISO C source code. By portable it is -meant that the library does not contain any code that is computer platform dependent or otherwise problematic to use on any -given platform. The library has been successfully tested under numerous operating systems including Solaris, MacOS, Windows, -Linux, PalmOS and on standalone hardware such as the Gameboy Advance. The library is designed to contain enough -functionality to be able to develop applications such as public key cryptosystems. - -\section{Goals of LibTomMath} - -Even though the library is written entirely in portable ISO C considerable care has been taken to -optimize the algorithm implementations within the library. Specifically the code has been written to work well with -the GNU C Compiler (\textit{GCC}) on both x86 and ARMv4 processors. Wherever possible highly efficient -algorithms (\textit{such as Karatsuba multiplication, sliding window exponentiation and Montgomery reduction}) have -been provided to make the library as efficient as possible. Even with the optimal and sometimes specialized -algorithms that have been included the Application Programing Interface (\textit{API}) has been kept as simple as possible. -Often generic place holder routines will make use of specialized algorithms automatically without the developer's -attention. One such example is the generic multiplication algorithm \textbf{mp\_mul()} which will automatically use -Karatsuba multiplication if the inputs are of a specific size. - -Making LibTomMath as efficient as possible is not the only goal of the LibTomMath project. Ideally the library should -be source compatible with another popular library which makes it more attractive for developers to use. In this case the -MPI library was used as a API template for all the basic functions. - -The project is also meant to act as a learning tool for students. The logic being that no easy-to-follow ``bignum'' -library exists which can be used to teach computer science students how to perform fast and reliable multiple precision -arithmetic. To this end the source code has been given quite a few comments and algorithm discussion points. Often routines have -more comments than lines of code. - -\section{Choice of LibTomMath} -LibTomMath was chosen as the case study of this text not only because the author of both projects is one and the same but -for more worthy reasons. Other libraries such as GMP, MPI, LIP and OpenSSL have multiple precision -integer arithmetic routines but would not be ideal for this text for reasons as will be explained in the -following sub-sections. - -\subsection{Code Base} -The LibTomMath code base is all portable ISO C source code. This means that there are no platform dependent conditional -segments of code littered throughout the source. This clean and uncluttered approach to the library means that a -developer can more readily ascertain the true intent of a given section of source code without trying to keep track of -what conditional code will be used. - -The code base of LibTomMath is also well organized. Each function is in its own separate source code file -which allows the reader to find a given function very fast. When compiled with GCC for the x86 processor the entire -library is a mere 87,760 bytes (\textit{$116,182$ bytes for ARMv4 processors}). This includes every single function -LibTomMath provides from basic arithmetic to various number theoretic functions such as modular exponentiation, various -reduction algorithms and Jacobi symbol computation. - -By comparison MPI which has fewer functions than LibTomMath compiled with the same conditions is 45,429 bytes -(\textit{$54,536$ for ARMv4}). GMP which has rather large collection of functions with the default configuration on an -x86 Athlon is 2,950,688 bytes. Note that while LibTomMath has fewer functions than GMP it has been used as the sole basis -for several public key cryptosystems without having to seek additional outside functions to supplement the library. - -\subsection{API Simplicity} -LibTomMath is designed after the MPI library and shares the API design. Quite often programs that use MPI will build -with LibTomMath without change. The function names are relatively straight forward as to what they perform. Almost all of the -functions except for a few minor exceptions which as will be discussed are for good reasons share the same parameter passing -convention. The learning curve is fairly shallow with the API provided which is an extremely valuable benefit for the -student and developer alike. - -The LIP library is an example of a library with an API that is awkward to work with. LIP uses function names that are often ``compressed'' to -illegible short hand. LibTomMath does not share this fault. - -\subsection{Optimizations} -While LibTomMath is certainly not the fastest library (\textit{GMP often beats LibTomMath by a factor of two}) it does -feature a set of optimal algorithms for tasks ranging from modular reduction to squaring. GMP and LIP also feature -such optimizations while MPI only uses baseline algorithms with no optimizations. - -LibTomMath is almost always an order of magnitude faster than the MPI library at computationally expensive tasks such as modular -exponentiation. In the grand scheme of ``bignum'' libraries LibTomMath is faster than the average library and usually -slower than the best libraries such as GMP and OpenSSL by a small factor. - -\subsection{Portability and Stability} -LibTomMath will build ``out of the box'' on any platform equipped with a modern version of the GNU C Compiler -(\textit{GCC}). This means that without changes the library will build without configuration or setting up any -variables. LIP and MPI will build ``out of the box'' as well but have numerous known bugs. Most notably the author of -MPI is not working on his library anymore. - -GMP requires a configuration script to run and will not build out of the box. GMP and LibTomMath are still in active -development and are very stable across a variety of platforms. - -\subsection{Choice} -LibTomMath is a relatively compact, well documented, highly optimized and portable library which seems only natural for -the case study of this text. Various source files from the LibTomMath project will be included within the text. However, the -reader is encouraged to download their own copy of the library to actually be able to work with the library. - -\chapter{Getting Started} -MARK,BASICOP -\section{Library Basics} -To begin the design of a multiple precision integer library a primitive data type and a series of primitive algorithms must be established. A data -type that will hold the information required to maintain a multiple precision integer must be designed. With this basic data type of a series -of low level algorithms for initializing, clearing, growing and optimizing multiple precision integers can be developed to form the basis of -the entire library of algorithms. - -\section{What is a Multiple Precision Integer?} -Recall that most programming languages (\textit{in particular C}) only have fixed precision data types that on their own cannot be used -to represent values larger than their precision alone will allow. The purpose of multiple precision algorithms is to use these fixed precision -data types to create multiple precision integers which may represent values that are much larger. - -As a well known analogy, school children are taught how to form numbers larger than nine by prepending more radix ten digits. In the decimal system -the largest value is only $9$ since the digits may only have values from $0$ to $9$. However, by concatenating digits together larger numbers -may be represented. Computer based multiple precision arithmetic is essentially the same concept except with a different radix. - -What most people probably do not think about explicitly are the various other attributes that describe a multiple precision integer. For example, -the integer $154_{10}$ has two immediately obvious properties. First, the integer is positive, that is the sign of this particular integer -is positive as oppose to negative. Second, the integer has three digits in its representation. There is an additional property that the integer -posesses that does not concern pencil-and-paper arithmetic. The third property is how many digits are allowed for the integer. - -The human analogy of this third property is ensuring there is enough space on the paper to right the integer. Computers must maintain a -strict control on memory usage with respect to the digits of a multiple precision integer. These three properties make up what is known -as a multiple precision integer or mp\_int for short. - -\subsection{The mp\_int structure} -The mp\_int structure is the ISO C based manifestation of what represents a multiple precision integer. The ISO C standard does not provide for -any such data type but it does provide for making composite data types known as structures. The following is the structure definition -used within LibTomMath. - -\index{mp\_int} -\begin{verbatim} -typedef struct { - int used, alloc, sign; - mp_digit *dp; -} mp_int; -\end{verbatim} - -The mp\_int structure can be broken down as follows. - -\begin{enumerate} -\item The \textbf{used} parameter denotes how many digits of the array \textbf{dp} contain the digits used to represent -a given integer. The \textbf{used} count must not exceed the \textbf{alloc} count. - -\item The array \textbf{dp} holds the digits that represent the given integer. It is padded with $\textbf{alloc} - \textbf{used}$ zero -digits. - -\item The \textbf{alloc} parameter denotes how -many digits are available in the array to use by functions before it has to increase in size. When the \textbf{used} count -of a result would exceed the \textbf{alloc} count all of the algorithms will automatically increase the size of the -array to accommodate the precision of the result. - -\item The \textbf{sign} parameter denotes the sign as either zero/positive (\textbf{MP\_ZPOS}) or negative (\textbf{MP\_NEG}). -\end{enumerate} - -\section{Argument Passing} -A convention of argument passing must be adopted early on in the development of any library. Making the function prototypes -consistent will help eliminate many headaches in the future as the library grows to significant complexity. In LibTomMath the multiple precision -integer functions accept parameters from left to right as pointers to mp\_int structures. That means that the source operands are -placed on the left and the destination on the right. Consider the following examples. - -\begin{verbatim} - mp_mul(&a, &b, &c); /* c = a * b */ - mp_add(&a, &b, &a); /* a = a + b */ - mp_sqr(&a, &b); /* b = a * a */ -\end{verbatim} - -The left to right order is a fairly natural way to implement the functions since it lets the developer read aloud the -functions and make sense of them. For example, the first function would read ``multiply a and b and store in c''. - -Certain libraries (\textit{LIP by Lenstra for instance}) accept parameters the other way around. That is the destination -on the left and arguments on the right. In truth it is entirely a matter of preference. In the case of LibTomMath the -convention from the MPI library has been adopted. - -Another very useful design consideration is whether to allow argument sources to also be a destination. For example, the -second example (\textit{mp\_add}) adds $a$ to $b$ and stores in $a$. This is an important feature to implement since it -allows the higher up functions to cut down on the number of variables. However, to implement this feature specific -care has to be given to ensure the destination is not modified before the source is fully read. - -\section{Return Values} -A well implemented library, no matter what its purpose, should trap as many runtime errors as possible and return them to the -caller. By catching runtime errors a library can be guaranteed to prevent undefined behaviour. In a multiple precision -library the only errors that can occur occur are related to inappropriate inputs (\textit{division by zero for instance}) or -memory allocation errors. - -In LibTomMath any function that can cause a runtime error will return an error as an \textbf{int} data type with one of the -following values. - -\index{MP\_OKAY} \index{MP\_VAL} \index{MP\_MEM} -\begin{center} -\begin{tabular}{|l|l|} -\hline \textbf{Value} & \textbf{Meaning} \\ -\hline \textbf{MP\_OKAY} & The function was successful \\ -\hline \textbf{MP\_VAL} & One of the input value(s) was invalid \\ -\hline \textbf{MP\_MEM} & The function ran out of heap memory \\ -\hline -\end{tabular} -\end{center} - -When an error is detected within a function it should free any memory it allocated and return as soon as possible. The goal -is to leave the system in the same state the system was when the function was called. Error checking with this style of API is fairly simple. - -\begin{verbatim} - int err; - if ((err = mp_add(&a, &b, &c)) != MP_OKAY) { - printf("Error: %d\n", err); - exit(EXIT_FAILURE); - } -\end{verbatim} - -The GMP library uses C style \textit{signals} to flag errors which is of questionable use. Not all errors are fatal -and it was not deemed ideal by the author of LibTomMath to force developers to have signal handlers for such cases. - -\section{Initialization and Clearing} -The logical starting point when actually writing multiple precision integer functions is the initialization and -clearing of the integers. These two functions will be used by far the most throughout the algorithms whenever -temporary integers are required. - -Given the basic mp\_int structure an initialization routine must first allocate memory to hold the digits of -the integer. Often it is optimal to allocate a sufficiently large pre-set number of digits even considering -the initial integer will represent zero. If only a single digit were allocated quite a few re-allocations -would occur for the majority of inputs. There is a tradeoff between how many default digits to allocate -and how many re-allocations are tolerable. - -If the memory for the digits has been successfully allocated then the rest of the members of the structure must -be initialized. Since the initial state is to represent a zero integer the digits allocated must all be zeroed. The -\textbf{used} count set to zero and \textbf{sign} set to \textbf{MP\_ZPOS}. - -\subsection{Initializing an mp\_int} -To initialize an mp\_int the mp\_init algorithm shall be used. The purpose of this algorithm is to allocate -the memory required and initialize the integer to a default representation of zero. - -\begin{figure}[here] -\begin{center} -\begin{tabular}{l} -\hline Algorithm \textbf{mp\_init}. \\ -\textbf{Input}. An mp\_int $a$ \\ -\textbf{Output}. Allocate memory for the digits and set to a zero state. \\ -\hline \\ -1. Allocate memory for \textbf{MP\_PREC} digits. \\ -2. If the allocation failed then return(\textit{MP\_MEM}) \\ -3. for $n$ from $0$ to $MP\_PREC - 1$ do \\ -\hspace{3mm}3.1 $a_n \leftarrow 0$\\ -4. $a.sign \leftarrow MP\_ZPOS$\\ -5. $a.used \leftarrow 0$\\ -6. $a.alloc \leftarrow MP\_PREC$\\ -7. Return(\textit{MP\_OKAY})\\ -\hline -\end{tabular} -\end{center} -\caption{Algorithm mp\_init} -\end{figure} - -\textbf{Algorithm mp\_init.} -The \textbf{MP\_PREC} variable is a simple constant used to dictate minimal precision of allocated integers. It is ideally at least equal to $32$ but -can be any reasonable power of two. Steps one and two allocate the memory and account for it. If the allocation fails the algorithm returns -immediately to signal the failure. Step three will ensure that all the digits are in the default state of zero. Finally steps -four through six set the default settings of the \textbf{sign}, \textbf{used} and \textbf{alloc} members of the mp\_int structure. - -EXAM,bn_mp_init.c - -The \textbf{OPT\_CAST} type cast on line @22,OPT_CAST@ is designed to allow C++ compilers to build the code out of -the box. Microsoft C V5.00 is known to cause problems without the cast. Also note that if the memory -allocation fails the other members of the mp\_int will be in an undefined state. The code from -line @29,a->used@ to line @31,a->sign@ sets the default state for a mp\_int which is zero, positive and no used digits. - -\subsection{Clearing an mp\_int} -When an mp\_int is no longer required the memory allocated for it can be cleared from the heap with -the mp\_clear algorithm. - -\begin{figure}[here] -\begin{center} -\begin{tabular}{l} -\hline Algorithm \textbf{mp\_clear}. \\ -\textbf{Input}. An mp\_int $a$ \\ -\textbf{Output}. The memory for $a$ is cleared. \\ -\hline \\ -1. If $a$ has been previously freed then return(\textit{MP\_OKAY}). \\ -2. Free the digits of $a$ and mark $a$ as freed. \\ -3. $a.used \leftarrow 0$ \\ -4. $a.alloc \leftarrow 0$ \\ -5. Return(\textit{MP\_OKAY}). \\ -\hline -\end{tabular} -\end{center} -\caption{Algorithm mp\_clear} -\end{figure} - -\textbf{Algorithm mp\_clear.} -In steps one and two the memory for the digits are only free'd if they had not been previously released before. -This is more of concern for the implementation since it is used to prevent ``double-free'' errors. It also helps catch -code errors where mp\_ints are used after being cleared. Similarly steps three and four set the -\textbf{used} and \textbf{alloc} to known values which would be easy to spot during debugging. For example, if an mp\_int is expected -to be non-zero and its \textbf{used} member is observed to be zero (\textit{due to being cleared}) then an obvious bug in the code has been -spotted. - -EXAM,bn_mp_clear.c - -The \textbf{if} statement on line @21,a->dp != NULL@ prevents the heap from being corrupted if a user double-frees an -mp\_int. For example, a trivial case of this bug would be as follows. - -\begin{verbatim} -mp_int a; -mp_init(&a); -mp_clear(&a); -mp_clear(&a); -\end{verbatim} - -Without that check the code would try to free the memory allocated for the digits twice which will cause most standard C -libraries to cause a fault. Also by setting the pointer to \textbf{NULL} it helps debug code that may inadvertently -free the mp\_int before it is truly not needed. The allocated digits are set to zero before being freed on line @24,memset@. -This is ideal for cryptographic situations where the mp\_int is a secret parameter. - -The following snippet is an example of using both the init and clear functions. - -\begin{small} -\begin{verbatim} -#include -#include -#include -int main(void) -{ - mp_int num; - int err; - - /* init the bignum */ - if ((err = mp_init(&num)) != MP_OKAY) { - printf("Error: %d\n", err); - return EXIT_FAILURE; - } - - /* do work with it ... */ - - /* clear up */ - mp_clear(&num); - - return EXIT_SUCCESS; -} -\end{verbatim} -\end{small} - -\section{Other Initialization Routines} - -It is often helpful to have specialized initialization algorithms to simplify the design of other algorithms. For example, an -initialization followed by a copy is a common operation when temporary copies of integers are required. It is quite -beneficial to have a series of simple helper functions available. - -\subsection{Initializing Variable Sized mp\_int Structures} -Occasionally the number of digits required will be known in advance of an initialization. In these -cases the mp\_init\_size algorithm can be of use. The purpose of this algorithm is similar to mp\_init except that -it will allocate \textit{at least} a specified number of digits. This is ideal to prevent re-allocations when the -input size is known. - -\newpage\begin{figure}[here] -\begin{center} -\begin{tabular}{l} -\hline Algorithm \textbf{mp\_init\_size}. \\ -\textbf{Input}. An mp\_int $a$ and the requested number of digits $b$\\ -\textbf{Output}. $a$ is initialized to hold at least $b$ digits. \\ -\hline \\ -1. $u \leftarrow b\mbox{ (mod }MP\_PREC\mbox{)}$ \\ -2. $v \leftarrow b + 2 \cdot MP\_PREC - u$ \\ -3. Allocate $v$ digits. \\ -4. If the allocation failed then return(\textit{MP\_MEM}). \\ -5. for $n$ from $0$ to $v - 1$ do \\ -\hspace{3mm}5.1 $a_n \leftarrow 0$ \\ -6. $a.sign \leftarrow MP\_ZPOS$\\ -7. $a.used \leftarrow 0$\\ -8. $a.alloc \leftarrow v$\\ -9. Return(\textit{MP\_OKAY})\\ -\hline -\end{tabular} -\end{center} -\caption{Algorithm mp\_init\_size} -\end{figure} - -\textbf{Algorithm mp\_init\_size.} -The value of $v$ is calculated to be at least the requested amount of digits $b$ plus additional padding. The padding is calculated -to be at least \textbf{MP\_PREC} digits plus enough digits to make the digit count a multiple of \textbf{MP\_PREC}. This padding is used to -prevent trivial allocations from becoming a bottleneck in the rest of the algorithms that depend on this. - -EXAM,bn_mp_init_size.c - -Line @23,MP_PREC@ will ensure that the number of digits actually allocated is padded up to the next multiple of -\textbf{MP\_PREC} plus an additional \textbf{MP\_PREC}. This ensures that the number of allocated digit is -always greater than the amount requested. As a result it prevents many trivial memory allocations. The value of -\textbf{MP\_PREC} is defined in ``tommath.h'' and must be a power of two. - -\subsection{Creating a Clone} -Another common sequence of operations is to make a local temporary copy of an argument. To initialize then copy a mp\_int will be known as -creating a clone. This is useful within functions that need to modify an integer argument but do not wish to actually modify the original copy. -The mp\_init\_copy algorithm will perform this very task. - -\begin{figure}[here] -\begin{center} -\begin{tabular}{l} -\hline Algorithm \textbf{mp\_init\_copy}. \\ -\textbf{Input}. An mp\_int $a$ and $b$\\ -\textbf{Output}. $a$ is initialized to be a copy of $b$. \\ -\hline \\ -1. Init $a$. (\textit{mp\_init}) \\ -2. If the init of $a$ was unsuccessful return(\textit{MP\_MEM}) \\ -3. Copy $b$ to $a$. (\textit{mp\_copy}) \\ -4. Return the status of the copy operation. \\ -\hline -\end{tabular} -\end{center} -\caption{Algorithm mp\_init\_copy} -\end{figure} - -\textbf{Algorithm mp\_init\_copy.} -This algorithm will initialize a mp\_int variable and copy another previously initialized mp\_int variable into it. The algorithm will -detect when the initialization fails and returns the error to the calling algorithm. As such this algorithm will perform two operations -in one step. - -EXAM,bn_mp_init_copy.c - -This will initialize \textbf{a} and make it a verbatim copy of the contents of \textbf{b}. Note that -\textbf{a} will have its own memory allocated which means that \textbf{b} may be cleared after the call -and \textbf{a} will be left intact. - -\subsection{Multiple Integer Initializations And Clearings} -Occasionally a function will require a series of mp\_int data types to be made available. The mp\_init\_multi algorithm -is provided to simplify such cases. The purpose of this algorithm is to initialize a variable length array of mp\_int -structures at once. As a result algorithms that require multiple integers only has to use -one algorithm to initialize all the mp\_int variables. - -\begin{figure}[here] -\begin{center} -\begin{tabular}{l} -\hline Algorithm \textbf{mp\_init\_multi}. \\ -\textbf{Input}. Variable length array of mp\_int variables of length $k$. \\ -\textbf{Output}. The array is initialized such that each each mp\_int is ready to use. \\ -\hline \\ -1. for $n$ from 0 to $k - 1$ do \\ -\hspace{+3mm}1.1. Initialize the $n$'th mp\_int (\textit{mp\_init}) \\ -\hspace{+3mm}1.2. If initialization failed then do \\ -\hspace{+6mm}1.2.1. for $j$ from $0$ to $n$ do \\ -\hspace{+9mm}1.2.1.1. Free the $j$'th mp\_int (\textit{mp\_clear}) \\ -\hspace{+6mm}1.2.2. Return(\textit{MP\_MEM}) \\ -2. Return(\textit{MP\_OKAY}) \\ -\hline -\end{tabular} -\end{center} -\caption{Algorithm mp\_init\_multi} -\end{figure} - -\textbf{Algorithm mp\_init\_multi.} -The algorithm will initialize the array of mp\_int variables one at a time. As soon as an runtime error is detected (\textit{step 1.2}) all of -the previously initialized variables are cleared. The goal is an ``all or nothing'' initialization which allows for quick recovery from runtime -errors. - -Similarly to clear a variable length array of mp\_int structures the mp\_clear\_multi algorithm will be used. - -Consider the following snippet which demonstrates how to use both routines. -\begin{small} -\begin{verbatim} -#include -#include -#include -int main(void) -{ - mp_int num1, num2, num3; - int err; - - if ((err = mp_init_multi(&num1, &num2, &num3, NULL)) !- MP_OKAY) { - printf("Error: %d\n", err); - return EXIT_FAILURE; - } - - /* at this point num1/num2/num3 are ready */ - - /* free them */ - mp_clear_multi(&num1, &num2, &num3, NULL); - - return EXIT_SUCCESS; -} -\end{verbatim} -\end{small} - -Note how both lists are terminated with the \textbf{NULL} variable. This indicates to the algorithms to stop fetching parameters off -of the stack. If it is not present the functions will most likely cause a segmentation fault. - -EXAM,bn_mp_multi.c - -Both routines are implemented in the same source file since they are typically used in conjunction with each other. - -\section{Maintenance} -A small useful collection of mp\_int maintenance functions will also prove useful. - -\subsection{Augmenting Integer Precision} -When storing a value in an mp\_int sufficient digits must be available to accomodate the entire value without -loss of precision. Quite often the size of the array given by the \textbf{alloc} member is large enough to simply -increase the \textbf{used} digit count. However, when the size of the array is too small it must be re-sized -appropriately to accomodate the result. The mp\_grow algorithm will provide this functionality. - -\begin{figure}[here] -\begin{center} -\begin{tabular}{l} -\hline Algorithm \textbf{mp\_grow}. \\ -\textbf{Input}. An mp\_int $a$ and an integer $b$. \\ -\textbf{Output}. $a$ is expanded to accomodate $b$ digits. \\ -\hline \\ -1. if $a.alloc \ge b$ then return(\textit{MP\_OKAY}) \\ -2. $u \leftarrow b\mbox{ (mod }MP\_PREC\mbox{)}$ \\ -3. $v \leftarrow b + 2 \cdot MP\_PREC - u$ \\ -4. Re-Allocate the array of digits $a$ to size $v$ \\ -5. If the allocation failed then return(\textit{MP\_MEM}). \\ -6. for n from a.alloc to $v - 1$ do \\ -\hspace{+3mm}6.1 $a_n \leftarrow 0$ \\ -7. $a.alloc \leftarrow v$ \\ -8. Return(\textit{MP\_OKAY}) \\ -\hline -\end{tabular} -\end{center} -\caption{Algorithm mp\_grow} -\end{figure} - -\textbf{Algorithm mp\_grow.} -Step one will prevent a re-allocation from being performed if it was not required. This is useful to prevent mp\_ints -from growing excessively in code that erroneously calls mp\_grow. Similar to mp\_init\_size the requested digit count -is padded to provide more digits than requested. - -In step four it is assumed that the reallocation leaves the lower $a.alloc$ digits intact. This is much akin to how the -\textit{realloc} function from the standard C library works. Since the newly allocated digits are assumed to contain -undefined values they are also initially zeroed. - -EXAM,bn_mp_grow.c - -The first step is to see if we actually need to perform a re-allocation at all. This is tested for on line -@24,a->alloc < size@. Similar to mp\_init\_size the same code on line @26,MP_PREC - 1@ was used to resize the -digits requested. A simple for loop from line @34,a->alloc@ to line @38,}@ will zero all digits that were above the -old \textbf{alloc} limit to make sure the integer is in a known state. - -\subsection{Clamping Excess Digits} -When a function anticipates a result will be $n$ digits it is simpler to assume this is true within the body of -the function. For example, a multiplication of a $i$ digit number by a $j$ digit produces a result of at most -$i + j$ digits. It is entirely possible that the result is $i + j - 1$ though, with no final carry into the last -position. However, suppose the destination had to be first expanded (\textit{via mp\_grow}) to accomodate $i + j - 1$ -digits than further expanded to accomodate the final carry. That would be a considerable waste of time since heap -operations are relatively slow. - -The ideal solution is to always assume the result is $i + j$ and fix up the \textbf{used} count after the function -terminates. This way a single heap operation (\textit{at most}) is required. However, if the result was not checked -there would be an excess high order zero digit. - -For example, suppose the product of two integers was $x_n = (0x_{n-1}x_{n-2}...x_0)_{\beta}$. The leading zero digit -will not contribute to the precision of the result. In fact, through subsequent operations more leading zero digits would -accumulate to the point the size of the integer would be prohibitive. As a result even though the precision is very -low the representation is excessively large. - -The mp\_clamp algorithm is designed to solve this very problem. It will trim leading zeros by decrementing the -\textbf{used} count until a non-zero leading digit is found. Also in this system, zero is considered to be a positive -number which means that if the \textbf{used} count is decremented to zero the sign must be set to \textbf{MP\_ZPOS}. - -\begin{figure}[here] -\begin{center} -\begin{tabular}{l} -\hline Algorithm \textbf{mp\_clamp}. \\ -\textbf{Input}. An mp\_int $a$ \\ -\textbf{Output}. Any excess leading zero digits of $a$ are removed \\ -\hline \\ -1. while $a.used > 0$ and $a_{a.used - 1} = 0$ do \\ -\hspace{+3mm}1.1 $a.used \leftarrow a.used - 1$ \\ -2. if $a.used = 0$ then do \\ -\hspace{+3mm}2.1 $a.sign \leftarrow MP\_ZPOS$ \\ -\hline \\ -\end{tabular} -\end{center} -\caption{Algorithm mp\_clamp} -\end{figure} - -\textbf{Algorithm mp\_clamp.} -As can be expected this algorithm is very simple. The loop on step one is expected to iterate only once or twice at -the most. For example, this will happen in cases where there is not a carry to fill the last position. Step two fixes the sign for -when all of the digits are zero to ensure that the mp\_int is valid at all times. - -EXAM,bn_mp_clamp.c - -Note on line @27,while@ how to test for the \textbf{used} count is made on the left of the \&\& operator. In the C programming -language the terms to \&\& are evaluated left to right with a boolean short-circuit if any condition fails. This is -important since if the \textbf{used} is zero the test on the right would fetch below the array. That is obviously -undesirable. The parenthesis on line @28,a->used@ is used to make sure the \textbf{used} count is decremented and not -the pointer ``a''. - -\section*{Exercises} -\begin{tabular}{cl} -$\left [ 1 \right ]$ & Discuss the relevance of the \textbf{used} member of the mp\_int structure. \\ - & \\ -$\left [ 1 \right ]$ & Discuss the consequences of not using padding when performing allocations. \\ - & \\ -$\left [ 2 \right ]$ & Estimate an ideal value for \textbf{MP\_PREC} when performing 1024-bit RSA \\ - & encryption when $\beta = 2^{28}$. \\ - & \\ -$\left [ 1 \right ]$ & Discuss the relevance of the algorithm mp\_clamp. What does it prevent? \\ - & \\ -$\left [ 1 \right ]$ & Give an example of when the algorithm mp\_init\_copy might be useful. \\ - & \\ -\end{tabular} - - -\chapter{Basic Operations} -\section{Copying an Integer} -After the various house-keeping routines are in place, simple algorithms can be designed to take advantage of them. Being able -to make a verbatim copy of an integer is a very useful function to have. To copy an integer the mp\_copy algorithm will be used. - -\newpage\begin{figure}[here] -\begin{center} -\begin{tabular}{l} -\hline Algorithm \textbf{mp\_copy}. \\ -\textbf{Input}. An mp\_int $a$ and $b$. \\ -\textbf{Output}. Store a copy of $a$ in $b$. \\ -\hline \\ -1. Check if $a$ and $b$ point to the same location in memory. \\ -2. If true then return(\textit{MP\_OKAY}). \\ -3. If $b.alloc < a.used$ then grow $b$ to $a.used$ digits. (\textit{mp\_grow}) \\ -4. If failed to grow then return(\textit{MP\_MEM}). \\ -5. for $n$ from 0 to $a.used - 1$ do \\ -\hspace{3mm}5.1 $b_{n} \leftarrow a_{n}$ \\ -6. if $a.used < b.used - 1$ then \\ -\hspace{3mm}6.1. for $n$ from $a.used$ to $b.used - 1$ do \\ -\hspace{6mm}6.1.1 $b_{n} \leftarrow 0$ \\ -7. $b.used \leftarrow a.used$ \\ -8. $b.sign \leftarrow a.sign$ \\ -9. return(\textit{MP\_OKAY}) \\ -\hline -\end{tabular} -\end{center} -\caption{Algorithm mp\_copy} -\end{figure} - -\textbf{Algorithm mp\_copy.} -Step 1 and 2 make sure that the two mp\_ints are unique. This allows the user to call the copy function with -potentially the same input and not waste time. Step 3 and 4 ensure that the destination is large enough to -hold a copy of the input $a$. Note that the \textbf{used} member of $b$ may be smaller than the \textbf{used} -member of $a$ but a memory re-allocation is only required if the \textbf{alloc} member of $b$ is smaller. This -prevents trivial memory reallocations. - -Step 5 copies the digits from $a$ to $b$ while step 6 ensures that if initially $\vert b \vert > \vert a \vert$, -the more significant digits of $b$ will be zeroed. Finally steps 7 and 8 copies the \textbf{used} and \textbf{sign} members over -which completes the copy operation. - -EXAM,bn_mp_copy.c - -Source lines @23,if dst ==@-@31,}@ do the initial house keeping. That is to see if the input is unique and if so to -make sure there is enough room. If not enough space is available it returns the error and leaves the destination variable -intact. - -The inner loop of the copy operation is contained between lines @34,{@ and @50,}@. Many LibTomMath routines are designed with this source code style -in mind, making aliases to shorten lengthy pointers (\textit{see line @38,->@ and @39,->@}) for rapid use. Also the -use of nested braces creates a simple way to denote various portions of code that reside on various work levels. Here, the copy loop is at the -$O(n)$ level. - -\section{Zeroing an Integer} -Reseting an mp\_int to the default state is a common step in many algorithms. The mp\_zero algorithm will be the algorithm used to -perform this task. - -\begin{figure}[here] -\begin{center} -\begin{tabular}{l} -\hline Algorithm \textbf{mp\_zero}. \\ -\textbf{Input}. An mp\_int $a$ \\ -\textbf{Output}. Zero the contents of $a$ \\ -\hline \\ -1. $a.used \leftarrow 0$ \\ -2. $a.sign \leftarrow$ MP\_ZPOS \\ -3. for $n$ from 0 to $a.alloc - 1$ do \\ -\hspace{3mm}3.1 $a_n \leftarrow 0$ \\ -\hline -\end{tabular} -\end{center} -\caption{Algorithm mp\_zero} -\end{figure} - -\textbf{Algorithm mp\_zero.} -This algorithm simply resets a mp\_int to the default state. - -EXAM,bn_mp_zero.c - -After the function is completed, all of the digits are zeroed, the \textbf{used} count is zeroed and the -\textbf{sign} variable is set to \textbf{MP\_ZPOS}. - -\section{Sign Manipulation} -\subsection{Absolute Value} -With the mp\_int representation of an integer, calculating the absolute value is trivial. The mp\_abs algorithm will compute -the absolute value of an mp\_int. - -\begin{figure}[here] -\begin{center} -\begin{tabular}{l} -\hline Algorithm \textbf{mp\_abs}. \\ -\textbf{Input}. An mp\_int $a$ \\ -\textbf{Output}. Computes $b = \vert a \vert$ \\ -\hline \\ -1. Copy $a$ to $b$. (\textit{mp\_copy}) \\ -2. If the copy failed return(\textit{MP\_MEM}). \\ -3. $b.sign \leftarrow MP\_ZPOS$ \\ -4. Return(\textit{MP\_OKAY}) \\ -\hline -\end{tabular} -\end{center} -\caption{Algorithm mp\_abs} -\end{figure} - -\textbf{Algorithm mp\_abs.} -This algorithm computes the absolute of an mp\_int input. As can be expected the algorithm is very trivial. - -EXAM,bn_mp_abs.c - -\subsection{Integer Negation} -With the mp\_int representation of an integer, calculating the negation is also trivial. The mp\_neg algorithm will compute -the negative of an mp\_int input. - -\newpage\begin{figure}[here] -\begin{center} -\begin{tabular}{l} -\hline Algorithm \textbf{mp\_neg}. \\ -\textbf{Input}. An mp\_int $a$ \\ -\textbf{Output}. Computes $b = -a$ \\ -\hline \\ -1. Copy $a$ to $b$. (\textit{mp\_copy}) \\ -2. If the copy failed return(\textit{MP\_MEM}). \\ -3. If $a.sign = MP\_ZPOS$ then do \\ -\hspace{3mm}3.1 $b.sign = MP\_NEG$. \\ -4. else do \\ -\hspace{3mm}4.1 $b.sign = MP\_ZPOS$. \\ -5. Return(\textit{MP\_OKAY}) \\ -\hline -\end{tabular} -\end{center} -\caption{Algorithm mp\_neg} -\end{figure} - -\textbf{Algorithm mp\_neg.} -This algorithm computes the negation of an input. - -EXAM,bn_mp_neg.c - -\section{Small Constants} -\subsection{Setting Small Constants} -Often a mp\_int must be set to a relatively small value such as $1$ or $2$. For these cases the mp\_set algorithm is useful. - -\newpage\begin{figure} -\begin{center} -\begin{tabular}{l} -\hline Algorithm \textbf{mp\_set}. \\ -\textbf{Input}. An mp\_int $a$ and a digit $b$ \\ -\textbf{Output}. Make $a$ equivalent to $b$ \\ -\hline \\ -1. Zero $a$ (\textit{mp\_zero}). \\ -2. $a_0 \leftarrow b \mbox{ (mod }\beta\mbox{)}$ \\ -3. $a.used \leftarrow \left \lbrace \begin{array}{ll} - 1 & \mbox{if }a_0 > 0 \\ - 0 & \mbox{if }a_0 = 0 - \end{array} \right .$ \\ -\hline -\end{tabular} -\end{center} -\caption{Algorithm mp\_set} -\end{figure} - -\textbf{Algorithm mp\_set.} -This algorithm sets a mp\_int to a small single digit value. Step number 1 ensures that the integer is reset to the default state. The -single digit is set (\textit{modulo $\beta$}) and the \textbf{used} count is adjusted accordingly. - -EXAM,bn_mp_set.c - -Line @21,mp_zero@ calls mp\_zero() to clear the mp\_int and reset the sign. Line @22,MP_MASK@ copies the digit -into the least significant location. Note the usage of a new constant \textbf{MP\_MASK}. This constant is used to quickly -reduce an integer modulo $\beta$. Since $\beta$ is of the form $2^k$ for any suitable $k$ it suffices to perform a binary AND with -$MP\_MASK = 2^k - 1$ to perform the reduction. Finally line @23,a->used@ will set the \textbf{used} member with respect to the -digit actually set. This function will always make the integer positive. - -One important limitation of this function is that it will only set one digit. The size of a digit is not fixed, meaning source that uses -this function should take that into account. Meaning that only trivially small constants can be set using this function. - -\subsection{Setting Large Constants} -To overcome the limitations of the mp\_set algorithm the mp\_set\_int algorithm is provided. It accepts a ``long'' -data type as input and will always treat it as a 32-bit integer. - -\begin{figure}[here] -\begin{center} -\begin{tabular}{l} -\hline Algorithm \textbf{mp\_set\_int}. \\ -\textbf{Input}. An mp\_int $a$ and a ``long'' integer $b$ \\ -\textbf{Output}. Make $a$ equivalent to $b$ \\ -\hline \\ -1. Zero $a$ (\textit{mp\_zero}) \\ -2. for $n$ from 0 to 7 do \\ -\hspace{3mm}2.1 $a \leftarrow a \cdot 16$ (\textit{mp\_mul2d}) \\ -\hspace{3mm}2.2 $u \leftarrow \lfloor b / 2^{4(7 - n)} \rfloor \mbox{ (mod }16\mbox{)}$\\ -\hspace{3mm}2.3 $a_0 \leftarrow a_0 + u$ \\ -\hspace{3mm}2.4 $a.used \leftarrow a.used + 1$ \\ -3. Clamp excess used digits (\textit{mp\_clamp}) \\ -\hline -\end{tabular} -\end{center} -\caption{Algorithm mp\_set\_int} -\end{figure} - -\textbf{Algorithm mp\_set\_int.} -The algorithm performs eight iterations of a simple loop where in each iteration four bits from the source are added to the -mp\_int. Step 2.1 will multiply the current result by sixteen making room for four more bits in the less significant positions. In step 2.2 the -next four bits from the source are extracted and are added to the mp\_int. The \textbf{used} digit count is -incremented to reflect the addition. The \textbf{used} digit counter is incremented since if any of the leading digits were zero the mp\_int would have -zero digits used and the newly added four bits would be ignored. - -Excess zero digits are trimmed in steps 2.1 and 3 by using higher level algorithms mp\_mul2d and mp\_clamp. - -EXAM,bn_mp_set_int.c - -This function sets four bits of the number at a time to handle all practical \textbf{DIGIT\_BIT} sizes. The weird -addition on line @38,a->used@ ensures that the newly added in bits are added to the number of digits. While it may not -seem obvious as to why the digit counter does not grow exceedingly large it is because of the shift on line @27,mp_mul_2d@ -as well as the call to mp\_clamp() on line @40,mp_clamp@. Both functions will clamp excess leading digits which keeps -the number of used digits low. - -\section{Comparisons} -\subsection{Unsigned Comparisions} -Comparing a multiple precision integer is performed with the exact same algorithm used to compare two decimal numbers. For example, -to compare $1,234$ to $1,264$ the digits are extracted by their positions. That is we compare $1 \cdot 10^3 + 2 \cdot 10^2 + 3 \cdot 10^1 + 4 \cdot 10^0$ -to $1 \cdot 10^3 + 2 \cdot 10^2 + 6 \cdot 10^1 + 4 \cdot 10^0$ by comparing single digits at a time starting with the highest magnitude -positions. If any leading digit of one integer is greater than a digit in the same position of another integer then obviously it must be greater. - -The first comparision routine that will be developed is the unsigned magnitude compare which will perform a comparison based on the digits of two -mp\_int variables alone. It will ignore the sign of the two inputs. Such a function is useful when an absolute comparison is required or if the -signs are known to agree in advance. - -To facilitate working with the results of the comparison functions three constants are required. - -\begin{figure}[here] -\begin{center} -\begin{tabular}{|r|l|} -\hline \textbf{Constant} & \textbf{Meaning} \\ -\hline \textbf{MP\_GT} & Greater Than \\ -\hline \textbf{MP\_EQ} & Equal To \\ -\hline \textbf{MP\_LT} & Less Than \\ -\hline -\end{tabular} -\end{center} -\caption{Comparison Return Codes} -\end{figure} - -\begin{figure}[here] -\begin{center} -\begin{tabular}{l} -\hline Algorithm \textbf{mp\_cmp\_mag}. \\ -\textbf{Input}. Two mp\_ints $a$ and $b$. \\ -\textbf{Output}. Unsigned comparison results ($a$ to the left of $b$). \\ -\hline \\ -1. If $a.used > b.used$ then return(\textit{MP\_GT}) \\ -2. If $a.used < b.used$ then return(\textit{MP\_LT}) \\ -3. for n from $a.used - 1$ to 0 do \\ -\hspace{+3mm}3.1 if $a_n > b_n$ then return(\textit{MP\_GT}) \\ -\hspace{+3mm}3.2 if $a_n < b_n$ then return(\textit{MP\_LT}) \\ -4. Return(\textit{MP\_EQ}) \\ -\hline -\end{tabular} -\end{center} -\caption{Algorithm mp\_cmp\_mag} -\end{figure} - -\textbf{Algorithm mp\_cmp\_mag.} -By saying ``$a$ to the left of $b$'' it is meant that the comparison is with respect to $a$, that is if $a$ is greater than $b$ it will return -\textbf{MP\_GT} and similar with respect to when $a = b$ and $a < b$. The first two steps compare the number of digits used in both $a$ and $b$. -Obviously if the digit counts differ there would be an imaginary zero digit in the smaller number where the leading digit of the larger number is. -If both have the same number of digits than the actual digits themselves must be compared starting at the leading digit. - -By step three both inputs must have the same number of digits so its safe to start from either $a.used - 1$ or $b.used - 1$ and count down to -the zero'th digit. If after all of the digits have been compared, no difference is found, the algorithm returns \textbf{MP\_EQ}. - -EXAM,bn_mp_cmp_mag.c - -The two if statements on lines @24,if@ and @28,if@ compare the number of digits in the two inputs. These two are performed before all of the digits -are compared since it is a very cheap test to perform and can potentially save considerable time. The implementation given is also not valid -without those two statements. $b.alloc$ may be smaller than $a.used$, meaning that undefined values will be read from $b$ past the end of the -array of digits. - -\subsection{Signed Comparisons} -Comparing with sign considerations is also fairly critical in several routines (\textit{division for example}). Based on an unsigned magnitude -comparison a trivial signed comparison algorithm can be written. - -\begin{figure}[here] -\begin{center} -\begin{tabular}{l} -\hline Algorithm \textbf{mp\_cmp}. \\ -\textbf{Input}. Two mp\_ints $a$ and $b$ \\ -\textbf{Output}. Signed Comparison Results ($a$ to the left of $b$) \\ -\hline \\ -1. if $a.sign = MP\_NEG$ and $b.sign = MP\_ZPOS$ then return(\textit{MP\_LT}) \\ -2. if $a.sign = MP\_ZPOS$ and $b.sign = MP\_NEG$ then return(\textit{MP\_GT}) \\ -3. if $a.sign = MP\_NEG$ then \\ -\hspace{+3mm}3.1 Return the unsigned comparison of $b$ and $a$ (\textit{mp\_cmp\_mag}) \\ -4 Otherwise \\ -\hspace{+3mm}4.1 Return the unsigned comparison of $a$ and $b$ \\ -\hline -\end{tabular} -\end{center} -\caption{Algorithm mp\_cmp} -\end{figure} - -\textbf{Algorithm mp\_cmp.} -The first two steps compare the signs of the two inputs. If the signs do not agree then it can return right away with the appropriate -comparison code. When the signs are equal the digits of the inputs must be compared to determine the correct result. In step -three the unsigned comparision flips the order of the arguments since they are both negative. For instance, if $-a > -b$ then -$\vert a \vert < \vert b \vert$. Step number four will compare the two when they are both positive. - -EXAM,bn_mp_cmp.c - -The two if statements on lines @22,if@ and @26,if@ perform the initial sign comparison. If the signs are not the equal then which ever -has the positive sign is larger. At line @30,if@, the inputs are compared based on magnitudes. If the signs were both negative then -the unsigned comparison is performed in the opposite direction (\textit{line @31,mp_cmp_mag@}). Otherwise, the signs are assumed to -be both positive and a forward direction unsigned comparison is performed. - -\section*{Exercises} -\begin{tabular}{cl} -$\left [ 2 \right ]$ & Modify algorithm mp\_set\_int to accept as input a variable length array of bits. \\ - & \\ -$\left [ 3 \right ]$ & Give the probability that algorithm mp\_cmp\_mag will have to compare $k$ digits \\ - & of two random digits (of equal magnitude) before a difference is found. \\ - & \\ -$\left [ 1 \right ]$ & Suggest a simple method to speed up the implementation of mp\_cmp\_mag based \\ - & on the observations made in the previous problem. \\ - & -\end{tabular} - -\chapter{Basic Arithmetic} -\section{Building Blocks} -At this point algorithms for initialization, clearing, zeroing, copying, comparing and setting small constants have been -established. The next logical set of algorithms to develop are addition, subtraction and digit shifting algorithms. These -algorithms make use of the lower level algorithms and are the cruicial building block for the multiplication algorithms. It is very important -that these algorithms are highly optimized. On their own they are simple $O(n)$ algorithms but they can be called from higher level algorithms -which easily places them at $O(n^2)$ or even $O(n^3)$ work levels. - -MARK,SHIFTS -All nine algorithms within this chapter make use of the logical bit shift operations denoted by $<<$ and $>>$ for left and right -logical shifts respectively. A logical shift is analogous to sliding the decimal point of radix-10 representations. For example, the real -number $0.9345$ is equivalent to $93.45\%$ which is found by sliding the the decimal two places to the right (\textit{multiplying by $10^2$}). -Mathematically a logical shift is equivalent to a division or multiplication by a power of two. -For example, $a << k = a \cdot 2^k$ while $a >> k = \lfloor a/2^k \rfloor$. - -One significant difference between a logical shift and the way decimals are shifted is that digits below the zero'th position are removed -from the number. For example, consider $1101_2 >> 1$ using decimal notation this would produce $110.1_2$. However, with a logical shift the -result is $110_2$. - -\section{Addition and Subtraction} -In normal fixed precision arithmetic negative numbers are easily represented by subtraction from the modulus. For example, with 32-bit integers -$a - b\mbox{ (mod }2^{32}\mbox{)}$ is the same as $a + (2^{32} - b) \mbox{ (mod }2^{32}\mbox{)}$ since $2^{32} \equiv 0 \mbox{ (mod }2^{32}\mbox{)}$. -As a result subtraction can be performed with a trivial series of logical operations and an addition. - -However, in multiple precision arithmetic negative numbers are not represented in the same way. Instead a sign flag is used to keep track of the -sign of the integer. As a result signed addition and subtraction are actually implemented as conditional usage of lower level addition or -subtraction algorithms with the sign fixed up appropriately. - -The lower level algorithms will add or subtract integers without regard to the sign flag. That is they will add or subtract the magnitude of -the integers respectively. - -\subsection{Low Level Addition} -An unsigned addition of multiple precision integers is performed with the same long-hand algorithm used to add decimal numbers. That is to add the -trailing digits first and propagate the resulting carry upwards. Since this is a lower level algorithm the name will have a ``s\_'' prefix. -Historically that convention stems from the MPI library where ``s\_'' stood for static functions that were hidden from the developer entirely. - -\newpage -\begin{figure}[!here] -\begin{center} -\begin{small} -\begin{tabular}{l} -\hline Algorithm \textbf{s\_mp\_add}. \\ -\textbf{Input}. Two mp\_ints $a$ and $b$ \\ -\textbf{Output}. The unsigned addition $c = \vert a \vert + \vert b \vert$. \\ -\hline \\ -1. if $a.used > b.used$ then \\ -\hspace{+3mm}1.1 $min \leftarrow b.used$ \\ -\hspace{+3mm}1.2 $max \leftarrow a.used$ \\ -\hspace{+3mm}1.3 $x \leftarrow a$ \\ -2. else \\ -\hspace{+3mm}2.1 $min \leftarrow a.used$ \\ -\hspace{+3mm}2.2 $max \leftarrow b.used$ \\ -\hspace{+3mm}2.3 $x \leftarrow b$ \\ -3. If $c.alloc < max + 1$ then grow $c$ to hold at least $max + 1$ digits (\textit{mp\_grow}) \\ -4. If failed to grow $c$ return(\textit{MP\_MEM}) \\ -5. $oldused \leftarrow c.used$ \\ -6. $c.used \leftarrow max + 1$ \\ -7. $u \leftarrow 0$ \\ -8. for $n$ from $0$ to $min - 1$ do \\ -\hspace{+3mm}8.1 $c_n \leftarrow a_n + b_n + u$ \\ -\hspace{+3mm}8.2 $u \leftarrow c_n >> lg(\beta)$ \\ -\hspace{+3mm}8.3 $c_n \leftarrow c_n \mbox{ (mod }\beta\mbox{)}$ \\ -9. if $min \ne max$ then do \\ -\hspace{+3mm}9.1 for $n$ from $min$ to $max - 1$ do \\ -\hspace{+6mm}9.1.1 $c_n \leftarrow x_n + u$ \\ -\hspace{+6mm}9.1.2 $u \leftarrow c_n >> lg(\beta)$ \\ -\hspace{+6mm}9.1.3 $c_n \leftarrow c_n \mbox{ (mod }\beta\mbox{)}$ \\ -10. $c_{max} \leftarrow u$ \\ -11. if $olduse > max$ then \\ -\hspace{+3mm}11.1 for $n$ from $max + 1$ to $olduse - 1$ do \\ -\hspace{+6mm}11.1.1 $c_n \leftarrow 0$ \\ -12. Clamp excess digits in $c$. (\textit{mp\_clamp}) \\ -13. Return(\textit{MP\_OKAY}) \\ -\hline -\end{tabular} -\end{small} -\end{center} -\caption{Algorithm s\_mp\_add} -\end{figure} - -\textbf{Algorithm s\_mp\_add.} -This algorithm is loosely based on algorithm 14.7 of HAC \cite[pp. 594]{HAC} but has been extended to allow the inputs to have different magnitudes. -Coincidentally the description of algorithm A in Knuth \cite[pp. 266]{TAOCPV2} shares the same deficiency as the algorithm from \cite{HAC}. Even the -MIX pseudo machine code presented by Knuth \cite[pp. 266-267]{TAOCPV2} is incapable of handling inputs which are of different magnitudes. - -Steps 1 and 2 will sort the two inputs based on their \textbf{used} digit count. This allows the inputs to have varying magnitudes which not -only makes it more efficient than the trivial algorithm presented in the references but more flexible. The variable $min$ is given the lowest -digit count while $max$ is given the highest digit count. If both inputs have the same \textbf{used} digit count both $min$ and $max$ are -set to the same value. The variable $x$ is an \textit{alias} for the largest input and not meant to be a copy of it. After the inputs are sorted, -steps 3 and 4 will ensure that the destination $c$ can accommodate the result. The old \textbf{used} count from $c$ is copied to -$oldused$ so that excess digits can be cleared later, and the new \textbf{used} count is set to $max+1$, so that a carry from the most significant -word can be handled. - -At step 7 the carry variable $u$ is set to zero and the first part of the addition loop can begin. The first step of the loop (\textit{8.1}) adds -digits from the two inputs together along with the carry variable $u$. The following step extracts the carry bit by shifting the result of the -preceding step right by $lg(\beta)$ positions. The shift to extract the carry is similar to how carry extraction works with decimal addition. - -Consider adding $77$ to $65$, the first addition of the first column is $7 + 5$ which produces the result $12$. The trailing digit of the result -is $2 \equiv 12 \mbox{ (mod }10\mbox{)}$ and the carry is found by dividing (\textit{and ignoring the remainder}) $12$ by the radix or in this case $10$. The -division and multiplication of $10$ is simply a logical right or left shift, respectively, of the digits. In otherwords the carry can be extracted -by shifting one digit to the right. - -Note that $lg()$ is simply the base two logarithm such that $lg(2^k) = k$. This implies that $lg(\beta)$ is the number of bits in a radix-$\beta$ -digit. Therefore, a logical shift right of the summand by $lg(\beta)$ will extract the carry. The final step of the loop reduces the digit -modulo the radix $\beta$ to ensure it is in range. - -After step 8 the smallest input (\textit{or both if they are the same magnitude}) has been exhausted. Step 9 decides whether -the inputs were of equal magnitude. If not than another loop similar to that in step 8, must be executed. The loop at step -number 9.1 differs from the previous loop since it only adds the mp\_int $x$ along with the carry. - -Step 10 finishes the addition phase by copying the final carry to the highest location in the result $c_{max}$. Step 11 ensures that -leading digits that were originally present in $c$ are cleared. Finally excess leading digits are clamped and the algorithm returns success. - -EXAM,bn_s_mp_add.c - -Lines @27,if@ to @35,}@ perform the initial sorting of the inputs and determine the $min$ and $max$ variables. Note that $x$ is a pointer to a -mp\_int assigned to the largest input, in effect it is a local alias. Lines @37,init@ to @42,}@ ensure that the destination is grown to -accomodate the result of the addition. - -Similar to the implementation of mp\_copy this function uses the braced code and local aliases coding style. The three aliases that are on -lines @56,tmpa@, @59,tmpb@ and @62,tmpc@ represent the two inputs and destination variables respectively. These aliases are used to ensure the -compiler does not have to dereference $a$, $b$ or $c$ (respectively) to access the digits of the respective mp\_int. - -The initial carry $u$ is cleared on line @65,u = 0@, note that $u$ is of type mp\_digit which ensures type compatibility within the -implementation. The initial addition loop begins on line @66,for@ and ends on line @75,}@. Similarly the conditional addition loop -begins on line @81,for@ and ends on line @90,}@. The addition is finished with the final carry being stored in $tmpc$ on line @94,tmpc++@. -Note the ``++'' operator on the same line. After line @94,tmpc++@ $tmpc$ will point to the $c.used$'th digit of the mp\_int $c$. This is useful -for the next loop on lines @97,for@ to @99,}@ which set any old upper digits to zero. - -\subsection{Low Level Subtraction} -The low level unsigned subtraction algorithm is very similar to the low level unsigned addition algorithm. The principle difference is that the -unsigned subtraction algorithm requires the result to be positive. That is when computing $a - b$ the condition $\vert a \vert \ge \vert b\vert$ must -be met for this algorithm to function properly. Keep in mind this low level algorithm is not meant to be used in higher level algorithms directly. -This algorithm as will be shown can be used to create functional signed addition and subtraction algorithms. - -MARK,GAMMA - -For this algorithm a new variable is required to make the description simpler. Recall from section 1.3.1 that a mp\_digit must be able to represent -the range $0 \le x < 2\beta$ for the algorithms to work correctly. However, it is allowable that a mp\_digit represent a larger range of values. For -this algorithm we will assume that the variable $\gamma$ represents the number of bits available in a -mp\_digit (\textit{this implies $2^{\gamma} > \beta$}). - -For example, the default for LibTomMath is to use a ``unsigned long'' for the mp\_digit ``type'' while $\beta = 2^{28}$. In ISO C an ``unsigned long'' -data type must be able to represent $0 \le x < 2^{32}$ meaning that in this case $\gamma = 32$. - -\newpage\begin{figure}[!here] -\begin{center} -\begin{small} -\begin{tabular}{l} -\hline Algorithm \textbf{s\_mp\_sub}. \\ -\textbf{Input}. Two mp\_ints $a$ and $b$ ($\vert a \vert \ge \vert b \vert$) \\ -\textbf{Output}. The unsigned subtraction $c = \vert a \vert - \vert b \vert$. \\ -\hline \\ -1. $min \leftarrow b.used$ \\ -2. $max \leftarrow a.used$ \\ -3. If $c.alloc < max$ then grow $c$ to hold at least $max$ digits. (\textit{mp\_grow}) \\ -4. If the reallocation failed return(\textit{MP\_MEM}). \\ -5. $oldused \leftarrow c.used$ \\ -6. $c.used \leftarrow max$ \\ -7. $u \leftarrow 0$ \\ -8. for $n$ from $0$ to $min - 1$ do \\ -\hspace{3mm}8.1 $c_n \leftarrow a_n - b_n - u$ \\ -\hspace{3mm}8.2 $u \leftarrow c_n >> (\gamma - 1)$ \\ -\hspace{3mm}8.3 $c_n \leftarrow c_n \mbox{ (mod }\beta\mbox{)}$ \\ -9. if $min < max$ then do \\ -\hspace{3mm}9.1 for $n$ from $min$ to $max - 1$ do \\ -\hspace{6mm}9.1.1 $c_n \leftarrow a_n - u$ \\ -\hspace{6mm}9.1.2 $u \leftarrow c_n >> (\gamma - 1)$ \\ -\hspace{6mm}9.1.3 $c_n \leftarrow c_n \mbox{ (mod }\beta\mbox{)}$ \\ -10. if $oldused > max$ then do \\ -\hspace{3mm}10.1 for $n$ from $max$ to $oldused - 1$ do \\ -\hspace{6mm}10.1.1 $c_n \leftarrow 0$ \\ -11. Clamp excess digits of $c$. (\textit{mp\_clamp}). \\ -12. Return(\textit{MP\_OKAY}). \\ -\hline -\end{tabular} -\end{small} -\end{center} -\caption{Algorithm s\_mp\_sub} -\end{figure} - -\textbf{Algorithm s\_mp\_sub.} -This algorithm performs the unsigned subtraction of two mp\_int variables under the restriction that the result must be positive. That is when -passing variables $a$ and $b$ the condition that $\vert a \vert \ge \vert b \vert$ must be met for the algorithm to function correctly. This -algorithm is loosely based on algorithm 14.9 \cite[pp. 595]{HAC} and is similar to algorithm S in \cite[pp. 267]{TAOCPV2} as well. As was the case -of the algorithm s\_mp\_add both other references lack discussion concerning various practical details such as when the inputs differ in magnitude. - -The initial sorting of the inputs is trivial in this algorithm since $a$ is guaranteed to have at least the same magnitude of $b$. Steps 1 and 2 -set the $min$ and $max$ variables. Unlike the addition routine there is guaranteed to be no carry which means that the final result can be at -most $max$ digits in length as opposed to $max + 1$. Similar to the addition algorithm the \textbf{used} count of $c$ is copied locally and -set to the maximal count for the operation. - -The subtraction loop that begins on step 8 is essentially the same as the addition loop of algorithm s\_mp\_add except single precision -subtraction is used instead. Note the use of the $\gamma$ variable to extract the carry (\textit{also known as the borrow}) within the subtraction -loops. Under the assumption that two's complement single precision arithmetic is used this will successfully extract the desired carry. - -For example, consider subtracting $0101_2$ from $0100_2$ where $\gamma = 4$ and $\beta = 2$. The least significant bit will force a carry upwards to -the third bit which will be set to zero after the borrow. After the very first bit has been subtracted $4 - 1 \equiv 0011_2$ will remain, When the -third bit of $0101_2$ is subtracted from the result it will cause another carry. In this case though the carry will be forced to propagate all the -way to the most significant bit. - -Recall that $\beta < 2^{\gamma}$. This means that if a carry does occur just before the $lg(\beta)$'th bit it will propagate all the way to the most -significant bit. Thus, the high order bits of the mp\_digit that are not part of the actual digit will either be all zero, or all one. All that -is needed is a single zero or one bit for the carry. Therefore a single logical shift right by $\gamma - 1$ positions is sufficient to extract the -carry. This method of carry extraction may seem awkward but the reason for it becomes apparent when the implementation is discussed. - -If $b$ has a smaller magnitude than $a$ then step 9 will force the carry and copy operation to propagate through the larger input $a$ into $c$. Step -10 will ensure that any leading digits of $c$ above the $max$'th position are zeroed. - -EXAM,bn_s_mp_sub.c - -Line @24,min@ and @25,max@ perform the initial hardcoded sorting of the inputs. In reality the $min$ and $max$ variables are only aliases and are only -used to make the source code easier to read. Again the pointer alias optimization is used within this algorithm. Lines @42,tmpa@, @43,tmpb@ and @44,tmpc@ initialize the aliases for -$a$, $b$ and $c$ respectively. - -The first subtraction loop occurs on lines @47,u = 0@ through @61,}@. The theory behind the subtraction loop is exactly the same as that for -the addition loop. As remarked earlier there is an implementation reason for using the ``awkward'' method of extracting the carry -(\textit{see line @57, >>@}). The traditional method for extracting the carry would be to shift by $lg(\beta)$ positions and logically AND -the least significant bit. The AND operation is required because all of the bits above the $\lg(\beta)$'th bit will be set to one after a carry -occurs from subtraction. This carry extraction requires two relatively cheap operations to extract the carry. The other method is to simply -shift the most significant bit to the least significant bit thus extracting the carry with a single cheap operation. This optimization only works on -twos compliment machines which is a safe assumption to make. - -If $a$ has a larger magnitude than $b$ an additional loop (\textit{see lines @64,for@ through @73,}@}) is required to propagate the carry through -$a$ and copy the result to $c$. - -\subsection{High Level Addition} -Now that both lower level addition and subtraction algorithms have been established an effective high level signed addition algorithm can be -established. This high level addition algorithm will be what other algorithms and developers will use to perform addition of mp\_int data -types. - -Recall from section 5.2 that an mp\_int represents an integer with an unsigned mantissa (\textit{the array of digits}) and a \textbf{sign} -flag. A high level addition is actually performed as a series of eight separate cases which can be optimized down to three unique cases. - -\begin{figure}[!here] -\begin{center} -\begin{tabular}{l} -\hline Algorithm \textbf{mp\_add}. \\ -\textbf{Input}. Two mp\_ints $a$ and $b$ \\ -\textbf{Output}. The signed addition $c = a + b$. \\ -\hline \\ -1. if $a.sign = b.sign$ then do \\ -\hspace{3mm}1.1 $c.sign \leftarrow a.sign$ \\ -\hspace{3mm}1.2 $c \leftarrow \vert a \vert + \vert b \vert$ (\textit{s\_mp\_add})\\ -2. else do \\ -\hspace{3mm}2.1 if $\vert a \vert < \vert b \vert$ then do (\textit{mp\_cmp\_mag}) \\ -\hspace{6mm}2.1.1 $c.sign \leftarrow b.sign$ \\ -\hspace{6mm}2.1.2 $c \leftarrow \vert b \vert - \vert a \vert$ (\textit{s\_mp\_sub}) \\ -\hspace{3mm}2.2 else do \\ -\hspace{6mm}2.2.1 $c.sign \leftarrow a.sign$ \\ -\hspace{6mm}2.2.2 $c \leftarrow \vert a \vert - \vert b \vert$ \\ -3. If any of the lower level operations failed return(\textit{MP\_MEM}) \\ -4. Return(\textit{MP\_OKAY}). \\ -\hline -\end{tabular} -\end{center} -\caption{Algorithm mp\_add} -\end{figure} - -\textbf{Algorithm mp\_add.} -This algorithm performs the signed addition of two mp\_int variables. There is no reference algorithm to draw upon from either \cite{TAOCPV2} or -\cite{HAC} since they both only provide unsigned operations. The algorithm is fairly straightforward but restricted since subtraction can only -produce positive results. - -\begin{figure}[here] -\begin{small} -\begin{center} -\begin{tabular}{|c|c|c|c|c|} -\hline \textbf{Sign of $a$} & \textbf{Sign of $b$} & \textbf{$\vert a \vert > \vert b \vert $} & \textbf{Unsigned Operation} & \textbf{Result Sign Flag} \\ -\hline $+$ & $+$ & Yes & $c = a + b$ & $a.sign$ \\ -\hline $+$ & $+$ & No & $c = a + b$ & $a.sign$ \\ -\hline $-$ & $-$ & Yes & $c = a + b$ & $a.sign$ \\ -\hline $-$ & $-$ & No & $c = a + b$ & $a.sign$ \\ -\hline &&&&\\ - -\hline $+$ & $-$ & No & $c = b - a$ & $b.sign$ \\ -\hline $-$ & $+$ & No & $c = b - a$ & $b.sign$ \\ - -\hline &&&&\\ - -\hline $+$ & $-$ & Yes & $c = a - b$ & $a.sign$ \\ -\hline $-$ & $+$ & Yes & $c = a - b$ & $a.sign$ \\ - -\hline -\end{tabular} -\end{center} -\end{small} -\caption{Addition Guide Chart} -\label{fig:AddChart} -\end{figure} - -Figure~\ref{fig:AddChart} lists all of the eight possible input combinations and is sorted to show that only three specific cases need to be handled. The -return code of the unsigned operations at step 1.2, 2.1.2 and 2.2.2 are forwarded to step 3 to check for errors. This simplifies the description -of the algorithm considerably and best follows how the implementation actually was achieved. - -Also note how the \textbf{sign} is set before the unsigned addition or subtraction is performed. Recall from the descriptions of algorithms -s\_mp\_add and s\_mp\_sub that the mp\_clamp function is used at the end to trim excess digits. The mp\_clamp algorithm will set the \textbf{sign} -to \textbf{MP\_ZPOS} when the \textbf{used} digit count reaches zero. - -For example, consider performing $-a + a$ with algorithm mp\_add. By the description of the algorithm the sign is set to \textbf{MP\_NEG} which would -produce a result of $-0$. However, since the sign is set first then the unsigned addition is performed the subsequent usage of algorithm mp\_clamp -within algorithm s\_mp\_add will force $-0$ to become $0$. - -EXAM,bn_mp_add.c - -The source code follows the algorithm fairly closely. The most notable new source code addition is the usage of the $res$ integer variable which -is used to pass result of the unsigned operations forward. Unlike in the algorithm, the variable $res$ is merely returned as is without -explicitly checking it and returning the constant \textbf{MP\_OKAY}. The observation is this algorithm will succeed or fail only if the lower -level functions do so. Returning their return code is sufficient. - -\subsection{High Level Subtraction} -The high level signed subtraction algorithm is essentially the same as the high level signed addition algorithm. - -\newpage\begin{figure}[!here] -\begin{center} -\begin{tabular}{l} -\hline Algorithm \textbf{mp\_sub}. \\ -\textbf{Input}. Two mp\_ints $a$ and $b$ \\ -\textbf{Output}. The signed subtraction $c = a - b$. \\ -\hline \\ -1. if $a.sign \ne b.sign$ then do \\ -\hspace{3mm}1.1 $c.sign \leftarrow a.sign$ \\ -\hspace{3mm}1.2 $c \leftarrow \vert a \vert + \vert b \vert$ (\textit{s\_mp\_add}) \\ -2. else do \\ -\hspace{3mm}2.1 if $\vert a \vert \ge \vert b \vert$ then do (\textit{mp\_cmp\_mag}) \\ -\hspace{6mm}2.1.1 $c.sign \leftarrow a.sign$ \\ -\hspace{6mm}2.1.2 $c \leftarrow \vert a \vert - \vert b \vert$ (\textit{s\_mp\_sub}) \\ -\hspace{3mm}2.2 else do \\ -\hspace{6mm}2.2.1 $c.sign \leftarrow \left \lbrace \begin{array}{ll} - MP\_ZPOS & \mbox{if }a.sign = MP\_NEG \\ - MP\_NEG & \mbox{otherwise} \\ - \end{array} \right .$ \\ -\hspace{6mm}2.2.2 $c \leftarrow \vert b \vert - \vert a \vert$ \\ -3. If any of the lower level operations failed return(\textit{MP\_MEM}). \\ -4. Return(\textit{MP\_OKAY}). \\ -\hline -\end{tabular} -\end{center} -\caption{Algorithm mp\_sub} -\end{figure} - -\textbf{Algorithm mp\_sub.} -This algorithm performs the signed subtraction of two inputs. Similar to algorithm mp\_add there is no reference in either \cite{TAOCPV2} or -\cite{HAC}. Also this algorithm is restricted by algorithm s\_mp\_sub. The following chart lists the eight possible inputs and -the operations required. - -\begin{figure}[!here] -\begin{small} -\begin{center} -\begin{tabular}{|c|c|c|c|c|} -\hline \textbf{Sign of $a$} & \textbf{Sign of $b$} & \textbf{$\vert a \vert \ge \vert b \vert $} & \textbf{Unsigned Operation} & \textbf{Result Sign Flag} \\ -\hline $+$ & $-$ & Yes & $c = a + b$ & $a.sign$ \\ -\hline $+$ & $-$ & No & $c = a + b$ & $a.sign$ \\ -\hline $-$ & $+$ & Yes & $c = a + b$ & $a.sign$ \\ -\hline $-$ & $+$ & No & $c = a + b$ & $a.sign$ \\ -\hline &&&& \\ -\hline $+$ & $+$ & Yes & $c = a - b$ & $a.sign$ \\ -\hline $-$ & $-$ & Yes & $c = a - b$ & $a.sign$ \\ -\hline &&&& \\ -\hline $+$ & $+$ & No & $c = b - a$ & $\mbox{opposite of }a.sign$ \\ -\hline $-$ & $-$ & No & $c = b - a$ & $\mbox{opposite of }a.sign$ \\ -\hline -\end{tabular} -\end{center} -\end{small} -\caption{Subtraction Guide Chart} -\end{figure} - -Similar to the case of algorithm mp\_add the \textbf{sign} is set first before the unsigned addition or subtraction. That is to prevent the -algorithm from producing $-a - -a = -0$ as a result. - -EXAM,bn_mp_sub.c - -Much like the implementation of algorithm mp\_add the variable $res$ is used to catch the return code of the unsigned addition or subtraction operations -and forward it to the end of the function. On line @38, != MP_LT@ the ``not equal to'' \textbf{MP\_LT} expression is used to emulate a -``greater than or equal to'' comparison. - -\section{Bit and Digit Shifting} -MARK,POLY -It is quite common to think of a multiple precision integer as a polynomial in $x$, that is $y = f(\beta)$ where $f(x) = \sum_{i=0}^{n-1} a_i x^i$. -This notation arises within discussion of Montgomery and Diminished Radix Reduction as well as Karatsuba multiplication and squaring. - -In order to facilitate operations on polynomials in $x$ as above a series of simple ``digit'' algorithms have to be established. That is to shift -the digits left or right as well to shift individual bits of the digits left and right. It is important to note that not all ``shift'' operations -are on radix-$\beta$ digits. - -\subsection{Multiplication by Two} - -In a binary system where the radix is a power of two multiplication by two not only arises often in other algorithms it is a fairly efficient -operation to perform. A single precision logical shift left is sufficient to multiply a single digit by two. - -\newpage\begin{figure}[!here] -\begin{small} -\begin{center} -\begin{tabular}{l} -\hline Algorithm \textbf{mp\_mul\_2}. \\ -\textbf{Input}. One mp\_int $a$ \\ -\textbf{Output}. $b = 2a$. \\ -\hline \\ -1. If $b.alloc < a.used + 1$ then grow $b$ to hold $a.used + 1$ digits. (\textit{mp\_grow}) \\ -2. If the reallocation failed return(\textit{MP\_MEM}). \\ -3. $oldused \leftarrow b.used$ \\ -4. $b.used \leftarrow a.used$ \\ -5. $r \leftarrow 0$ \\ -6. for $n$ from 0 to $a.used - 1$ do \\ -\hspace{3mm}6.1 $rr \leftarrow a_n >> (lg(\beta) - 1)$ \\ -\hspace{3mm}6.2 $b_n \leftarrow (a_n << 1) + r \mbox{ (mod }\beta\mbox{)}$ \\ -\hspace{3mm}6.3 $r \leftarrow rr$ \\ -7. If $r \ne 0$ then do \\ -\hspace{3mm}7.1 $b_{n + 1} \leftarrow r$ \\ -\hspace{3mm}7.2 $b.used \leftarrow b.used + 1$ \\ -8. If $b.used < oldused - 1$ then do \\ -\hspace{3mm}8.1 for $n$ from $b.used$ to $oldused - 1$ do \\ -\hspace{6mm}8.1.1 $b_n \leftarrow 0$ \\ -9. $b.sign \leftarrow a.sign$ \\ -10. Return(\textit{MP\_OKAY}).\\ -\hline -\end{tabular} -\end{center} -\end{small} -\caption{Algorithm mp\_mul\_2} -\end{figure} - -\textbf{Algorithm mp\_mul\_2.} -This algorithm will quickly multiply a mp\_int by two provided $\beta$ is a power of two. Neither \cite{TAOCPV2} nor \cite{HAC} describe such -an algorithm despite the fact it arises often in other algorithms. The algorithm is setup much like the lower level algorithm s\_mp\_add since -it is for all intents and purposes equivalent to the operation $b = \vert a \vert + \vert a \vert$. - -Step 1 and 2 grow the input as required to accomodate the maximum number of \textbf{used} digits in the result. The initial \textbf{used} count -is set to $a.used$ at step 4. Only if there is a final carry will the \textbf{used} count require adjustment. - -Step 6 is an optimization implementation of the addition loop for this specific case. That is since the two values being added together -are the same there is no need to perform two reads from the digits of $a$. Step 6.1 performs a single precision shift on the current digit $a_n$ to -obtain what will be the carry for the next iteration. Step 6.2 calculates the $n$'th digit of the result as single precision shift of $a_n$ plus -the previous carry. Recall from ~SHIFTS~ that $a_n << 1$ is equivalent to $a_n \cdot 2$. An iteration of the addition loop is finished with -forwarding the carry to the next iteration. - -Step 7 takes care of any final carry by setting the $a.used$'th digit of the result to the carry and augmenting the \textbf{used} count of $b$. -Step 8 clears any leading digits of $b$ in case it originally had a larger magnitude than $a$. - -EXAM,bn_mp_mul_2.c - -This implementation is essentially an optimized implementation of s\_mp\_add for the case of doubling an input. The only noteworthy difference -is the use of the logical shift operator on line @52,<<@ to perform a single precision doubling. - -\subsection{Division by Two} -A division by two can just as easily be accomplished with a logical shift right as multiplication by two can be with a logical shift left. - -\newpage\begin{figure}[!here] -\begin{small} -\begin{center} -\begin{tabular}{l} -\hline Algorithm \textbf{mp\_div\_2}. \\ -\textbf{Input}. One mp\_int $a$ \\ -\textbf{Output}. $b = a/2$. \\ -\hline \\ -1. If $b.alloc < a.used$ then grow $b$ to hold $a.used$ digits. (\textit{mp\_grow}) \\ -2. If the reallocation failed return(\textit{MP\_MEM}). \\ -3. $oldused \leftarrow b.used$ \\ -4. $b.used \leftarrow a.used$ \\ -5. $r \leftarrow 0$ \\ -6. for $n$ from $b.used - 1$ to $0$ do \\ -\hspace{3mm}6.1 $rr \leftarrow a_n \mbox{ (mod }2\mbox{)}$\\ -\hspace{3mm}6.2 $b_n \leftarrow (a_n >> 1) + (r << (lg(\beta) - 1)) \mbox{ (mod }\beta\mbox{)}$ \\ -\hspace{3mm}6.3 $r \leftarrow rr$ \\ -7. If $b.used < oldused - 1$ then do \\ -\hspace{3mm}7.1 for $n$ from $b.used$ to $oldused - 1$ do \\ -\hspace{6mm}7.1.1 $b_n \leftarrow 0$ \\ -8. $b.sign \leftarrow a.sign$ \\ -9. Clamp excess digits of $b$. (\textit{mp\_clamp}) \\ -10. Return(\textit{MP\_OKAY}).\\ -\hline -\end{tabular} -\end{center} -\end{small} -\caption{Algorithm mp\_div\_2} -\end{figure} - -\textbf{Algorithm mp\_div\_2.} -This algorithm will divide an mp\_int by two using logical shifts to the right. Like mp\_mul\_2 it uses a modified low level addition -core as the basis of the algorithm. Unlike mp\_mul\_2 the shift operations work from the leading digit to the trailing digit. The algorithm -could be written to work from the trailing digit to the leading digit however, it would have to stop one short of $a.used - 1$ digits to prevent -reading past the end of the array of digits. - -Essentially the loop at step 6 is similar to that of mp\_mul\_2 except the logical shifts go in the opposite direction and the carry is at the -least significant bit not the most significant bit. - -EXAM,bn_mp_div_2.c - -\section{Polynomial Basis Operations} -Recall from ~POLY~ that any integer can be represented as a polynomial in $x$ as $y = f(\beta)$. Such a representation is also known as -the polynomial basis \cite[pp. 48]{ROSE}. Given such a notation a multiplication or division by $x$ amounts to shifting whole digits a single -place. The need for such operations arises in several other higher level algorithms such as Barrett and Montgomery reduction, integer -division and Karatsuba multiplication. - -Converting from an array of digits to polynomial basis is very simple. Consider the integer $y \equiv (a_2, a_1, a_0)_{\beta}$ and recall that -$y = \sum_{i=0}^{2} a_i \beta^i$. Simply replace $\beta$ with $x$ and the expression is in polynomial basis. For example, $f(x) = 8x + 9$ is the -polynomial basis representation for $89$ using radix ten. That is, $f(10) = 8(10) + 9 = 89$. - -\subsection{Multiplication by $x$} - -Given a polynomial in $x$ such as $f(x) = a_n x^n + a_{n-1} x^{n-1} + ... + a_0$ multiplying by $x$ amounts to shifting the coefficients up one -degree. In this case $f(x) \cdot x = a_n x^{n+1} + a_{n-1} x^n + ... + a_0 x$. From a scalar basis point of view multiplying by $x$ is equivalent to -multiplying by the integer $\beta$. - -\newpage\begin{figure}[!here] -\begin{small} -\begin{center} -\begin{tabular}{l} -\hline Algorithm \textbf{mp\_lshd}. \\ -\textbf{Input}. One mp\_int $a$ and an integer $b$ \\ -\textbf{Output}. $a \leftarrow a \cdot \beta^b$ (equivalent to multiplication by $x^b$). \\ -\hline \\ -1. If $b \le 0$ then return(\textit{MP\_OKAY}). \\ -2. If $a.alloc < a.used + b$ then grow $a$ to at least $a.used + b$ digits. (\textit{mp\_grow}). \\ -3. If the reallocation failed return(\textit{MP\_MEM}). \\ -4. $a.used \leftarrow a.used + b$ \\ -5. $i \leftarrow a.used - 1$ \\ -6. $j \leftarrow a.used - 1 - b$ \\ -7. for $n$ from $a.used - 1$ to $b$ do \\ -\hspace{3mm}7.1 $a_{i} \leftarrow a_{j}$ \\ -\hspace{3mm}7.2 $i \leftarrow i - 1$ \\ -\hspace{3mm}7.3 $j \leftarrow j - 1$ \\ -8. for $n$ from 0 to $b - 1$ do \\ -\hspace{3mm}8.1 $a_n \leftarrow 0$ \\ -9. Return(\textit{MP\_OKAY}). \\ -\hline -\end{tabular} -\end{center} -\end{small} -\caption{Algorithm mp\_lshd} -\end{figure} - -\textbf{Algorithm mp\_lshd.} -This algorithm multiplies an mp\_int by the $b$'th power of $x$. This is equivalent to multiplying by $\beta^b$. The algorithm differs -from the other algorithms presented so far as it performs the operation in place instead storing the result in a separate location. The -motivation behind this change is due to the way this function is typically used. Algorithms such as mp\_add store the result in an optionally -different third mp\_int because the original inputs are often still required. Algorithm mp\_lshd (\textit{and similarly algorithm mp\_rshd}) is -typically used on values where the original value is no longer required. The algorithm will return success immediately if -$b \le 0$ since the rest of algorithm is only valid when $b > 0$. - -First the destination $a$ is grown as required to accomodate the result. The counters $i$ and $j$ are used to form a \textit{sliding window} over -the digits of $a$ of length $b$. The head of the sliding window is at $i$ (\textit{the leading digit}) and the tail at $j$ (\textit{the trailing digit}). -The loop on step 7 copies the digit from the tail to the head. In each iteration the window is moved down one digit. The last loop on -step 8 sets the lower $b$ digits to zero. - -\newpage -FIGU,sliding_window,Sliding Window Movement - -EXAM,bn_mp_lshd.c - -The if statement on line @24,if@ ensures that the $b$ variable is greater than zero. The \textbf{used} count is incremented by $b$ before -the copy loop begins. This elminates the need for an additional variable in the for loop. The variable $top$ on line @42,top@ is an alias -for the leading digit while $bottom$ on line @45,bottom@ is an alias for the trailing edge. The aliases form a window of exactly $b$ digits -over the input. - -\subsection{Division by $x$} - -Division by powers of $x$ is easily achieved by shifting the digits right and removing any that will end up to the right of the zero'th digit. - -\newpage\begin{figure}[!here] -\begin{small} -\begin{center} -\begin{tabular}{l} -\hline Algorithm \textbf{mp\_rshd}. \\ -\textbf{Input}. One mp\_int $a$ and an integer $b$ \\ -\textbf{Output}. $a \leftarrow a / \beta^b$ (Divide by $x^b$). \\ -\hline \\ -1. If $b \le 0$ then return. \\ -2. If $a.used \le b$ then do \\ -\hspace{3mm}2.1 Zero $a$. (\textit{mp\_zero}). \\ -\hspace{3mm}2.2 Return. \\ -3. $i \leftarrow 0$ \\ -4. $j \leftarrow b$ \\ -5. for $n$ from 0 to $a.used - b - 1$ do \\ -\hspace{3mm}5.1 $a_i \leftarrow a_j$ \\ -\hspace{3mm}5.2 $i \leftarrow i + 1$ \\ -\hspace{3mm}5.3 $j \leftarrow j + 1$ \\ -6. for $n$ from $a.used - b$ to $a.used - 1$ do \\ -\hspace{3mm}6.1 $a_n \leftarrow 0$ \\ -7. $a.used \leftarrow a.used - b$ \\ -8. Return. \\ -\hline -\end{tabular} -\end{center} -\end{small} -\caption{Algorithm mp\_rshd} -\end{figure} - -\textbf{Algorithm mp\_rshd.} -This algorithm divides the input in place by the $b$'th power of $x$. It is analogous to dividing by a $\beta^b$ but much quicker since -it does not require single precision division. This algorithm does not actually return an error code as it cannot fail. - -If the input $b$ is less than one the algorithm quickly returns without performing any work. If the \textbf{used} count is less than or equal -to the shift count $b$ then it will simply zero the input and return. - -After the trivial cases of inputs have been handled the sliding window is setup. Much like the case of algorithm mp\_lshd a sliding window that -is $b$ digits wide is used to copy the digits. Unlike mp\_lshd the window slides in the opposite direction from the trailing to the leading digit. -Also the digits are copied from the leading to the trailing edge. - -Once the window copy is complete the upper digits must be zeroed and the \textbf{used} count decremented. - -EXAM,bn_mp_rshd.c - -The only noteworthy element of this routine is the lack of a return type. - --- Will update later to give it a return type...Tom - -\section{Powers of Two} - -Now that algorithms for moving single bits as well as whole digits exist algorithms for moving the ``in between'' distances are required. For -example, to quickly multiply by $2^k$ for any $k$ without using a full multiplier algorithm would prove useful. Instead of performing single -shifts $k$ times to achieve a multiplication by $2^{\pm k}$ a mixture of whole digit shifting and partial digit shifting is employed. - -\subsection{Multiplication by Power of Two} - -\newpage\begin{figure}[!here] -\begin{small} -\begin{center} -\begin{tabular}{l} -\hline Algorithm \textbf{mp\_mul\_2d}. \\ -\textbf{Input}. One mp\_int $a$ and an integer $b$ \\ -\textbf{Output}. $c \leftarrow a \cdot 2^b$. \\ -\hline \\ -1. $c \leftarrow a$. (\textit{mp\_copy}) \\ -2. If $c.alloc < c.used + \lfloor b / lg(\beta) \rfloor + 2$ then grow $c$ accordingly. \\ -3. If the reallocation failed return(\textit{MP\_MEM}). \\ -4. If $b \ge lg(\beta)$ then \\ -\hspace{3mm}4.1 $c \leftarrow c \cdot \beta^{\lfloor b / lg(\beta) \rfloor}$ (\textit{mp\_lshd}). \\ -\hspace{3mm}4.2 If step 4.1 failed return(\textit{MP\_MEM}). \\ -5. $d \leftarrow b \mbox{ (mod }lg(\beta)\mbox{)}$ \\ -6. If $d \ne 0$ then do \\ -\hspace{3mm}6.1 $mask \leftarrow 2^d$ \\ -\hspace{3mm}6.2 $r \leftarrow 0$ \\ -\hspace{3mm}6.3 for $n$ from $0$ to $c.used - 1$ do \\ -\hspace{6mm}6.3.1 $rr \leftarrow c_n >> (lg(\beta) - d) \mbox{ (mod }mask\mbox{)}$ \\ -\hspace{6mm}6.3.2 $c_n \leftarrow (c_n << d) + r \mbox{ (mod }\beta\mbox{)}$ \\ -\hspace{6mm}6.3.3 $r \leftarrow rr$ \\ -\hspace{3mm}6.4 If $r > 0$ then do \\ -\hspace{6mm}6.4.1 $c_{c.used} \leftarrow r$ \\ -\hspace{6mm}6.4.2 $c.used \leftarrow c.used + 1$ \\ -7. Return(\textit{MP\_OKAY}). \\ -\hline -\end{tabular} -\end{center} -\end{small} -\caption{Algorithm mp\_mul\_2d} -\end{figure} - -\textbf{Algorithm mp\_mul\_2d.} -This algorithm multiplies $a$ by $2^b$ and stores the result in $c$. The algorithm uses algorithm mp\_lshd and a derivative of algorithm mp\_mul\_2 to -quickly compute the product. - -First the algorithm will multiply $a$ by $x^{\lfloor b / lg(\beta) \rfloor}$ which will ensure that the remainder multiplicand is less than -$\beta$. For example, if $b = 37$ and $\beta = 2^{28}$ then this step will multiply by $x$ leaving a multiplication by $2^{37 - 28} = 2^{9}$ -left. - -After the digits have been shifted appropriately at most $lg(\beta) - 1$ shifts are left to perform. Step 5 calculates the number of remaining shifts -required. If it is non-zero a modified shift loop is used to calculate the remaining product. -Essentially the loop is a generic version of algorith mp\_mul2 designed to handle any shift count in the range $1 \le x < lg(\beta)$. The $mask$ -variable is used to extract the upper $d$ bits to form the carry for the next iteration. - -This algorithm is loosely measured as a $O(2n)$ algorithm which means that if the input is $n$-digits that it takes $2n$ ``time'' to -complete. It is possible to optimize this algorithm down to a $O(n)$ algorithm at a cost of making the algorithm slightly harder to follow. - -EXAM,bn_mp_mul_2d.c - -Notes to be revised when code is updated. -- Tom - -\subsection{Division by Power of Two} - -\newpage\begin{figure}[!here] -\begin{small} -\begin{center} -\begin{tabular}{l} -\hline Algorithm \textbf{mp\_div\_2d}. \\ -\textbf{Input}. One mp\_int $a$ and an integer $b$ \\ -\textbf{Output}. $c \leftarrow \lfloor a / 2^b \rfloor, d \leftarrow a \mbox{ (mod }2^b\mbox{)}$. \\ -\hline \\ -1. If $b \le 0$ then do \\ -\hspace{3mm}1.1 $c \leftarrow a$ (\textit{mp\_copy}) \\ -\hspace{3mm}1.2 $d \leftarrow 0$ (\textit{mp\_zero}) \\ -\hspace{3mm}1.3 Return(\textit{MP\_OKAY}). \\ -2. $c \leftarrow a$ \\ -3. $d \leftarrow a \mbox{ (mod }2^b\mbox{)}$ (\textit{mp\_mod\_2d}) \\ -4. If $b \ge lg(\beta)$ then do \\ -\hspace{3mm}4.1 $c \leftarrow \lfloor c/\beta^{\lfloor b/lg(\beta) \rfloor} \rfloor$ (\textit{mp\_rshd}). \\ -5. $k \leftarrow b \mbox{ (mod }lg(\beta)\mbox{)}$ \\ -6. If $k \ne 0$ then do \\ -\hspace{3mm}6.1 $mask \leftarrow 2^k$ \\ -\hspace{3mm}6.2 $r \leftarrow 0$ \\ -\hspace{3mm}6.3 for $n$ from $c.used - 1$ to $0$ do \\ -\hspace{6mm}6.3.1 $rr \leftarrow c_n \mbox{ (mod }mask\mbox{)}$ \\ -\hspace{6mm}6.3.2 $c_n \leftarrow (c_n >> k) + (r << (lg(\beta) - k))$ \\ -\hspace{6mm}6.3.3 $r \leftarrow rr$ \\ -7. Clamp excess digits of $c$. (\textit{mp\_clamp}) \\ -8. Return(\textit{MP\_OKAY}). \\ -\hline -\end{tabular} -\end{center} -\end{small} -\caption{Algorithm mp\_div\_2d} -\end{figure} - -\textbf{Algorithm mp\_div\_2d.} -This algorithm will divide an input $a$ by $2^b$ and produce the quotient and remainder. The algorithm is designed much like algorithm -mp\_mul\_2d by first using whole digit shifts then single precision shifts. This algorithm will also produce the remainder of the division -by using algorithm mp\_mod\_2d. - -EXAM,bn_mp_div_2d.c - -The implementation of algorithm mp\_div\_2d is slightly different than the algorithm specifies. The remainder $d$ may be optionally -ignored by passing \textbf{NULL} as the pointer to the mp\_int variable. The temporary mp\_int variable $t$ is used to hold the -result of the remainder operation until the end. This allows $d$ and $a$ to represent the same mp\_int without modifying $a$ before -the quotient is obtained. - -The remainder of the source code is essentially the same as the source code for mp\_mul\_2d. (-- Fix this paragraph up later, Tom). - -\subsection{Remainder of Division by Power of Two} - -The last algorithm in the series of polynomial basis power of two algorithms is calculating the remainder of division by $2^b$. This -algorithm benefits from the fact that in twos complement arithmetic $a \mbox{ (mod }2^b\mbox{)}$ is the same as $a$ AND $2^b - 1$. - -\begin{figure}[!here] -\begin{small} -\begin{center} -\begin{tabular}{l} -\hline Algorithm \textbf{mp\_mod\_2d}. \\ -\textbf{Input}. One mp\_int $a$ and an integer $b$ \\ -\textbf{Output}. $c \leftarrow a \mbox{ (mod }2^b\mbox{)}$. \\ -\hline \\ -1. If $b \le 0$ then do \\ -\hspace{3mm}1.1 $c \leftarrow 0$ (\textit{mp\_zero}) \\ -\hspace{3mm}1.2 Return(\textit{MP\_OKAY}). \\ -2. If $b > a.used \cdot lg(\beta)$ then do \\ -\hspace{3mm}2.1 $c \leftarrow a$ (\textit{mp\_copy}) \\ -\hspace{3mm}2.2 Return the result of step 2.1. \\ -3. $c \leftarrow a$ \\ -4. If step 3 failed return(\textit{MP\_MEM}). \\ -5. for $n$ from $\lceil b / lg(\beta) \rceil$ to $c.used$ do \\ -\hspace{3mm}5.1 $c_n \leftarrow 0$ \\ -6. $k \leftarrow b \mbox{ (mod }lg(\beta)\mbox{)}$ \\ -7. $c_{\lfloor b / lg(\beta) \rfloor} \leftarrow c_{\lfloor b / lg(\beta) \rfloor} \mbox{ (mod }2^{k}\mbox{)}$. \\ -8. Clamp excess digits of $c$. (\textit{mp\_clamp}) \\ -9. Return(\textit{MP\_OKAY}). \\ -\hline -\end{tabular} -\end{center} -\end{small} -\caption{Algorithm mp\_mod\_2d} -\end{figure} - -\textbf{Algorithm mp\_mod\_2d.} -This algorithm will quickly calculate the value of $a \mbox{ (mod }2^b\mbox{)}$. First if $b$ is less than or equal to zero the -result is set to zero. If $b$ is greater than the number of bits in $a$ then it simply copies $a$ to $c$ and returns. Otherwise, $a$ -is copied to $b$, leading digits are removed and the remaining leading digit is trimed to the exact bit count. - -EXAM,bn_mp_mod_2d.c - --- Add comments later, Tom. - -\section*{Exercises} -\begin{tabular}{cl} -$\left [ 3 \right ] $ & Devise an algorithm that performs $a \cdot 2^b$ for generic values of $b$ \\ - & in $O(n)$ time. \\ - &\\ -$\left [ 3 \right ] $ & Devise an efficient algorithm to multiply by small low hamming \\ - & weight values such as $3$, $5$ and $9$. Extend it to handle all values \\ - & upto $64$ with a hamming weight less than three. \\ - &\\ -$\left [ 2 \right ] $ & Modify the preceding algorithm to handle values of the form \\ - & $2^k - 1$ as well. \\ - &\\ -$\left [ 3 \right ] $ & Using only algorithms mp\_mul\_2, mp\_div\_2 and mp\_add create an \\ - & algorithm to multiply two integers in roughly $O(2n^2)$ time for \\ - & any $n$-bit input. Note that the time of addition is ignored in the \\ - & calculation. \\ - & \\ -$\left [ 5 \right ] $ & Improve the previous algorithm to have a working time of at most \\ - & $O \left (2^{(k-1)}n + \left ({2n^2 \over k} \right ) \right )$ for an appropriate choice of $k$. Again ignore \\ - & the cost of addition. \\ - & \\ -$\left [ 2 \right ] $ & Devise a chart to find optimal values of $k$ for the previous problem \\ - & for $n = 64 \ldots 1024$ in steps of $64$. \\ - & \\ -$\left [ 2 \right ] $ & Using only algorithms mp\_abs and mp\_sub devise another method for \\ - & calculating the result of a signed comparison. \\ - & -\end{tabular} - -\chapter{Multiplication and Squaring} -\section{The Multipliers} -For most number theoretic problems including certain public key cryptographic algorithms, the ``multipliers'' form the most important subset of -algorithms of any multiple precision integer package. The set of multiplier algorithms include integer multiplication, squaring and modular reduction -where in each of the algorithms single precision multiplication is the dominant operation performed. This chapter will discuss integer multiplication -and squaring, leaving modular reductions for the subsequent chapter. - -The importance of the multiplier algorithms is for the most part driven by the fact that certain popular public key algorithms are based on modular -exponentiation, that is computing $d \equiv a^b \mbox{ (mod }c\mbox{)}$ for some arbitrary choice of $a$, $b$, $c$ and $d$. During a modular -exponentiation the majority\footnote{Roughly speaking a modular exponentiation will spend about 40\% of the time performing modular reductions, -35\% of the time performing squaring and 25\% of the time performing multiplications.} of the processor time is spent performing single precision -multiplications. - -For centuries general purpose multiplication has required a lengthly $O(n^2)$ process, whereby each digit of one multiplicand has to be multiplied -against every digit of the other multiplicand. Traditional long-hand multiplication is based on this process; while the techniques can differ the -overall algorithm used is essentially the same. Only ``recently'' have faster algorithms been studied. First Karatsuba multiplication was discovered in -1962. This algorithm can multiply two numbers with considerably fewer single precision multiplications when compared to the long-hand approach. -This technique led to the discovery of polynomial basis algorithms (\textit{good reference?}) and subquently Fourier Transform based solutions. - -\section{Multiplication} -\subsection{The Baseline Multiplication} -\index{baseline multiplication} -Computing the product of two integers in software can be achieved using a trivial adaptation of the standard $O(n^2)$ long-hand multiplication -algorithm that school children are taught. The algorithm is considered an $O(n^2)$ algoritn since for two $n$-digit inputs $n^2$ single precision -multiplications are required. More specifically for a $m$ and $n$ digit input $m \cdot n$ single precision multiplications are required. To -simplify most discussions, it will be assumed that the inputs have comparable number of digits. - -The ``baseline multiplication'' algorithm is designed to act as the ``catch-all'' algorithm, only to be used when the faster algorithms cannot be -used. This algorithm does not use any particularly interesting optimizations and should ideally be avoided if possible. One important -facet of this algorithm, is that it has been modified to only produce a certain amount of output digits as resolution. The importance of this -modification will become evident during the discussion of Barrett modular reduction. Recall that for a $n$ and $m$ digit input the product -will be at most $n + m$ digits. Therefore, this algorithm can be reduced to a full multiplier by having it produce $n + m$ digits of the product. - -Recall from ~GAMMA~ the definition of $\gamma$ as the number of bits in the type \textbf{mp\_digit}. We shall now extend the variable set to -include $\alpha$ which shall represent the number of bits in the type \textbf{mp\_word}. This implies that $2^{\alpha} > 2 \cdot \beta^2$. The -constant $\delta = 2^{\alpha - 2lg(\beta)}$ will represent the maximal weight of any column in a product (\textit{see ~COMBA~ for more information}). - -\newpage\begin{figure}[!here] -\begin{small} -\begin{center} -\begin{tabular}{l} -\hline Algorithm \textbf{s\_mp\_mul\_digs}. \\ -\textbf{Input}. mp\_int $a$, mp\_int $b$ and an integer $digs$ \\ -\textbf{Output}. $c \leftarrow \vert a \vert \cdot \vert b \vert \mbox{ (mod }\beta^{digs}\mbox{)}$. \\ -\hline \\ -1. If min$(a.used, b.used) < \delta$ then do \\ -\hspace{3mm}1.1 Calculate $c = \vert a \vert \cdot \vert b \vert$ by the Comba method (\textit{see algorithm~\ref{fig:COMBAMULT}}). \\ -\hspace{3mm}1.2 Return the result of step 1.1 \\ -\\ -Allocate and initialize a temporary mp\_int. \\ -2. Init $t$ to be of size $digs$ \\ -3. If step 2 failed return(\textit{MP\_MEM}). \\ -4. $t.used \leftarrow digs$ \\ -\\ -Compute the product. \\ -5. for $ix$ from $0$ to $a.used - 1$ do \\ -\hspace{3mm}5.1 $u \leftarrow 0$ \\ -\hspace{3mm}5.2 $pb \leftarrow \mbox{min}(b.used, digs - ix)$ \\ -\hspace{3mm}5.3 If $pb < 1$ then goto step 6. \\ -\hspace{3mm}5.4 for $iy$ from $0$ to $pb - 1$ do \\ -\hspace{6mm}5.4.1 $\hat r \leftarrow t_{iy + ix} + a_{ix} \cdot b_{iy} + u$ \\ -\hspace{6mm}5.4.2 $t_{iy + ix} \leftarrow \hat r \mbox{ (mod }\beta\mbox{)}$ \\ -\hspace{6mm}5.4.3 $u \leftarrow \lfloor \hat r / \beta \rfloor$ \\ -\hspace{3mm}5.5 if $ix + pb < digs$ then do \\ -\hspace{6mm}5.5.1 $t_{ix + pb} \leftarrow u$ \\ -6. Clamp excess digits of $t$. \\ -7. Swap $c$ with $t$ \\ -8. Clear $t$ \\ -9. Return(\textit{MP\_OKAY}). \\ -\hline -\end{tabular} -\end{center} -\end{small} -\caption{Algorithm s\_mp\_mul\_digs} -\end{figure} - -\textbf{Algorithm s\_mp\_mul\_digs.} -This algorithm computes the unsigned product of two inputs $a$ and $b$, limited to an output precision of $digs$ digits. While it may seem -a bit awkward to modify the function from its simple $O(n^2)$ description, the usefulness of partial multipliers will arise in a subsequent -algorithm. The algorithm is loosely based on algorithm 14.12 from \cite[pp. 595]{HAC} and is similar to Algorithm M of Knuth \cite[pp. 268]{TAOCPV2}. -Algorithm s\_mp\_mul\_digs differs from these cited references since it can produce a variable output precision regardless of the precision of the -inputs. - -The first thing this algorithm checks for is whether a Comba multiplier can be used instead. If the minimum digit count of either -input is less than $\delta$, then the Comba method may be used instead. After the Comba method is ruled out, the baseline algorithm begins. A -temporary mp\_int variable $t$ is used to hold the intermediate result of the product. This allows the algorithm to be used to -compute products when either $a = c$ or $b = c$ without overwriting the inputs. - -All of step 5 is the infamous $O(n^2)$ multiplication loop slightly modified to only produce upto $digs$ digits of output. The $pb$ variable -is given the count of digits to read from $b$ inside the nested loop. If $pb \le 1$ then no more output digits can be produced and the algorithm -will exit the loop. The best way to think of the loops are as a series of $pb \times 1$ multiplications. That is, in each pass of the -innermost loop $a_{ix}$ is multiplied against $b$ and the result is added (\textit{with an appropriate shift}) to $t$. - -For example, consider multiplying $576$ by $241$. That is equivalent to computing $10^0(1)(576) + 10^1(4)(576) + 10^2(2)(576)$ which is best -visualized in the following table. - -\begin{figure}[here] -\begin{center} -\begin{tabular}{|c|c|c|c|c|c|l|} -\hline && & 5 & 7 & 6 & \\ -\hline $\times$&& & 2 & 4 & 1 & \\ -\hline &&&&&&\\ - && & 5 & 7 & 6 & $10^0(1)(576)$ \\ - &2 & 3 & 6 & 1 & 6 & $10^1(4)(576) + 10^0(1)(576)$ \\ - 1 & 3 & 8 & 8 & 1 & 6 & $10^2(2)(576) + 10^1(4)(576) + 10^0(1)(576)$ \\ -\hline -\end{tabular} -\end{center} -\caption{Long-Hand Multiplication Diagram} -\end{figure} - -Each row of the product is added to the result after being shifted to the left (\textit{multiplied by a power of the radix}) by the appropriate -count. That is in pass $ix$ of the inner loop the product is added starting at the $ix$'th digit of the reult. - -Step 5.4.1 introduces the hat symbol (\textit{e.g. $\hat r$}) which represents a double precision variable. The multiplication on that step -is assumed to be a double wide output single precision multiplication. That is, two single precision variables are multiplied to produce a -double precision result. The step is somewhat optimized from a long-hand multiplication algorithm because the carry from the addition in step -5.4.1 is propagated through the nested loop. If the carry was not propagated immediately it would overflow the single precision digit -$t_{ix+iy}$ and the result would be lost. - -At step 5.5 the nested loop is finished and any carry that was left over should be forwarded. The carry does not have to be added to the $ix+pb$'th -digit since that digit is assumed to be zero at this point. However, if $ix + pb \ge digs$ the carry is not set as it would make the result -exceed the precision requested. - -EXAM,bn_s_mp_mul_digs.c - -Lines @31,if@ to @35,}@ determine if the Comba method can be used first. The conditions for using the Comba routine are that min$(a.used, b.used) < \delta$ and -the number of digits of output is less than \textbf{MP\_WARRAY}. This new constant is used to control -the stack usage in the Comba routines. By default it is set to $\delta$ but can be reduced when memory is at a premium. - -Of particular importance is the calculation of the $ix+iy$'th column on lines @64,mp_word@, @65,mp_word@ and @66,mp_word@. Note how all of the -variables are cast to the type \textbf{mp\_word}, which is also the type of variable $\hat r$. That is to ensure that double precision operations -are used instead of single precision. The multiplication on line @65,) * (@ makes use of a specific GCC optimizer behaviour. On the outset it looks like -the compiler will have to use a double precision multiplication to produce the result required. Such an operation would be horribly slow on most -processors and drag this to a crawl. However, GCC is smart enough to realize that double wide output single precision multipliers can be used. For -example, the instruction ``MUL'' on the x86 processor can multiply two 32-bit values and produce a 64-bit result. - -\subsection{Faster Multiplication by the ``Comba'' Method} -MARK,COMBA - -One of the huge drawbacks of the ``baseline'' algorithms is that at the $O(n^2)$ level the carry must be computed and propagated upwards. This -makes the nested loop very sequential and hard to unroll and implement in parallel. The ``Comba'' \cite{COMBA} method is named after little known -(\textit{in cryptographic venues}) Paul G. Comba who described a method of implementing fast multipliers that do not require nested -carry fixup operations. As an interesting aside it seems that Paul Barrett describes a similar technique in -his 1986 paper \cite{BARRETT} written five years before. - -At the heart of the Comba technique is once again the long-hand algorithm. Except in this case a slight twist is placed on how -the columns of the result are produced. In the standard long-hand algorithm rows of products are produced then added together to form the -final result. In the baseline algorithm the columns are added together after each iteration to get the result instantaneously. - -In the Comba algorithm the columns of the result are produced entirely independently of each other. That is at the $O(n^2)$ level a -simple multiplication and addition step is performed. The carries of the columns are propagated after the nested loop to reduce the amount -of work requiored. Succintly the first step of the algorithm is to compute the product vector $\vec x$ as follows. - -\begin{equation} -\vec x_n = \sum_{i+j = n} a_ib_j, \forall n \in \lbrace 0, 1, 2, \ldots, i + j \rbrace -\end{equation} - -Where $\vec x_n$ is the $n'th$ column of the output vector. Consider the following example which computes the vector $\vec x$ for the multiplication -of $576$ and $241$. - -\newpage\begin{figure}[here] -\begin{small} -\begin{center} -\begin{tabular}{|c|c|c|c|c|c|} - \hline & & 5 & 7 & 6 & First Input\\ - \hline $\times$ & & 2 & 4 & 1 & Second Input\\ -\hline & & $1 \cdot 5 = 5$ & $1 \cdot 7 = 7$ & $1 \cdot 6 = 6$ & First pass \\ - & $4 \cdot 5 = 20$ & $4 \cdot 7+5=33$ & $4 \cdot 6+7=31$ & 6 & Second pass \\ - $2 \cdot 5 = 10$ & $2 \cdot 7 + 20 = 34$ & $2 \cdot 6+33=45$ & 31 & 6 & Third pass \\ -\hline 10 & 34 & 45 & 31 & 6 & Final Result \\ -\hline -\end{tabular} -\end{center} -\end{small} -\caption{Comba Multiplication Diagram} -\end{figure} - -At this point the vector $x = \left < 10, 34, 45, 31, 6 \right >$ is the result of the first step of the Comba multipler. -Now the columns must be fixed by propagating the carry upwards. The resultant vector will have one extra dimension over the input vector which is -congruent to adding a leading zero digit. - -\begin{figure}[!here] -\begin{small} -\begin{center} -\begin{tabular}{l} -\hline Algorithm \textbf{Comba Fixup}. \\ -\textbf{Input}. Vector $\vec x$ of dimension $k$ \\ -\textbf{Output}. Vector $\vec x$ such that the carries have been propagated. \\ -\hline \\ -1. for $n$ from $0$ to $k - 1$ do \\ -\hspace{3mm}1.1 $\vec x_{n+1} \leftarrow \vec x_{n+1} + \lfloor \vec x_{n}/\beta \rfloor$ \\ -\hspace{3mm}1.2 $\vec x_{n} \leftarrow \vec x_{n} \mbox{ (mod }\beta\mbox{)}$ \\ -2. Return($\vec x$). \\ -\hline -\end{tabular} -\end{center} -\end{small} -\caption{Algorithm Comba Fixup} -\end{figure} - -With that algorithm and $k = 5$ and $\beta = 10$ the following vector is produced $\vec x= \left < 1, 3, 8, 8, 1, 6 \right >$. In this case -$241 \cdot 576$ is in fact $138816$ and the procedure succeeded. If the algorithm is correct and as will be demonstrated shortly more -efficient than the baseline algorithm why not simply always use this algorithm? - -\subsubsection{Column Weight.} -At the nested $O(n^2)$ level the Comba method adds the product of two single precision variables to each column of the output -independently. A serious obstacle is if the carry is lost, due to lack of precision before the algorithm has a chance to fix -the carries. For example, in the multiplication of two three-digit numbers the third column of output will be the sum of -three single precision multiplications. If the precision of the accumulator for the output digits is less then $3 \cdot (\beta - 1)^2$ then -an overflow can occur and the carry information will be lost. For any $m$ and $n$ digit inputs the maximum weight of any column is -min$(m, n)$ which is fairly obvious. - -The maximum number of terms in any column of a product is known as the ``column weight'' and strictly governs when the algorithm can be used. Recall -from earlier that a double precision type has $\alpha$ bits of resolution and a single precision digit has $lg(\beta)$ bits of precision. Given these -two quantities we must not violate the following - -\begin{equation} -k \cdot \left (\beta - 1 \right )^2 < 2^{\alpha} -\end{equation} - -Which reduces to - -\begin{equation} -k \cdot \left ( \beta^2 - 2\beta + 1 \right ) < 2^{\alpha} -\end{equation} - -Let $\rho = lg(\beta)$ represent the number of bits in a single precision digit. By further re-arrangement of the equation the final solution is -found. - -\begin{equation} -k < {{2^{\alpha}} \over {\left (2^{2\rho} - 2^{\rho + 1} + 1 \right )}} -\end{equation} - -The defaults for LibTomMath are $\beta = 2^{28}$ and $\alpha = 2^{64}$ which means that $k$ is bounded by $k < 257$. In this configuration -the smaller input may not have more than $256$ digits if the Comba method is to be used. This is quite satisfactory for most applications since -$256$ digits would allow for numbers in the range of $0 \le x < 2^{7168}$ which, is much larger than most public key cryptographic algorithms require. - -\newpage\begin{figure}[!here] -\begin{small} -\begin{center} -\begin{tabular}{l} -\hline Algorithm \textbf{fast\_s\_mp\_mul\_digs}. \\ -\textbf{Input}. mp\_int $a$, mp\_int $b$ and an integer $digs$ \\ -\textbf{Output}. $c \leftarrow \vert a \vert \cdot \vert b \vert \mbox{ (mod }\beta^{digs}\mbox{)}$. \\ -\hline \\ -Place an array of \textbf{MP\_WARRAY} double precision digits named $\hat W$ on the stack. \\ -1. If $c.alloc < digs$ then grow $c$ to $digs$ digits. (\textit{mp\_grow}) \\ -2. If step 1 failed return(\textit{MP\_MEM}).\\ -\\ -Zero the temporary array $\hat W$. \\ -3. for $n$ from $0$ to $digs - 1$ do \\ -\hspace{3mm}3.1 $\hat W_n \leftarrow 0$ \\ -\\ -Compute the columns. \\ -4. for $ix$ from $0$ to $a.used - 1$ do \\ -\hspace{3mm}4.1 $pb \leftarrow \mbox{min}(b.used, digs - ix)$ \\ -\hspace{3mm}4.2 If $pb < 1$ then goto step 5. \\ -\hspace{3mm}4.3 for $iy$ from $0$ to $pb - 1$ do \\ -\hspace{6mm}4.3.1 $\hat W_{ix+iy} \leftarrow \hat W_{ix+iy} + a_{ix}b_{iy}$ \\ -\\ -Propagate the carries upwards. \\ -5. $oldused \leftarrow c.used$ \\ -6. $c.used \leftarrow digs$ \\ -7. If $digs > 1$ then do \\ -\hspace{3mm}7.1. for $ix$ from $1$ to $digs - 1$ do \\ -\hspace{6mm}7.1.1 $\hat W_{ix} \leftarrow \hat W_{ix} + \lfloor \hat W_{ix-1} / \beta \rfloor$ \\ -\hspace{6mm}7.1.2 $c_{ix - 1} \leftarrow \hat W_{ix - 1} \mbox{ (mod }\beta\mbox{)}$ \\ -8. else do \\ -\hspace{3mm}8.1 $ix \leftarrow 0$ \\ -9. $c_{ix} \leftarrow \hat W_{ix} \mbox{ (mod }\beta\mbox{)}$ \\ -\\ -Zero excess digits. \\ -10. If $digs < oldused$ then do \\ -\hspace{3mm}10.1 for $n$ from $digs$ to $oldused - 1$ do \\ -\hspace{6mm}10.1.1 $c_n \leftarrow 0$ \\ -11. Clamp excessive digits of $c$. (\textit{mp\_clamp}) \\ -12. Return(\textit{MP\_OKAY}). \\ -\hline -\end{tabular} -\end{center} -\end{small} -\caption{Algorithm fast\_s\_mp\_mul\_digs} -\label{fig:COMBAMULT} -\end{figure} - -\textbf{Algorithm fast\_s\_mp\_mul\_digs.} -This algorithm performs the unsigned multiplication of $a$ and $b$ using the Comba method limited to $digs$ digits of precision. The algorithm -essentially peforms the same calculation as algorithm s\_mp\_mul\_digs, just much faster. - -The array $\hat W$ is meant to be on the stack when the algorithm is used. The size of the array does not change which is ideal. Note also that -unlike algorithm s\_mp\_mul\_digs no temporary mp\_int is required since the result is calculated directly in $\hat W$. - -The $O(n^2)$ loop on step four is where the Comba method's advantages begin to show through in comparison to the baseline algorithm. The lack of -a carry variable or propagation in this loop allows the loop to be performed with only single precision multiplication and additions. Now that each -iteration of the inner loop can be performed independent of the others the inner loop can be performed with a high level of parallelism. - -To measure the benefits of the Comba method over the baseline method consider the number of operations that are required. If the -cost in terms of time of a multiply and addition is $p$ and the cost of a carry propagation is $q$ then a baseline multiplication would require -$O \left ((p + q)n^2 \right )$ time to multiply two $n$-digit numbers. The Comba method requires only $O(pn^2 + qn)$ time, however in practice, -the speed increase is actually much more. With $O(n)$ space the algorithm can be reduced to $O(pn + qn)$ time by implementing the $n$ multiply -and addition operations in the nested loop in parallel. - -EXAM,bn_fast_s_mp_mul_digs.c - -The memset on line @47,memset@ clears the initial $\hat W$ array to zero in a single step. Like the slower baseline multiplication -implementation a series of aliases (\textit{lines @67, tmpx@, @70, tmpy@ and @75,_W@}) are used to simplify the inner $O(n^2)$ loop. -In this case a new alias $\_\hat W$ has been added which refers to the double precision columns offset by $ix$ in each pass. - -The inner loop on lines @83,for@, @84,mp_word@ and @85,}@ is where the algorithm will spend the majority of the time, which is why it has been -stripped to the bones of any extra baggage\footnote{Hence the pointer aliases.}. On x86 processors the multiplication and additions amount to at the -very least five instructions (\textit{two loads, two additions, one multiply}) while on the ARMv4 processors they amount to only three -(\textit{one load, one store, one multiply-add}). For both of the x86 and ARMv4 processors the GCC compiler performs a good job at unrolling the loop -and scheduling the instructions so there are very few dependency stalls. - -In theory the difference between the baseline and comba algorithms is a mere $O(qn)$ time difference. However, in the $O(n^2)$ nested loop of the -baseline method there are dependency stalls as the algorithm must wait for the multiplier to finish before propagating the carry to the next -digit. As a result fewer of the often multiple execution units\footnote{The AMD Athlon has three execution units and the Intel P4 has four.} can -be simultaneously used. - -\subsection{Polynomial Basis Multiplication} -To break the $O(n^2)$ barrier in multiplication requires a completely different look at integer multiplication. In the following algorithms -the use of polynomial basis representation for two integers $a$ and $b$ as $f(x) = \sum_{i=0}^{n} a_i x^i$ and -$g(x) = \sum_{i=0}^{n} b_i x^i$ respectively, is required. In this system both $f(x)$ and $g(x)$ have $n + 1$ terms and are of the $n$'th degree. - -The product $a \cdot b \equiv f(x)g(x)$ is the polynomial $W(x) = \sum_{i=0}^{2n} w_i x^i$. The coefficients $w_i$ will -directly yield the desired product when $\beta$ is substituted for $x$. The direct solution to solve for the $2n + 1$ coefficients -requires $O(n^2)$ time and would in practice be slower than the Comba technique. - -However, numerical analysis theory indicates that only $2n + 1$ distinct points in $W(x)$ are required to determine the values of the $2n + 1$ unknown -coefficients. This means by finding $\zeta_y = W(y)$ for $2n + 1$ small values of $y$ the coefficients of $W(x)$ can be found with -Gaussian elimination. This technique is also occasionally refered to as the \textit{interpolation technique} (\textit{references please...}) since in -effect an interpolation based on $2n + 1$ points will yield a polynomial equivalent to $W(x)$. - -The coefficients of the polynomial $W(x)$ are unknown which makes finding $W(y)$ for any value of $y$ impossible. However, since -$W(x) = f(x)g(x)$ the equivalent $\zeta_y = f(y) g(y)$ can be used in its place. The benefit of this technique stems from the -fact that $f(y)$ and $g(y)$ are much smaller than either $a$ or $b$ respectively. As a result finding the $2n + 1$ relations required -by multiplying $f(y)g(y)$ involves multiplying integers that are much smaller than either of the inputs. - -When picking points to gather relations there are always three obvious points to choose, $y = 0, 1$ and $ \infty$. The $\zeta_0$ term -is simply the product $W(0) = w_0 = a_0 \cdot b_0$. The $\zeta_1$ term is the product -$W(1) = \left (\sum_{i = 0}^{n} a_i \right ) \left (\sum_{i = 0}^{n} b_i \right )$. The third point $\zeta_{\infty}$ is less obvious but rather -simple to explain. The $2n + 1$'th coefficient of $W(x)$ is numerically equivalent to the most significant column in an integer multiplication. -The point at $\infty$ is used symbolically to represent the most significant column, that is $W(\infty) = w_{2n} = a_nb_n$. Note that the -points at $y = 0$ and $\infty$ yield the coefficients $w_0$ and $w_{2n}$ directly. - -If more points are required they should be of small values and powers of two such as $2^q$ and the related \textit{mirror points} -$\left (2^q \right )^{2n} \cdot \zeta_{2^{-q}}$ for small values of $q$. The term ``mirror point'' stems from the fact that -$\left (2^q \right )^{2n} \cdot \zeta_{2^{-q}}$ can be calculated in the exact opposite fashion as $\zeta_{2^q}$. For -example, when $n = 2$ and $q = 1$ then following two equations are equivalent to the point $\zeta_{2}$ and its mirror. - -\begin{eqnarray} -\zeta_{2} = f(2)g(2) = (4a_2 + 2a_1 + a_0)(4b_2 + 2b_1 + b_0) \nonumber \\ -16 \cdot \zeta_{1 \over 2} = 4f({1\over 2}) \cdot 4g({1 \over 2}) = (a_2 + 2a_1 + 4a_0)(b_2 + 2b_1 + 4b_0) -\end{eqnarray} - -Using such points will allow the values of $f(y)$ and $g(y)$ to be independently calculated using only left shifts. For example, when $n = 2$ the -polynomial $f(2^q)$ is equal to $2^q((2^qa_2) + a_1) + a_0$. This technique of polynomial representation is known as Horner's method. - -As a general rule of the algorithm when the inputs are split into $n$ parts each there are $2n - 1$ multiplications. Each multiplication is of -multiplicands that have $n$ times fewer digits than the inputs. The asymptotic running time of this algorithm is -$O \left ( k^{lg_n(2n - 1)} \right )$ for $k$ digit inputs (\textit{assuming they have the same number of digits}). Figure~\ref{fig:exponent} -summarizes the exponents for various values of $n$. - -\begin{figure} -\begin{center} -\begin{tabular}{|c|c|c|} -\hline \textbf{Split into $n$ Parts} & \textbf{Exponent} & \textbf{Notes}\\ -\hline $2$ & $1.584962501$ & This is Karatsuba Multiplication. \\ -\hline $3$ & $1.464973520$ & This is Toom-Cook Multiplication. \\ -\hline $4$ & $1.403677461$ &\\ -\hline $5$ & $1.365212389$ &\\ -\hline $10$ & $1.278753601$ &\\ -\hline $100$ & $1.149426538$ &\\ -\hline $1000$ & $1.100270931$ &\\ -\hline $10000$ & $1.075252070$ &\\ -\hline -\end{tabular} -\end{center} -\caption{Asymptotic Running Time of Polynomial Basis Multiplication} -\label{fig:exponent} -\end{figure} - -At first it may seem like a good idea to choose $n = 1000$ since the exponent is approximately $1.1$. However, the overhead -of solving for the 2001 terms of $W(x)$ will certainly consume any savings the algorithm could offer for all but exceedingly large -numbers. - -\subsubsection{Cutoff Point} -The polynomial basis multiplication algorithms all require fewer single precision multiplications than a straight Comba approach. However, -the algorithms incur an overhead (\textit{at the $O(n)$ work level}) since they require a system of equations to be solved. This makes the -polynomial basis approach more costly to use with small inputs. - -Let $m$ represent the number of digits in the multiplicands (\textit{assume both multiplicands have the same number of digits}). There exists a -point $y$ such that when $m < y$ the polynomial basis algorithms are more costly than Comba, when $m = y$ they are roughly the same cost and -when $m > y$ the Comba methods are slower than the polynomial basis algorithms. - -The exact location of $y$ depends on several key architectural elements of the computer platform in question. - -\begin{enumerate} -\item The ratio of clock cycles for single precision multiplication versus other simpler operations such as addition, shifting, etc. For example -on the AMD Athlon the ratio is roughly $17 : 1$ while on the Intel P4 it is $29 : 1$. The higher the ratio in favour of multiplication the lower -the cutoff point $y$ will be. - -\item The complexity of the linear system of equations (\textit{for the coefficients of $W(x)$}) is. Generally speaking as the number of splits -grows the complexity grows substantially. Ideally solving the system will only involve addition, subtraction and shifting of integers. This -directly reflects on the ratio previous mentioned. - -\item To a lesser extent memory bandwidth and function call overheads. Provided the values are in the processor cache this is less of an -influence over the cutoff point. - -\end{enumerate} - -A clean cutoff point separation occurs when a point $y$ is found such that all of the cutoff point conditions are met. For example, if the point -is too low then there will be values of $m$ such that $m > y$ and the Comba method is still faster. Finding the cutoff points is fairly simple when -a high resolution timer is available. - -\subsection{Karatsuba Multiplication} -Karatsuba \cite{KARA} multiplication when originally proposed in 1962 was among the first set of algorithms to break the $O(n^2)$ barrier for -general purpose multiplication. Given two polynomial basis representations $f(x) = ax + b$ and $g(x) = cx + d$, Karatsuba proved with -light algebra \cite{KARAP} that the following polynomial is equivalent to multiplication of the two integers the polynomials represent. - -\begin{equation} -f(x) \cdot g(x) = acx^2 + ((a - b)(c - d) + ac + bd)x + bd -\end{equation} - -Using the observation that $ac$ and $bd$ could be re-used only three half sized multiplications would be required to produce the product. Applying -this algorithm recursively, the work factor becomes $O(n^{lg(3)})$ which is substantially better than the work factor $O(n^2)$ of the Comba technique. It turns -out what Karatsuba did not know or at least did not publish was that this is simply polynomial basis multiplication with the points -$\zeta_0$, $\zeta_{\infty}$ and $-\zeta_{-1}$. Consider the resultant system of equations. - -\begin{center} -\begin{tabular}{rcrcrcrc} -$\zeta_{0}$ & $=$ & & & & & $w_0$ \\ -$-\zeta_{-1}$ & $=$ & $-w_2$ & $+$ & $w_1$ & $-$ & $w_0$ \\ -$\zeta_{\infty}$ & $=$ & $w_2$ & & & & \\ -\end{tabular} -\end{center} - -By adding the first and last equation to the equation in the middle the term $w_1$ can be isolated and all three coefficients solved for. The simplicity -of this system of equations has made Karatsuba fairly popular. In fact the cutoff point is often fairly low\footnote{With LibTomMath 0.18 it is 70 and 109 digits for the Intel P4 and AMD Athlon respectively.} -making it an ideal algorithm to speed up certain public key cryptosystems such as RSA and Diffie-Hellman. It is worth noting that the point -$\zeta_1$ could be substituted for $-\zeta_{-1}$. In this case the first and third row are subtracted instead of added to the second row. - -\newpage\begin{figure}[!here] -\begin{small} -\begin{center} -\begin{tabular}{l} -\hline Algorithm \textbf{mp\_karatsuba\_mul}. \\ -\textbf{Input}. mp\_int $a$ and mp\_int $b$ \\ -\textbf{Output}. $c \leftarrow \vert a \vert \cdot \vert b \vert$ \\ -\hline \\ -1. Init the following mp\_int variables: $x0$, $x1$, $y0$, $y1$, $t1$, $x0y0$, $x1y1$.\\ -2. If step 2 failed then return(\textit{MP\_MEM}). \\ -\\ -Split the input. e.g. $a = x1 \cdot \beta^B + x0$ \\ -3. $B \leftarrow \mbox{min}(a.used, b.used)/2$ \\ -4. $x0 \leftarrow a \mbox{ (mod }\beta^B\mbox{)}$ (\textit{mp\_mod\_2d}) \\ -5. $y0 \leftarrow b \mbox{ (mod }\beta^B\mbox{)}$ \\ -6. $x1 \leftarrow \lfloor a / \beta^B \rfloor$ (\textit{mp\_rshd}) \\ -7. $y1 \leftarrow \lfloor b / \beta^B \rfloor$ \\ -\\ -Calculate the three products. \\ -8. $x0y0 \leftarrow x0 \cdot y0$ (\textit{mp\_mul}) \\ -9. $x1y1 \leftarrow x1 \cdot y1$ \\ -10. $t1 \leftarrow x1 - x0$ (\textit{mp\_sub}) \\ -11. $x0 \leftarrow y1 - y0$ \\ -12. $t1 \leftarrow t1 \cdot x0$ \\ -\\ -Calculate the middle term. \\ -13. $x0 \leftarrow x0y0 + x1y1$ \\ -14. $t1 \leftarrow x0 - t1$ \\ -\\ -Calculate the final product. \\ -15. $t1 \leftarrow t1 \cdot \beta^B$ (\textit{mp\_lshd}) \\ -16. $x1y1 \leftarrow x1y1 \cdot \beta^{2B}$ \\ -17. $t1 \leftarrow x0y0 + t1$ \\ -18. $c \leftarrow t1 + x1y1$ \\ -19. Clear all of the temporary variables. \\ -20. Return(\textit{MP\_OKAY}).\\ -\hline -\end{tabular} -\end{center} -\end{small} -\caption{Algorithm mp\_karatsuba\_mul} -\end{figure} - -\textbf{Algorithm mp\_karatsuba\_mul.} -This algorithm computes the unsigned product of two inputs using the Karatsuba multiplication algorithm. It is loosely based on the description -from Knuth \cite[pp. 294-295]{TAOCPV2}. - -\index{radix point} -In order to split the two inputs into their respective halves, a suitable \textit{radix point} must be chosen. The radix point chosen must -be used for both of the inputs meaning that it must be smaller than the smallest input. Step 3 chooses the radix point $B$ as half of the -smallest input \textbf{used} count. After the radix point is chosen the inputs are split into lower and upper halves. Step 4 and 5 -compute the lower halves. Step 6 and 7 computer the upper halves. - -After the halves have been computed the three intermediate half-size products must be computed. Step 8 and 9 compute the trivial products -$x0 \cdot y0$ and $x1 \cdot y1$. The mp\_int $x0$ is used as a temporary variable after $x1 - x0$ has been computed. By using $x0$ instead -of an additional temporary variable, the algorithm can avoid an addition memory allocation operation. - -The remaining steps 13 through 18 compute the Karatsuba polynomial through a variety of digit shifting and addition operations. - -EXAM,bn_mp_karatsuba_mul.c - -The new coding element in this routine, not seen in previous routines, is the usage of goto statements. The conventional -wisdom is that goto statements should be avoided. This is generally true, however when every single function call can fail, it makes sense -to handle error recovery with a single piece of code. Lines @61,if@ to @75,if@ handle initializing all of the temporary variables -required. Note how each of the if statements goes to a different label in case of failure. This allows the routine to correctly free only -the temporaries that have been successfully allocated so far. - -The temporary variables are all initialized using the mp\_init\_size routine since they are expected to be large. This saves the -additional reallocation that would have been necessary. Also $x0$, $x1$, $y0$ and $y1$ have to be able to hold at least their respective -number of digits for the next section of code. - -The first algebraic portion of the algorithm is to split the two inputs into their halves. However, instead of using mp\_mod\_2d and mp\_rshd -to extract the halves, the respective code has been placed inline within the body of the function. To initialize the halves, the \textbf{used} and -\textbf{sign} members are copied first. The first for loop on line @98,for@ copies the lower halves. Since they are both the same magnitude it -is simpler to calculate both lower halves in a single loop. The for loop on lines @104,for@ and @109,for@ calculate the upper halves $x1$ and -$y1$ respectively. - -By inlining the calculation of the halves, the Karatsuba multiplier has a slightly lower overhead and can be used for smaller magnitude inputs. - -When line @152,err@ is reached, the algorithm has completed succesfully. The ``error status'' variable $err$ is set to \textbf{MP\_OKAY} so that -the same code that handles errors can be used to clear the temporary variables and return. - -\subsection{Toom-Cook $3$-Way Multiplication} -Toom-Cook $3$-Way \cite{TOOM} multiplication is essentially the polynomial basis algorithm for $n = 3$ except that the points are -chosen such that $\zeta$ is easy to compute and the resulting system of equations easy to reduce. Here, the points $\zeta_{0}$, -$16 \cdot \zeta_{1 \over 2}$, $\zeta_1$, $\zeta_2$ and $\zeta_{\infty}$ make up the five required points to solve for the coefficients -of the $W(x)$. - -With the five relations that Toom-Cook specifies, the following system of equations is formed. - -\begin{center} -\begin{tabular}{rcrcrcrcrcr} -$\zeta_0$ & $=$ & $0w_4$ & $+$ & $0w_3$ & $+$ & $0w_2$ & $+$ & $0w_1$ & $+$ & $1w_0$ \\ -$16 \cdot \zeta_{1 \over 2}$ & $=$ & $1w_4$ & $+$ & $2w_3$ & $+$ & $4w_2$ & $+$ & $8w_1$ & $+$ & $16w_0$ \\ -$\zeta_1$ & $=$ & $1w_4$ & $+$ & $1w_3$ & $+$ & $1w_2$ & $+$ & $1w_1$ & $+$ & $1w_0$ \\ -$\zeta_2$ & $=$ & $16w_4$ & $+$ & $8w_3$ & $+$ & $4w_2$ & $+$ & $2w_1$ & $+$ & $1w_0$ \\ -$\zeta_{\infty}$ & $=$ & $1w_4$ & $+$ & $0w_3$ & $+$ & $0w_2$ & $+$ & $0w_1$ & $+$ & $0w_0$ \\ -\end{tabular} -\end{center} - -A trivial solution to this matrix requires $12$ subtractions, two multiplications by a small power of two, two divisions by a small power -of two, two divisions by three and one multiplication by three. All of these $19$ sub-operations require less than quadratic time, meaning that -the algorithm can be faster than a baseline multiplication. However, the greater complexity of this algorithm places the cutoff point -(\textbf{TOOM\_MUL\_CUTOFF}) where Toom-Cook becomes more efficient much higher than the Karatsuba cutoff point. - -\begin{figure}[!here] -\begin{small} -\begin{center} -\begin{tabular}{l} -\hline Algorithm \textbf{mp\_toom\_mul}. \\ -\textbf{Input}. mp\_int $a$ and mp\_int $b$ \\ -\textbf{Output}. $c \leftarrow a \cdot b $ \\ -\hline \\ -Split $a$ and $b$ into three pieces. E.g. $a = a_2 \beta^{2k} + a_1 \beta^{k} + a_0$ \\ -1. $k \leftarrow \lfloor \mbox{min}(a.used, b.used) / 3 \rfloor$ \\ -2. $a_0 \leftarrow a \mbox{ (mod }\beta^{k}\mbox{)}$ \\ -3. $a_1 \leftarrow \lfloor a / \beta^k \rfloor$, $a_1 \leftarrow a_1 \mbox{ (mod }\beta^{k}\mbox{)}$ \\ -4. $a_2 \leftarrow \lfloor a / \beta^{2k} \rfloor$, $a_2 \leftarrow a_2 \mbox{ (mod }\beta^{k}\mbox{)}$ \\ -5. $b_0 \leftarrow a \mbox{ (mod }\beta^{k}\mbox{)}$ \\ -6. $b_1 \leftarrow \lfloor a / \beta^k \rfloor$, $b_1 \leftarrow b_1 \mbox{ (mod }\beta^{k}\mbox{)}$ \\ -7. $b_2 \leftarrow \lfloor a / \beta^{2k} \rfloor$, $b_2 \leftarrow b_2 \mbox{ (mod }\beta^{k}\mbox{)}$ \\ -\\ -Find the five equations for $w_0, w_1, ..., w_4$. \\ -8. $w_0 \leftarrow a_0 \cdot b_0$ \\ -9. $w_4 \leftarrow a_2 \cdot b_2$ \\ -10. $tmp_1 \leftarrow 2 \cdot a_0$, $tmp_1 \leftarrow a_1 + tmp_1$, $tmp_1 \leftarrow 2 \cdot tmp_1$, $tmp_1 \leftarrow tmp_1 + a_2$ \\ -11. $tmp_2 \leftarrow 2 \cdot b_0$, $tmp_2 \leftarrow b_1 + tmp_2$, $tmp_2 \leftarrow 2 \cdot tmp_2$, $tmp_2 \leftarrow tmp_2 + b_2$ \\ -12. $w_1 \leftarrow tmp_1 \cdot tmp_2$ \\ -13. $tmp_1 \leftarrow 2 \cdot a_2$, $tmp_1 \leftarrow a_1 + tmp_1$, $tmp_1 \leftarrow 2 \cdot tmp_1$, $tmp_1 \leftarrow tmp_1 + a_0$ \\ -14. $tmp_2 \leftarrow 2 \cdot b_2$, $tmp_2 \leftarrow b_1 + tmp_2$, $tmp_2 \leftarrow 2 \cdot tmp_2$, $tmp_2 \leftarrow tmp_2 + b_0$ \\ -15. $w_3 \leftarrow tmp_1 \cdot tmp_2$ \\ -16. $tmp_1 \leftarrow a_0 + a_1$, $tmp_1 \leftarrow tmp_1 + a_2$, $tmp_2 \leftarrow b_0 + b_1$, $tmp_2 \leftarrow tmp_2 + b_2$ \\ -17. $w_2 \leftarrow tmp_1 \cdot tmp_2$ \\ -\\ -Continued on the next page.\\ -\hline -\end{tabular} -\end{center} -\end{small} -\caption{Algorithm mp\_toom\_mul} -\end{figure} - -\newpage\begin{figure}[!here] -\begin{small} -\begin{center} -\begin{tabular}{l} -\hline Algorithm \textbf{mp\_toom\_mul} (continued). \\ -\textbf{Input}. mp\_int $a$ and mp\_int $b$ \\ -\textbf{Output}. $c \leftarrow a \cdot b $ \\ -\hline \\ -Now solve the system of equations. \\ -18. $w_1 \leftarrow w_4 - w_1$, $w_3 \leftarrow w_3 - w_0$ \\ -19. $w_1 \leftarrow \lfloor w_1 / 2 \rfloor$, $w_3 \leftarrow \lfloor w_3 / 2 \rfloor$ \\ -20. $w_2 \leftarrow w_2 - w_0$, $w_2 \leftarrow w_2 - w_4$ \\ -21. $w_1 \leftarrow w_1 - w_2$, $w_3 \leftarrow w_3 - w_2$ \\ -22. $tmp_1 \leftarrow 8 \cdot w_0$, $w_1 \leftarrow w_1 - tmp_1$, $tmp_1 \leftarrow 8 \cdot w_4$, $w_3 \leftarrow w_3 - tmp_1$ \\ -23. $w_2 \leftarrow 3 \cdot w_2$, $w_2 \leftarrow w_2 - w_1$, $w_2 \leftarrow w_2 - w_3$ \\ -24. $w_1 \leftarrow w_1 - w_2$, $w_3 \leftarrow w_3 - w_2$ \\ -25. $w_1 \leftarrow \lfloor w_1 / 3 \rfloor, w_3 \leftarrow \lfloor w_3 / 3 \rfloor$ \\ -\\ -Now substitute $\beta^k$ for $x$ by shifting $w_0, w_1, ..., w_4$. \\ -26. for $n$ from $1$ to $4$ do \\ -\hspace{3mm}26.1 $w_n \leftarrow w_n \cdot \beta^{nk}$ \\ -27. $c \leftarrow w_0 + w_1$, $c \leftarrow c + w_2$, $c \leftarrow c + w_3$, $c \leftarrow c + w_4$ \\ -28. Return(\textit{MP\_OKAY}) \\ -\hline -\end{tabular} -\end{center} -\end{small} -\caption{Algorithm mp\_toom\_mul (continued)} -\end{figure} - -\textbf{Algorithm mp\_toom\_mul.} -This algorithm computes the product of two mp\_int variables $a$ and $b$ using the Toom-Cook approach. Compared to the Karatsuba multiplication, this -algorithm has a lower asymptotic running time of approximately $O(n^{1.464})$ but at an obvious cost in overhead. In this -description, several statements have been compounded to save space. The intention is that the statements are executed from left to right across -any given step. - -The two inputs $a$ and $b$ are first split into three $k$-digit integers $a_0, a_1, a_2$ and $b_0, b_1, b_2$ respectively. From these smaller -integers the coefficients of the polynomial basis representations $f(x)$ and $g(x)$ are known and can be used to find the relations required. - -The first two relations $w_0$ and $w_4$ are the points $\zeta_{0}$ and $\zeta_{\infty}$ respectively. The relation $w_1, w_2$ and $w_3$ correspond -to the points $16 \cdot \zeta_{1 \over 2}, \zeta_{2}$ and $\zeta_{1}$ respectively. These are found using logical shifts to independently find -$f(y)$ and $g(y)$ which significantly speeds up the algorithm. - -After the five relations $w_0, w_1, \ldots, w_4$ have been computed, the system they represent must be solved in order for the unknown coefficients -$w_1, w_2$ and $w_3$ to be isolated. The steps 18 through 25 perform the system reduction required as previously described. Each step of -the reduction represents the comparable matrix operation that would be performed had this been performed by pencil. For example, step 18 indicates -that row $1$ must be subtracted from row $4$ and simultaneously row $0$ subtracted from row $3$. - -Once the coeffients have been isolated, the polynomial $W(x) = \sum_{i=0}^{2n} w_i x^i$ is known. By substituting $\beta^{k}$ for $x$, the integer -result $a \cdot b$ is produced. - -EXAM,bn_mp_toom_mul.c - --- Comments to be added during editing phase. - -\subsection{Signed Multiplication} -Now that algorithms to handle multiplications of every useful dimensions have been developed, a rather simple finishing touch is required. So far all -of the multiplication algorithms have been unsigned multiplications which leaves only a signed multiplication algorithm to be established. - -\newpage\begin{figure}[!here] -\begin{small} -\begin{center} -\begin{tabular}{l} -\hline Algorithm \textbf{mp\_mul}. \\ -\textbf{Input}. mp\_int $a$ and mp\_int $b$ \\ -\textbf{Output}. $c \leftarrow a \cdot b$ \\ -\hline \\ -1. If $a.sign = b.sign$ then \\ -\hspace{3mm}1.1 $sign = MP\_ZPOS$ \\ -2. else \\ -\hspace{3mm}2.1 $sign = MP\_ZNEG$ \\ -3. If min$(a.used, b.used) \ge TOOM\_MUL\_CUTOFF$ then \\ -\hspace{3mm}3.1 $c \leftarrow a \cdot b$ using algorithm mp\_toom\_mul \\ -4. else if min$(a.used, b.used) \ge KARATSUBA\_MUL\_CUTOFF$ then \\ -\hspace{3mm}4.1 $c \leftarrow a \cdot b$ using algorithm mp\_karatsuba\_mul \\ -5. else \\ -\hspace{3mm}5.1 $digs \leftarrow a.used + b.used + 1$ \\ -\hspace{3mm}5.2 If $digs < MP\_ARRAY$ and min$(a.used, b.used) \le \delta$ then \\ -\hspace{6mm}5.2.1 $c \leftarrow a \cdot b \mbox{ (mod }\beta^{digs}\mbox{)}$ using algorithm fast\_s\_mp\_mul\_digs. \\ -\hspace{3mm}5.3 else \\ -\hspace{6mm}5.3.1 $c \leftarrow a \cdot b \mbox{ (mod }\beta^{digs}\mbox{)}$ using algorithm s\_mp\_mul\_digs. \\ -6. $c.sign \leftarrow sign$ \\ -7. Return the result of the unsigned multiplication performed. \\ -\hline -\end{tabular} -\end{center} -\end{small} -\caption{Algorithm mp\_mul} -\end{figure} - -\textbf{Algorithm mp\_mul.} -This algorithm performs the signed multiplication of two inputs. It will make use of any of the three unsigned multiplication algorithms -available when the input is of appropriate size. The \textbf{sign} of the result is not set until the end of the algorithm since algorithm -s\_mp\_mul\_digs will clear it. - -EXAM,bn_mp_mul.c - -The implementation is rather simplistic and is not particularly noteworthy. Line @22,?@ computes the sign of the result using the ``?'' -operator from the C programming language. Line @37,<<@ computes $\delta$ using the fact that $1 << k$ is equal to $2^k$. - -\section{Squaring} - -Squaring is a special case of multiplication where both multiplicands are equal. At first it may seem like there is no significant optimization -available but in fact there is. Consider the multiplication of $576$ against $241$. In total there will be nine single precision multiplications -performed which are $1\cdot 6$, $1 \cdot 7$, $1 \cdot 5$, $4 \cdot 6$, $4 \cdot 7$, $4 \cdot 5$, $2 \cdot 6$, $2 \cdot 7$ and $2 \cdot 5$. Now consider -the multiplication of $123$ against $123$. The nine products are $3 \cdot 3$, $3 \cdot 2$, $3 \cdot 1$, $2 \cdot 3$, $2 \cdot 2$, $2 \cdot 1$, -$1 \cdot 3$, $1 \cdot 2$ and $1 \cdot 1$. On closer inspection some of the products are equivalent. For example, $3 \cdot 2 = 2 \cdot 3$ -and $3 \cdot 1 = 1 \cdot 3$. - -For any $n$-digit input, there are ${{\left (n^2 + n \right)}\over 2}$ possible unique single precision multiplications required compared to the $n^2$ -required for multiplication. The following diagram gives an example of the operations required. - -\begin{figure}[here] -\begin{center} -\begin{tabular}{ccccc|c} -&&1&2&3&\\ -$\times$ &&1&2&3&\\ -\hline && $3 \cdot 1$ & $3 \cdot 2$ & $3 \cdot 3$ & Row 0\\ - & $2 \cdot 1$ & $2 \cdot 2$ & $2 \cdot 3$ && Row 1 \\ - $1 \cdot 1$ & $1 \cdot 2$ & $1 \cdot 3$ &&& Row 2 \\ -\end{tabular} -\end{center} -\caption{Squaring Optimization Diagram} -\end{figure} - -MARK,SQUARE -Starting from zero and numbering the columns from right to left a very simple pattern becomes obvious. For the purposes of this discussion let $x$ -represent the number being squared. The first observation is that in row $k$ the $2k$'th column of the product has a $\left (x_k \right)^2$ term in it. - -The second observation is that every column $j$ in row $k$ where $j \ne 2k$ is part of a double product. Every non-square term of a column will -appear twice hence the name ``double product''. Every odd column is made up entirely of double products. In fact every column is made up of double -products and at most one square (\textit{see the exercise section}). - -The third and final observation is that for row $k$ the first unique non-square term, that is, one that hasn't already appeared in an earlier row, -occurs at column $2k + 1$. For example, on row $1$ of the previous squaring, column one is part of the double product with column one from row zero. -Column two of row one is a square and column three is the first unique column. - -\subsection{The Baseline Squaring Algorithm} -The baseline squaring algorithm is meant to be a catch-all squaring algorithm. It will handle any of the input sizes that the faster routines -will not handle. - -\newpage\begin{figure}[!here] -\begin{small} -\begin{center} -\begin{tabular}{l} -\hline Algorithm \textbf{s\_mp\_sqr}. \\ -\textbf{Input}. mp\_int $a$ \\ -\textbf{Output}. $b \leftarrow a^2$ \\ -\hline \\ -1. Init a temporary mp\_int of at least $2 \cdot a.used +1$ digits. (\textit{mp\_init\_size}) \\ -2. If step 1 failed return(\textit{MP\_MEM}) \\ -3. $t.used \leftarrow 2 \cdot a.used + 1$ \\ -4. For $ix$ from 0 to $a.used - 1$ do \\ -\hspace{3mm}Calculate the square. \\ -\hspace{3mm}4.1 $\hat r \leftarrow t_{2ix} + \left (a_{ix} \right )^2$ \\ -\hspace{3mm}4.2 $t_{2ix} \leftarrow \hat r \mbox{ (mod }\beta\mbox{)}$ \\ -\hspace{3mm}Calculate the double products after the square. \\ -\hspace{3mm}4.3 $u \leftarrow \lfloor \hat r / \beta \rfloor$ \\ -\hspace{3mm}4.4 For $iy$ from $ix + 1$ to $a.used - 1$ do \\ -\hspace{6mm}4.4.1 $\hat r \leftarrow 2 \cdot a_{ix}a_{iy} + t_{ix + iy} + u$ \\ -\hspace{6mm}4.4.2 $t_{ix + iy} \leftarrow \hat r \mbox{ (mod }\beta\mbox{)}$ \\ -\hspace{6mm}4.4.3 $u \leftarrow \lfloor \hat r / \beta \rfloor$ \\ -\hspace{3mm}Set the last carry. \\ -\hspace{3mm}4.5 While $u > 0$ do \\ -\hspace{6mm}4.5.1 $iy \leftarrow iy + 1$ \\ -\hspace{6mm}4.5.2 $\hat r \leftarrow t_{ix + iy} + u$ \\ -\hspace{6mm}4.5.3 $t_{ix + iy} \leftarrow \hat r \mbox{ (mod }\beta\mbox{)}$ \\ -\hspace{6mm}4.5.4 $u \leftarrow \lfloor \hat r / \beta \rfloor$ \\ -5. Clamp excess digits of $t$. (\textit{mp\_clamp}) \\ -6. Exchange $b$ and $t$. \\ -7. Clear $t$ (\textit{mp\_clear}) \\ -8. Return(\textit{MP\_OKAY}) \\ -\hline -\end{tabular} -\end{center} -\end{small} -\caption{Algorithm s\_mp\_sqr} -\end{figure} - -\textbf{Algorithm s\_mp\_sqr.} -This algorithm computes the square of an input using the three observations on squaring. It is based fairly faithfully on algorithm 14.16 of HAC -\cite[pp.596-597]{HAC}. Similar to algorithm s\_mp\_mul\_digs, a temporary mp\_int is allocated to hold the result of the squaring. This allows the -destination mp\_int to be the same as the source mp\_int. - -The outer loop of this algorithm begins on step 4. It is best to think of the outer loop as walking down the rows of the partial results, while -the inner loop computes the columns of the partial result. Step 4.1 and 4.2 compute the square term for each row, and step 4.3 and 4.4 propagate -the carry and compute the double products. - -The requirement that a mp\_word be able to represent the range $0 \le x < 2 \beta^2$ arises from this -very algorithm. The product $a_{ix}a_{iy}$ will lie in the range $0 \le x \le \beta^2 - 2\beta + 1$ which is obviously less than $\beta^2$ meaning that -when it is multiplied by two, it can be properly represented by a mp\_word. - -Similar to algorithm s\_mp\_mul\_digs, after every pass of the inner loop, the destination is correctly set to the sum of all of the partial -results calculated so far. This involves expensive carry propagation which will be eliminated in the next algorithm. - -EXAM,bn_s_mp_sqr.c - -Inside the outer loop (\textit{see line @32,for@}) the square term is calculated on line @35,r =@. Line @42,>>@ extracts the carry from the square -term. Aliases for $a_{ix}$ and $t_{ix+iy}$ are initialized on lines @45,tmpx@ and @48,tmpt@ respectively. The doubling is performed using two -additions (\textit{see line @57,r + r@}) since it is usually faster than shifting,if not at least as fast. - -\subsection{Faster Squaring by the ``Comba'' Method} -A major drawback to the baseline method is the requirement for single precision shifting inside the $O(n^2)$ nested loop. Squaring has an additional -drawback that it must double the product inside the inner loop as well. As for multiplication, the Comba technique can be used to eliminate these -performance hazards. - -The first obvious solution is to make an array of mp\_words which will hold all of the columns. This will indeed eliminate all of the carry -propagation operations from the inner loop. However, the inner product must still be doubled $O(n^2)$ times. The solution stems from the simple fact -that $2a + 2b + 2c = 2(a + b + c)$. That is the sum of all of the double products is equal to double the sum of all the products. For example, -$ab + ba + ac + ca = 2ab + 2ac = 2(ab + ac)$. - -However, we cannot simply double all of the columns, since the squares appear only once per row. The most practical solution is to have two mp\_word -arrays. One array will hold the squares and the other array will hold the double products. With both arrays the doubling and carry propagation can be -moved to a $O(n)$ work level outside the $O(n^2)$ level. - -\newpage\begin{figure}[!here] -\begin{small} -\begin{center} -\begin{tabular}{l} -\hline Algorithm \textbf{fast\_s\_mp\_sqr}. \\ -\textbf{Input}. mp\_int $a$ \\ -\textbf{Output}. $b \leftarrow a^2$ \\ -\hline \\ -Place two arrays of \textbf{MP\_WARRAY} mp\_words named $\hat W$ and $\hat {X}$ on the stack. \\ -1. If $b.alloc < 2a.used + 1$ then grow $b$ to $2a.used + 1$ digits. (\textit{mp\_grow}). \\ -2. If step 1 failed return(\textit{MP\_MEM}). \\ -3. for $ix$ from $0$ to $2a.used + 1$ do \\ -\hspace{3mm}3.1 $\hat W_{ix} \leftarrow 0$ \\ -\hspace{3mm}3.2 $\hat {X}_{ix} \leftarrow 0$ \\ -4. for $ix$ from $0$ to $a.used - 1$ do \\ -\hspace{3mm}Compute the square.\\ -\hspace{3mm}4.1 $\hat {X}_{ix+ix} \leftarrow \left ( a_ix \right )^2$ \\ -\\ -\hspace{3mm}Compute the double products.\\ -\hspace{3mm}4.2 for $iy$ from $ix + 1$ to $a.used - 1$ do \\ -\hspace{6mm}4.2.1 $\hat W_{ix+iy} \leftarrow \hat W_{ix+iy} + a_{ix}a_{iy}$ \\ -5. $oldused \leftarrow b.used$ \\ -6. $b.used \leftarrow 2a.used + 1$ \\ -\\ -Double the products and propagate the carries simultaneously. \\ -7. $\hat W_0 \leftarrow 2 \hat W_0 + \hat {X}_0$ \\ -8. for $ix$ from $1$ to $2a.used$ do \\ -\hspace{3mm}8.1 $\hat W_{ix} \leftarrow 2 \hat W_{ix} + \hat {X}_{ix}$ \\ -\hspace{3mm}8.2 $\hat W_{ix} \leftarrow \hat W_{ix} + \lfloor \hat W_{ix - 1} / \beta \rfloor$ \\ -\hspace{3mm}8.3 $b_{ix-1} \leftarrow W_{ix-1} \mbox{ (mod }\beta\mbox{)}$ \\ -9. $b_{2a.used} \leftarrow \hat W_{2a.used} \mbox{ (mod }\beta\mbox{)}$ \\ -10. if $2a.used + 1 < oldused$ then do \\ -\hspace{3mm}10.1 for $ix$ from $2a.used + 1$ to $oldused$ do \\ -\hspace{6mm}10.1.1 $b_{ix} \leftarrow 0$ \\ -11. Clamp excess digits from $b$. (\textit{mp\_clamp}) \\ -12. Return(\textit{MP\_OKAY}). \\ -\hline -\end{tabular} -\end{center} -\end{small} -\caption{Algorithm fast\_s\_mp\_sqr} -\end{figure} - -\textbf{Algorithm fast\_s\_mp\_sqr.} -This algorithm computes the square of an input using the Comba technique. It is designed to be a replacement for algorithm s\_mp\_sqr when -the number of input digits is less than \textbf{MP\_WARRAY} and less than $\delta \over 2$. - -This routine requires two arrays of mp\_words to be placed on the stack. The first array $\hat W$ will hold the double products and the second -array $\hat X$ will hold the squares. Though only at most $MP\_WARRAY \over 2$ words of $\hat X$ are used, it has proven faster on most -processors to simply make it a full size array. - -The loop on step 3 will zero the two arrays to prepare them for the squaring step. Step 4.1 computes the squares of the product. Note how -it simply assigns the value into the $\hat X$ array. The nested loop on step 4.2 computes the doubles of the products. This loop -computes the sum of the products for each column. They are not doubled until later. - -After the squaring loop, the products stored in $\hat W$ musted be doubled and the carries propagated forwards. It makes sense to do both -operations at the same time. The expression $\hat W_{ix} \leftarrow 2 \hat W_{ix} + \hat {X}_{ix}$ computes the sum of the double product and the -squares in place. - -EXAM,bn_fast_s_mp_sqr.c - --- Write something deep and insightful later, Tom. - -\subsection{Polynomial Basis Squaring} -The same algorithm that performs optimal polynomial basis multiplication can be used to perform polynomial basis squaring. The minor exception -is that $\zeta_y = f(y)g(y)$ is actually equivalent to $\zeta_y = f(y)^2$ since $f(y) = g(y)$. Instead of performing $2n + 1$ -multiplications to find the $\zeta$ relations, squaring operations are performed instead. - -\subsection{Karatsuba Squaring} -Let $f(x) = ax + b$ represent the polynomial basis representation of a number to square. -Let $h(x) = \left ( f(x) \right )^2$ represent the square of the polynomial. The Karatsuba equation can be modified to square a -number with the following equation. - -\begin{equation} -h(x) = a^2x^2 + \left (a^2 + b^2 - (a - b)^2 \right )x + b^2 -\end{equation} - -Upon closer inspection this equation only requires the calculation of three half-sized squares: $a^2$, $b^2$ and $(a - b)^2$. As in -Karatsuba multiplication, this algorithm can be applied recursively on the input and will achieve an asymptotic running time of -$O \left ( n^{lg(3)} \right )$. - -You might ask yourself, if the asymptotic time of Karatsuba squaring and multiplication is the same, why not simply use the multiplication algorithm -instead? The answer to this arises from the cutoff point for squaring. As in multiplication there exists a cutoff point, at which the -time required for a Comba based squaring and a Karatsuba based squaring meet. Due to the overhead inherent in the Karatsuba method, the cutoff -point is fairly high. For example, on an AMD Athlon XP processor with $\beta = 2^{28}$, the cutoff point is around 127 digits. - -Consider squaring a 200 digit number with this technique. It will be split into two 100 digit halves which are subsequently squared. -The 100 digit halves will not be squared using Karatsuba, but instead using the faster Comba based squaring algorithm. If Karatsuba multiplication -were used instead, the 100 digit numbers would be squared with a slower Comba based multiplication. - -\newpage\begin{figure}[!here] -\begin{small} -\begin{center} -\begin{tabular}{l} -\hline Algorithm \textbf{mp\_karatsuba\_sqr}. \\ -\textbf{Input}. mp\_int $a$ \\ -\textbf{Output}. $b \leftarrow a^2$ \\ -\hline \\ -1. Initialize the following temporary mp\_ints: $x0$, $x1$, $t1$, $t2$, $x0x0$ and $x1x1$. \\ -2. If any of the initializations on step 1 failed return(\textit{MP\_MEM}). \\ -\\ -Split the input. e.g. $a = x1\beta^B + x0$ \\ -3. $B \leftarrow \lfloor a.used / 2 \rfloor$ \\ -4. $x0 \leftarrow a \mbox{ (mod }\beta^B\mbox{)}$ (\textit{mp\_mod\_2d}) \\ -5. $x1 \leftarrow \lfloor a / \beta^B \rfloor$ (\textit{mp\_lshd}) \\ -\\ -Calculate the three squares. \\ -6. $x0x0 \leftarrow x0^2$ (\textit{mp\_sqr}) \\ -7. $x1x1 \leftarrow x1^2$ \\ -8. $t1 \leftarrow x1 - x0$ (\textit{mp\_sub}) \\ -9. $t1 \leftarrow t1^2$ \\ -\\ -Compute the middle term. \\ -10. $t2 \leftarrow x0x0 + x1x1$ (\textit{s\_mp\_add}) \\ -11. $t1 \leftarrow t2 - t1$ \\ -\\ -Compute final product. \\ -12. $t1 \leftarrow t1\beta^B$ (\textit{mp\_lshd}) \\ -13. $x1x1 \leftarrow x1x1\beta^{2B}$ \\ -14. $t1 \leftarrow t1 + x0x0$ \\ -15. $b \leftarrow t1 + x1x1$ \\ -16. Return(\textit{MP\_OKAY}). \\ -\hline -\end{tabular} -\end{center} -\end{small} -\caption{Algorithm mp\_karatsuba\_sqr} -\end{figure} - -\textbf{Algorithm mp\_karatsuba\_sqr.} -This algorithm computes the square of an input $a$ using the Karatsuba technique. This algorithm is very similar to the Karatsuba based -multiplication algorithm with the exception that the three half-size multiplications have been replaced with three half-size squarings. - -The radix point for squaring is simply placed exactly in the middle of the digits when the input has an odd number of digits, otherwise it is -placed just below the middle. Step 3, 4 and 5 compute the two halves required using $B$ -as the radix point. The first two squares in steps 6 and 7 are rather straightforward while the last square is of a more compact form. - -By expanding $\left (x1 - x0 \right )^2$, the $x1^2$ and $x0^2$ terms in the middle disappear, that is $x1^2 + x0^2 - (x1 - x0)^2 = 2 \cdot x0 \cdot x1$. -Now if $5n$ single precision additions and a squaring of $n$-digits is faster than multiplying two $n$-digit numbers and doubling then -this method is faster. Assuming no further recursions occur, the difference can be estimated with the following inequality. - -Let $p$ represent the cost of a single precision addition and $q$ the cost of a single precision multiplication both in terms of time\footnote{Or -machine clock cycles.}. - -\begin{equation} -5pn +{{q(n^2 + n)} \over 2} \le pn + qn^2 -\end{equation} - -For example, on an AMD Athlon XP processor $p = {1 \over 3}$ and $q = 6$. This implies that the following inequality should hold. -\begin{center} -\begin{tabular}{rcl} -${5n \over 3} + 3n^2 + 3n$ & $<$ & ${n \over 3} + 6n^2$ \\ -${5 \over 3} + 3n + 3$ & $<$ & ${1 \over 3} + 6n$ \\ -${13 \over 9}$ & $<$ & $n$ \\ -\end{tabular} -\end{center} - -This results in a cutoff point around $n = 2$. As a consequence it is actually faster to compute the middle term the ``long way'' on processors -where multiplication is substantially slower\footnote{On the Athlon there is a 1:17 ratio between clock cycles for addition and multiplication. On -the Intel P4 processor this ratio is 1:29 making this method even more beneficial. The only common exception is the ARMv4 processor which has a -ratio of 1:7. } than simpler operations such as addition. - -EXAM,bn_mp_karatsuba_sqr.c - -This implementation is largely based on the implementation of algorithm mp\_karatsuba\_mul. It uses the same inline style to copy and -shift the input into the two halves. The loop from line @54,{@ to line @70,}@ has been modified since only one input exists. The \textbf{used} -count of both $x0$ and $x1$ is fixed up and $x0$ is clamped before the calculations begin. At this point $x1$ and $x0$ are valid equivalents -to the respective halves as if mp\_rshd and mp\_mod\_2d had been used. - -By inlining the copy and shift operations the cutoff point for Karatsuba multiplication can be lowered. On the Athlon the cutoff point -is exactly at the point where Comba squaring can no longer be used (\textit{128 digits}). On slower processors such as the Intel P4 -it is actually below the Comba limit (\textit{at 110 digits}). - -This routine uses the same error trap coding style as mp\_karatsuba\_sqr. As the temporary variables are initialized errors are redirected to -the error trap higher up. If the algorithm completes without error the error code is set to \textbf{MP\_OKAY} and mp\_clears are executed normally. - -\textit{Last paragraph sucks. re-write! -- Tom} - -\subsection{Toom-Cook Squaring} -The Toom-Cook squaring algorithm mp\_toom\_sqr is heavily based on the algorithm mp\_toom\_mul with the exception that squarings are used -instead of multiplication to find the five relations.. The reader is encouraged to read the description of the latter algorithm and try to -derive their own Toom-Cook squaring algorithm. - -\subsection{High Level Squaring} -\newpage\begin{figure}[!here] -\begin{small} -\begin{center} -\begin{tabular}{l} -\hline Algorithm \textbf{mp\_sqr}. \\ -\textbf{Input}. mp\_int $a$ \\ -\textbf{Output}. $b \leftarrow a^2$ \\ -\hline \\ -1. If $a.used \ge TOOM\_SQR\_CUTOFF$ then \\ -\hspace{3mm}1.1 $b \leftarrow a^2$ using algorithm mp\_toom\_sqr \\ -2. else if $a.used \ge KARATSUBA\_SQR\_CUTOFF$ then \\ -\hspace{3mm}2.1 $b \leftarrow a^2$ using algorithm mp\_karatsuba\_sqr \\ -3. else \\ -\hspace{3mm}3.1 $digs \leftarrow a.used + b.used + 1$ \\ -\hspace{3mm}3.2 If $digs < MP\_ARRAY$ and $a.used \le \delta$ then \\ -\hspace{6mm}3.2.1 $b \leftarrow a^2$ using algorithm fast\_s\_mp\_sqr. \\ -\hspace{3mm}3.3 else \\ -\hspace{6mm}3.3.1 $b \leftarrow a^2$ using algorithm s\_mp\_sqr. \\ -4. $b.sign \leftarrow MP\_ZPOS$ \\ -5. Return the result of the unsigned squaring performed. \\ -\hline -\end{tabular} -\end{center} -\end{small} -\caption{Algorithm mp\_sqr} -\end{figure} - -\textbf{Algorithm mp\_sqr.} -This algorithm computes the square of the input using one of four different algorithms. If the input is very large and has at least -\textbf{TOOM\_SQR\_CUTOFF} or \textbf{KARATSUBA\_SQR\_CUTOFF} digits then either the Toom-Cook or the Karatsuba Squaring algorithm is used. If -neither of the polynomial basis algorithms should be used then either the Comba or baseline algorithm is used. - -EXAM,bn_mp_sqr.c - -\section*{Exercises} -\begin{tabular}{cl} -$\left [ 3 \right ] $ & Devise an efficient algorithm for selection of the radix point to handle inputs \\ - & that have different number of digits in Karatsuba multiplication. \\ - & \\ -$\left [ 3 \right ] $ & In ~SQUARE~ the fact that every column of a squaring is made up \\ - & of double products and at most one square is stated. Prove this statement. \\ - & \\ -$\left [ 2 \right ] $ & In the Comba squaring algorithm half of the $\hat X$ variables are not used. \\ - & Revise algorithm fast\_s\_mp\_sqr to shrink the $\hat X$ array. \\ - & \\ -$\left [ 3 \right ] $ & Prove the equation for Karatsuba squaring. \\ - & \\ -$\left [ 1 \right ] $ & Prove that Karatsuba squaring requires $O \left (n^{lg(3)} \right )$ time. \\ - & \\ -$\left [ 2 \right ] $ & Determine the minimal ratio between addition and multiplication clock cycles \\ - & required for equation $6.7$ to be true. \\ - & \\ -\end{tabular} - -\chapter{Modular Reduction} -MARK,REDUCTION -\section{Basics of Modular Reduction} -\index{modular residue} -Modular reduction is an operation that arises quite often within public key cryptography algorithms and various number theoretic algorithms, -such as factoring. Modular reduction algorithms are the third class of algorithms of the ``multipliers'' set. A number $a$ is said to be reduced -modulo another number $b$ by finding the remainder of the division $a/b$. - -Modular reduction is equivalent to solving for $r$ in the following equation. $a = bq + r$ where $q = \lfloor a/b \rfloor$. The result -$r$ is said to be ``congruent to $a$ modulo $b$'' which is also written as $r \equiv a \mbox{ (mod }b\mbox{)}$. In other vernacular $r$ is known as the -``modular residue'' which leads to ``quadratic residue''\footnote{That's fancy talk for $b \equiv a^2 \mbox{ (mod }p\mbox{)}$.} and -other forms of residues. - -\index{modulus} -Modular reductions are normally used to form finite groups such as fields and rings. For example, in the RSA public key algorithm \cite{RSAPAPER} -two private primes $p$ and $q$ are chosen which when multiplied $n = pq$ forms a composite modulus. When operations such as multiplication and -squaring are performed on units of the ring $\Z_n$ a finite multiplicative sub-group is formed. - -Modular reductions have a variety of other useful properties. For example, a number $x$ is a square if and only if it is a quadratic -residue modulo a prime. With a finite set of primes $B = \left < p_0, p_1, \ldots, p_n \right >$ a quick test for whether $x$ is square or not can -be performed\footnote{Provided none of the primes from $B$ divide $x$.}. Consider the figure~\ref{fig:QR} with the candiate $x = 955621$ a simple -set of modular reductions modulo $3, 5, \ldots, 11$ may detect whether $x$ is a square or not. In this case $955621 \equiv 7 \mbox{ (mod }11\mbox{)}$ -and since $7$ is not a quadratic residue modulo $11$ the number $955621$ is not a square. - -\begin{figure} -\begin{center} -\begin{tabular}{|c|l|} -\hline \textbf{Prime} & \textbf{Quadratic Residues} \\ -\hline $3$ & $1$ \\ -\hline $5$ & $1, 4$ \\ -\hline $7$ & $1, 2, 4$ \\ -\hline $11$ & $1, 3, 4, 5, 9$ \\ -\hline -\end{tabular} -\end{center} -\caption{Quadratic Residues for primes less than $13$} -\label{fig:QR} -\end{figure} - -The most common usage for performance driven modular reductions is in modular exponentiation algorithms. That is to compute -$d = a^b \mbox{ (mod }c\mbox{)}$ as fast as possible. As will be discussed in the subsequent chapter there exists fast algorithms for computing -modular exponentiations without having to perform (\textit{in this example}) $b - 1$ multiplications. These algorithms will produce partial -results in the range $0 \le x < c^2$ which can be taken advantage of to create several efficient algorithms. - -\section{The Barrett Reduction} -The Barrett reduction algorithm \cite{BARRETT} was inspired by fast division algorithms which multiply by the reciprocal to emulate -division. Barretts observation was that the residue $c$ of $a$ modulo $b$ is equal to - -\begin{equation} -c = a - b \cdot \lfloor a/b \rfloor -\end{equation} - -Since algorithms such as modular exponentiation would be using the same modulus extensively, typical DSP intuition would indicate the next step -would be to replace $a/b$ by a multiplication by the reciprocal. However, DSP intuition on its own will not work as these numbers are considerably -larger than the precision of common DSP floating point data types. It would take another common optimization to optimize the algorithm. - -\subsection{Fixed Point Arithmetic} -The trick used to optimize the above equation is based on a technique of emulating floating point data types with fixed precision integers. Fixed -point arithmetic would vastly popularlize the ``3d-shooter'' genre of games in the mid 1990s when floating point units were fairly slow. The idea behind -fixed point arithmetic is to take a normal $k$-bit integer data type and break it into $p$-bit integer and a $q$-bit fraction part -(\textit{where $p+q = k$}). - -In this system a $k$-bit integer $n$ would actually represent $n/2^q$. For example, with $q = 4$ the integer $n = 37$ would actually represent the -value $2.3125$. To multiply two fixed point numbers the integers are multiplied using traditional arithmetic and subsequently normalized. For example, -with $q = 4$ to multiply the integers $9$ and $5$ they must be converted to fixed point first by multiplying by $2^q$. Let $a = 9(2^q)$ -represent the fixed point representation of $9$ and $b = 5(2^q)$ represent the fixed point representation of $5$. The product $ab$ is equal to -$45(2^{2q})$ which when normalized produces $45(2^q)$. - -Using fixed point arithmetic division can be easily achieved by multiplying by the reciprocal. If $2^q$ is equivalent to one than $2^q/b$ is -equivalent to $1/b$ using real arithmetic. Using this fact dividing an integer $a$ by another integer $b$ can be achieved with the following -expression. - -\begin{equation} -\lfloor (a \cdot (\lfloor 2^q / b \rfloor))/2^q \rfloor -\end{equation} - -The precision of the division is proportional to the value of $q$. If the divisor $b$ is used frequently as is the case with -modular exponentiation pre-computing $2^q/b$ will allow a division to be performed with a multiplication and a right shift. Both operations -are considerably faster than division on most processors. - -Consider dividing $19$ by $5$. The correct result is $\lfloor 19/5 \rfloor = 3$. With $q = 3$ the reciprocal is $\lfloor 2^q/5 \rfloor = 1$ which -leads to a product of $19$ which when divided by $2^q$ produces $2$. However, with $q = 4$ the reciprocal is $\lfloor 2^q/5 \rfloor = 3$ and -the result of the emulated division is $\lfloor 3 \cdot 19 / 2^q \rfloor = 3$ which is correct. - -Plugging this form of divison into the original equation the following modular residue equation arises. - -\begin{equation} -c = a - b \cdot \lfloor (a \cdot (\lfloor 2^q / b \rfloor))/2^q \rfloor -\end{equation} - -Using the notation from \cite{BARRETT} the value of $\lfloor 2^q / b \rfloor$ will be represented by the $\mu$ symbol. Using the $\mu$ -variable also helps re-inforce the idea that it is meant to be computed once and re-used. - -\begin{equation} -c = a - b \cdot \lfloor (a \cdot \mu)/2^q \rfloor -\end{equation} - -Provided that $2^q > b^2$ this algorithm will produce a quotient that is either exactly correct or off by a value of one. Let $n$ represent -the number of digits in $b$. This algorithm requires approximately $2n^2$ single precision multiplications to produce the quotient and -another $n^2$ single precision multiplications to find the residue. In total $3n^2$ single precision multiplications are required to -reduce the number. - -For example, if $b = 1179677$ and $q = 41$ ($2^q > b^2$), then the reciprocal $\mu$ is equal to $\lfloor 2^q / b \rfloor = 1864089$. Consider reducing -$a = 180388626447$ modulo $b$ using the above reduction equation. The quotient using the new formula is $\lfloor (a \cdot \mu) / 2^q \rfloor = 152913$. -By subtracting $152913b$ from $a$ the correct residue $a \equiv 677346 \mbox{ (mod }b\mbox{)}$ is found. - -\subsection{Choosing a Radix Point} -Using the fixed point representation a modular reduction can be performed with $3n^2$ single precision multiplications. If that were the best -that could be achieved a full division might as well be used in its place. The key to optimizing the reduction is to reduce the precision of -the initial multiplication that finds the quotient. - -Let $a$ represent the number of which the residue is sought. Let $b$ represent the modulus used to find the residue. Let $m$ represent -the number of digits in $b$. For the purposes of this discussion we will assume that the number of digits in $a$ is $2m$. Dividing $a$ by -$b$ is the same as dividing a $2m$ digit integer by a $m$ digit integer. Digits below the $m - 1$'th digit of $a$ will contribute at most a value -of $1$ to the quotient because $\beta^k < b$ for any $0 \le k \le m - 1$. - -Since those digits do not contribute much to the quotient the observation is that they might as well be zero. However, if the digits -``might as well be zero'' they might as well not be there in the first place. Let $q_0 = \lfloor a/\beta^{m-1} \rfloor$ represent the input -with the zeroes trimmed. Now the modular reduction is trimmed to the almost equivalent equation - -\begin{equation} -c = a - b \cdot \lfloor (q_0 \cdot \mu) / \beta^{m+1} \rfloor -\end{equation} - -Note that the original divisor $2^q$ has been replaced with $\beta^{m+1}$. Also note that the exponent on the divisor when added to the amount $q_0$ -was shifted by equals $2m$. If the optimization had not been performed the divisor would have the exponent $2m$ so in the end the exponents -do ``add up''. Using the above equation the quotient $\lfloor (q_0 \cdot \mu) / \beta^{m+1} \rfloor$ can be off from the true quotient by at most -two implying that $0 \le a - b \cdot \lfloor (q_0 \cdot \mu) / \beta^{m+1} \rfloor < 3b$. By first subtracting $b$ times the quotient and then -conditionally subtracting $b$ once or twice the residue is found. - -The quotient is now found using $(m + 1)(m) = m^2 + m$ single precision multiplications and the residue with an additional $m^2$ single -precision multiplications. In total $2m^2 + m$ single precision multiplications are required which is considerably faster than the original -attempt. - -For example, let $\beta = 10$ represent the radix of the digits. Let $b = 9999$ represent the modulus which implies $m = 4$. Let $a = 99929878$ -represent the value of which the residue is desired. In this case $q = 8$ since $10^7 < 9999^2$ meaning that $\mu = \lfloor \beta^{q}/b \rfloor = 10001$. -With the new observation the multiplicand for the quotient is equal to $q_0 = \lfloor a / \beta^{m - 1} \rfloor = 99929$. The quotient is then -$\lfloor (q_0 \cdot \mu) / \beta^{m+1} \rfloor = 9993$. Subtracting $9993b$ from $a$ and the correct residue $a \equiv 9871 \mbox{ (mod }b\mbox{)}$ -is found. - -\subsection{Trimming the Quotient} -So far the reduction algorithm has been optimized from $3m^2$ single precision multiplications down to $2m^2 + m$ single precision multiplications. As -it stands now the algorithm is already fairly fast compared to a full integer division algorithm. However, there is still room for -optimization. - -After the first multiplication inside the quotient ($q_0 \cdot \mu$) the value is shifted right by $m + 1$ places effectively nullifying the lower -half of the product. It would be nice to be able to remove those digits from the product to effectively cut down the number of single precision -multiplications. If the number of digits in the modulus $m$ is far less than $\beta$ a full product is not required for the algorithm to work properly. -In fact the lower $m - 2$ digits will not affect the upper half of the product at all and do not need to be computed. - -The value of $\mu$ is a $m$-digit number and $q_0$ is a $m + 1$ digit number. Using a full multiplier $(m + 1)(m) = m^2 + m$ single precision -multiplications would be required. Using a multiplier that will only produce digits at and above the $m - 1$'th digit reduces the number -of single precision multiplications to ${m^2 + m} \over 2$ single precision multiplications. - -\subsection{Trimming the Residue} -After the quotient has been calculated it is used to reduce the input. As previously noted the algorithm is not exact and it can be off by a small -multiple of the modulus, that is $0 \le a - b \cdot \lfloor (q_0 \cdot \mu) / \beta^{m+1} \rfloor < 3b$. If $b$ is $m$ digits than the -result of reduction equation is a value of at most $m + 1$ digits (\textit{provided $3 < \beta$}) implying that the upper $m - 1$ digits are -implicitly zero. - -The next optimization arises from this very fact. Instead of computing $b \cdot \lfloor (q_0 \cdot \mu) / \beta^{m+1} \rfloor$ using a full -$O(m^2)$ multiplication algorithm only the lower $m+1$ digits of the product have to be computed. Similarly the value of $a$ can -be reduced modulo $\beta^{m+1}$ before the multiple of $b$ is subtracted which simplifes the subtraction as well. A multiplication that produces -only the lower $m+1$ digits requires ${m^2 + 3m - 2} \over 2$ single precision multiplications. - -With both optimizations in place the algorithm is the algorithm Barrett proposed. It requires $m^2 + 2m - 1$ single precision multiplications which -is considerably faster than the straightforward $3m^2$ method. - -\subsection{The Barrett Algorithm} -\newpage\begin{figure}[!here] -\begin{small} -\begin{center} -\begin{tabular}{l} -\hline Algorithm \textbf{mp\_reduce}. \\ -\textbf{Input}. mp\_int $a$, mp\_int $b$ and $\mu = \lfloor \beta^{2m}/b \rfloor$ $(0 \le a < b^2, b > 1)$ \\ -\textbf{Output}. $c \leftarrow a \mbox{ (mod }b\mbox{)}$ \\ -\hline \\ -Let $m$ represent the number of digits in $b$. \\ -1. Make a copy of $a$ and store it in $q$. (\textit{mp\_init\_copy}) \\ -2. $q \leftarrow \lfloor q / \beta^{m - 1} \rfloor$ (\textit{mp\_rshd}) \\ -\\ -Produce the quotient. \\ -3. $q \leftarrow q \cdot \mu$ (\textit{note: only produce digits at or above $m-1$}) \\ -4. $q \leftarrow \lfloor q / \beta^{m + 1} \rfloor$ \\ -\\ -Subtract the multiple of modulus from the input. \\ -5. $c \leftarrow a \mbox{ (mod }\beta^{m+1}\mbox{)}$ (\textit{mp\_mod\_2d}) \\ -6. $q \leftarrow q \cdot b \mbox{ (mod }\beta^{m+1}\mbox{)}$ (\textit{s\_mp\_mul\_digs}) \\ -7. $c \leftarrow c - q$ (\textit{mp\_sub}) \\ -\\ -Add $\beta^{m+1}$ if a carry occured. \\ -8. If $c < 0$ then (\textit{mp\_cmp\_d}) \\ -\hspace{3mm}8.1 $q \leftarrow 1$ (\textit{mp\_set}) \\ -\hspace{3mm}8.2 $q \leftarrow q \cdot \beta^{m+1}$ (\textit{mp\_lshd}) \\ -\hspace{3mm}8.3 $c \leftarrow c + q$ \\ -\\ -Now subtract the modulus if the residue is too large (e.g. quotient too small). \\ -9. While $c \ge b$ do (\textit{mp\_cmp}) \\ -\hspace{3mm}9.1 $c \leftarrow c - b$ \\ -10. Clear $q$. \\ -11. Return(\textit{MP\_OKAY}) \\ -\hline -\end{tabular} -\end{center} -\end{small} -\caption{Algorithm mp\_reduce} -\end{figure} - -\textbf{Algorithm mp\_reduce.} -This algorithm will reduce the input $a$ modulo $b$ in place using the Barrett algorithm. It is loosely based on algorithm 14.42 of HAC -\cite[pp. 602]{HAC} which is based on the paper from Paul Barrett \cite{BARRETT}. The algorithm has several restrictions and assumptions which must be adhered to -for the algorithm to work. - -First the modulus $b$ is assumed to be positive and greater than one. If the modulus were less than or equal to one than subtracting -a multiple of it would either accomplish nothing or actually enlarge the input. The input $a$ must be in the range $0 \le a < b^2$ in order -for the quotient to have enough precision. Technically the algorithm will still work if $a \ge b^2$ but it will take much longer to finish. The -value of $\mu$ is passed as an argument to this algorithm and is assumed to be setup before the algorithm is used. - -Recall that the multiplication for the quotient on step 3 must only produce digits at or above the $m-1$'th position. An algorithm called -$s\_mp\_mul\_high\_digs$ which has not been presented is used to accomplish this task. This optimal algorithm can only be used if the number -of digits in $b$ is very much smaller than $\beta$. - -After the multiple of the modulus has been subtracted from $a$ the residue must be fixed up in case its negative. While it is known that -$a \ge b \cdot \lfloor (q_0 \cdot \mu) / \beta^{m+1} \rfloor$ only the lower $m+1$ digits are being used to compute the residue. In this case -the invariant $\beta^{m+1}$ must be added to the residue to make it positive again. - -The while loop at step 9 will subtract $b$ until the residue is less than $b$. If the algorithm is performed correctly this step is only -performed upto two times. However, if $a \ge b^2$ than it will iterate substantially more times than it should. - -EXAM,bn_mp_reduce.c - -The first multiplication that determines the quotient can be performed by only producing the digits from $m - 1$ and up. This essentially halves -the number of single precision multiplications required. However, the optimization is only safe if $\beta$ is much larger than the number of digits -in the modulus. In the source code this is evaluated on lines @36,if@ to @44,}@ where algorithm s\_mp\_mul\_high\_digs is used when it is -safe to do so. - -\subsection{The Barrett Setup Algorithm} -In order to use algorithm mp\_reduce the value of $\mu$ must be calculated in advance. Ideally this value should be computed once and stored for -future use so that the Barrett algorithm can be used without delay. - -\begin{figure}[!here] -\begin{small} -\begin{center} -\begin{tabular}{l} -\hline Algorithm \textbf{mp\_reduce\_setup}. \\ -\textbf{Input}. mp\_int $a$ ($a > 1$) \\ -\textbf{Output}. $\mu \leftarrow \lfloor \beta^{2m}/a \rfloor$ \\ -\hline \\ -1. $\mu \leftarrow 2^{2 \cdot lg(\beta) \cdot m}$ (\textit{mp\_2expt}) \\ -2. $\mu \leftarrow \lfloor \mu / b \rfloor$ (\textit{mp\_div}) \\ -3. Return(\textit{MP\_OKAY}) \\ -\hline -\end{tabular} -\end{center} -\end{small} -\caption{Algorithm mp\_reduce\_setup} -\end{figure} - -\textbf{Algorithm mp\_reduce\_setup.} -This algorithm computes the reciprocal $\mu$ required for Barrett reduction. First $\beta^{2m}$ is calculated as $2^{2 \cdot lg(\beta) \cdot m}$ which -is equivalent and much faster. The final value is computed by taking the integer quotient of $\lfloor \mu / b \rfloor$. - -EXAM,bn_mp_reduce_setup.c - -This simple routine calculates the reciprocal $\mu$ required by Barrett reduction. Note the extended usage of algorithm mp\_div where the variable -which would received the remainder is passed as NULL. As will be discussed in ~DIVISION~ the division routine allows both the quotient and the -remainder to be passed as NULL meaning to ignore the value. - -\section{The Montgomery Reduction} -Montgomery reduction\footnote{Thanks to Niels Ferguson for his insightful explanation of the algorithm.} \cite{MONT} is by far the most interesting -form of reduction in common use. It computes a modular residue which is not actually equal to the residue of the input yet instead equal to a -residue times a constant. However, as perplexing as this may sound the algorithm is relatively simple and very efficient. - -Throughout this entire section the variable $n$ will represent the modulus used to form the residue. As will be discussed shortly the value of -$n$ must be odd. The variable $x$ will represent the quantity of which the residue is sought. Similar to the Barrett algorithm the input -is restricted to $0 \le x < n^2$. To begin the description some simple number theory facts must be established. - -\textbf{Fact 1.} Adding $n$ to $x$ does not change the residue since in effect it adds one to the quotient $\lfloor x / n \rfloor$. Another way -to explain this is that $n$ (\textit{or multiples of $n$}) is congruent to zero modulo $n$. Adding zero will not change the value of the residue. - -\textbf{Fact 2.} If $x$ is even then performing a division by two in $\Z$ is congruent to $x \cdot 2^{-1} \mbox{ (mod }n\mbox{)}$. Actually -this is an application of the fact that if $x$ is evenly divisible by any $k \in \Z$ then division in $\Z$ will be congruent to -multiplication by $k^{-1}$ modulo $n$. - -From these two simple facts the following simple algorithm can be derived. - -\newpage\begin{figure}[!here] -\begin{small} -\begin{center} -\begin{tabular}{l} -\hline Algorithm \textbf{Montgomery Reduction}. \\ -\textbf{Input}. Integer $x$, $n$ and $k$ \\ -\textbf{Output}. $2^{-k}x \mbox{ (mod }n\mbox{)}$ \\ -\hline \\ -1. for $t$ from $1$ to $k$ do \\ -\hspace{3mm}1.1 If $x$ is odd then \\ -\hspace{6mm}1.1.1 $x \leftarrow x + n$ \\ -\hspace{3mm}1.2 $x \leftarrow x/2$ \\ -2. Return $x$. \\ -\hline -\end{tabular} -\end{center} -\end{small} -\caption{Algorithm Montgomery Reduction} -\end{figure} - -The algorithm reduces the input one bit at a time using the two congruencies stated previously. Inside the loop $n$, which is odd, is -added to $x$ if $x$ is odd. This forces $x$ to be even which allows the division by two in $\Z$ to be congruent to a modular division by two. Since -$x$ is assumed to be initially much larger than $n$ the addition of $n$ will contribute an insignificant magnitude to $x$. Let $r$ represent the -final result of the Montgomery algorithm. If $k > lg(n)$ and $0 \le x < n^2$ then the final result is limited to -$0 \le r < \lfloor x/2^k \rfloor + n$. As a result at most a single subtraction is required to get the residue desired. - -\begin{figure}[here] -\begin{small} -\begin{center} -\begin{tabular}{|c|l|} -\hline \textbf{Step number ($t$)} & \textbf{Result ($x$)} \\ -\hline $1$ & $x + n = 5812$, $x/2 = 2906$ \\ -\hline $2$ & $x/2 = 1453$ \\ -\hline $3$ & $x + n = 1710$, $x/2 = 855$ \\ -\hline $4$ & $x + n = 1112$, $x/2 = 556$ \\ -\hline $5$ & $x/2 = 278$ \\ -\hline $6$ & $x/2 = 139$ \\ -\hline $7$ & $x + n = 396$, $x/2 = 198$ \\ -\hline $8$ & $x/2 = 99$ \\ -\hline -\end{tabular} -\end{center} -\end{small} -\caption{Example of Montgomery Reduction (I)} -\label{fig:MONT1} -\end{figure} - -Consider the example in figure~\ref{fig:MONT1} which reduces $x = 5555$ modulo $n = 257$ when $k = 8$. The final result $r = 99$ which is actually -$2^{-8} \cdot 5555 \mbox{ (mod }257\mbox{)}$ can reveal the residue $x \equiv 158$ by multiplying by $2^8$ modulo $n$. - -Let $k = \lfloor lg(n) \rfloor + 1$ represent the number of bits in $n$. The current algorithm requires $2k^2$ single precision shifts -and $k^2$ single precision additions. At this rate the algorithm is most certainly slower than Barrett reduction and not terribly useful. -Fortunately there exists an alternative representation of the algorithm. - -\begin{figure}[!here] -\begin{small} -\begin{center} -\begin{tabular}{l} -\hline Algorithm \textbf{Montgomery Reduction} (modified I). \\ -\textbf{Input}. Integer $x$, $n$ and $k$ \\ -\textbf{Output}. $2^{-k}x \mbox{ (mod }n\mbox{)}$ \\ -\hline \\ -1. for $t$ from $0$ to $k - 1$ do \\ -\hspace{3mm}1.1 If the $t$'th bit of $x$ is one then \\ -\hspace{6mm}1.1.1 $x \leftarrow x + 2^tn$ \\ -2. Return $x/2^k$. \\ -\hline -\end{tabular} -\end{center} -\end{small} -\caption{Algorithm Montgomery Reduction (modified I)} -\end{figure} - -This algorithm is equivalent since $2^tn$ is a multiple of $n$ and the lower $k$ bits of $x$ are zero by step 2. The number of single -precision shifts has now been reduced from $2k^2$ to $k^2 + k$ which is only a small improvement. - -\begin{figure}[here] -\begin{small} -\begin{center} -\begin{tabular}{|c|l|} -\hline \textbf{Step number ($t$)} & \textbf{Result ($x$)} \\ -\hline $1$ & $x + 2^{0}n = 5812$ \\ -\hline $2$ & $5812$ \\ -\hline $3$ & $x + 2^{2}n = 6840$ \\ -\hline $4$ & $x + 2^{3}n = 8896$ \\ -\hline $5$ & $8896$ \\ -\hline $6$ & $8896$ \\ -\hline $7$ & $x + 2^{6}n = 25344$ \\ -\hline $8$ & $25344$ \\ -\hline -- & $x/2^k = 99$ \\ -\hline -\end{tabular} -\end{center} -\end{small} -\caption{Example of Montgomery Reduction (II)} -\label{fig:MONT2} -\end{figure} - -Figure~\ref{fig:MONT2} demonstrates the modified algorithm reducing $x = 4093$ modulo $n = 257$ with $k = 8$. -With this algorithm a single shift right at the end is the only right shift required to reduce the input instead of $k$ right shifts inside the -loop. Note that for the iterations $t = 2, 5, 6$ and $8$ where the result $x$ is not changed. In those iterations the $t$'th bit of $x$ is -zero and the appropriate multiple of $n$ does not need to be added to force the $t$'th bit of the result to zero. - -\subsection{Digit Based Montgomery Reduction} -Instead of computing the reduction on a bit-by-bit basis it is actually much faster to compute it on digit-by-digit basis. Consider the -previous algorithm re-written to compute the Montgomery reduction in this new fashion. - -\begin{figure}[!here] -\begin{small} -\begin{center} -\begin{tabular}{l} -\hline Algorithm \textbf{Montgomery Reduction} (modified II). \\ -\textbf{Input}. Integer $x$, $n$ and $k$ \\ -\textbf{Output}. $\beta^{-k}x \mbox{ (mod }n\mbox{)}$ \\ -\hline \\ -1. for $t$ from $0$ to $k - 1$ do \\ -\hspace{3mm}1.1 $x \leftarrow x + \mu n \beta^t$ \\ -2. Return $x/\beta^k$. \\ -\hline -\end{tabular} -\end{center} -\end{small} -\caption{Algorithm Montgomery Reduction (modified II)} -\end{figure} - -The value $\mu n \beta^t$ is a multiple of the modulus $n$ meaning that it will not change the residue. If the first digit of -the value $\mu n \beta^t$ equals the negative (modulo $\beta$) of the $t$'th digit of $x$ then the addition will result in a zero digit. This -problem breaks down to solving the following congruency. - -\begin{center} -\begin{tabular}{rcl} -$x_t + \mu n_0$ & $\equiv$ & $0 \mbox{ (mod }\beta\mbox{)}$ \\ -$\mu n_0$ & $\equiv$ & $-x_t \mbox{ (mod }\beta\mbox{)}$ \\ -$\mu$ & $\equiv$ & $-x_t/n_0 \mbox{ (mod }\beta\mbox{)}$ \\ -\end{tabular} -\end{center} - -In each iteration of the loop on step 1 a new value of $\mu$ must be calculated. The value of $-1/n_0 \mbox{ (mod }\beta\mbox{)}$ is used -extensively in this algorithm and should be precomputed. Let $\rho$ represent the negative of the modular inverse of $n_0$ modulo $\beta$. - -For example, let $\beta = 10$ represent the radix. Let $n = 17$ represent the modulus which implies $k = 2$ and $\rho \equiv 7$. Let $x = 33$ -represent the value to reduce. - -\newpage\begin{figure} -\begin{center} -\begin{tabular}{|c|c|c|} -\hline \textbf{Step ($t$)} & \textbf{Value of $x$} & \textbf{Value of $\mu$} \\ -\hline -- & $33$ & --\\ -\hline $0$ & $33 + \mu n = 50$ & $1$ \\ -\hline $1$ & $50 + \mu n \beta = 900$ & $5$ \\ -\hline -\end{tabular} -\end{center} -\caption{Example of Montgomery Reduction} -\end{figure} - -The final result $900$ is then divided by $\beta^k$ to produce the final result $9$. The first observation is that $9 \nequiv x \mbox{ (mod }n\mbox{)}$ -which implies the result is not the modular residue of $x$ modulo $n$. However, recall that the residue is actually multiplied by $\beta^{-k}$ in -the algorithm. To get the true residue the value must be multiplied by $\beta^k$. In this case $\beta^k \equiv 15 \mbox{ (mod }n\mbox{)}$ and -the correct residue is $9 \cdot 15 \equiv 16 \mbox{ (mod }n\mbox{)}$. - -\subsection{Baseline Montgomery Reduction} -The baseline Montgomery reduction algorithm will produce the residue for any size input. It is designed to be a catch-all algororithm for -Montgomery reductions. - -\newpage\begin{figure}[!here] -\begin{small} -\begin{center} -\begin{tabular}{l} -\hline Algorithm \textbf{mp\_montgomery\_reduce}. \\ -\textbf{Input}. mp\_int $x$, mp\_int $n$ and a digit $\rho \equiv -1/n_0 \mbox{ (mod }n\mbox{)}$. \\ -\hspace{11.5mm}($0 \le x < n^2, n > 1, (n, \beta) = 1, \beta^k > n$) \\ -\textbf{Output}. $\beta^{-k}x \mbox{ (mod }n\mbox{)}$ \\ -\hline \\ -1. $digs \leftarrow 2n.used + 1$ \\ -2. If $digs < MP\_ARRAY$ and $m.used < \delta$ then \\ -\hspace{3mm}2.1 Use algorithm fast\_mp\_montgomery\_reduce instead. \\ -\\ -Setup $x$ for the reduction. \\ -3. If $x.alloc < digs$ then grow $x$ to $digs$ digits. \\ -4. $x.used \leftarrow digs$ \\ -\\ -Eliminate the lower $k$ digits. \\ -5. For $ix$ from $0$ to $k - 1$ do \\ -\hspace{3mm}5.1 $\mu \leftarrow x_{ix} \cdot \rho \mbox{ (mod }\beta\mbox{)}$ \\ -\hspace{3mm}5.2 $u \leftarrow 0$ \\ -\hspace{3mm}5.3 For $iy$ from $0$ to $k - 1$ do \\ -\hspace{6mm}5.3.1 $\hat r \leftarrow \mu n_{iy} + x_{ix + iy} + u$ \\ -\hspace{6mm}5.3.2 $x_{ix + iy} \leftarrow \hat r \mbox{ (mod }\beta\mbox{)}$ \\ -\hspace{6mm}5.3.3 $u \leftarrow \lfloor \hat r / \beta \rfloor$ \\ -\hspace{3mm}5.4 While $u > 0$ do \\ -\hspace{6mm}5.4.1 $iy \leftarrow iy + 1$ \\ -\hspace{6mm}5.4.2 $x_{ix + iy} \leftarrow x_{ix + iy} + u$ \\ -\hspace{6mm}5.4.3 $u \leftarrow \lfloor x_{ix+iy} / \beta \rfloor$ \\ -\hspace{6mm}5.4.4 $x_{ix + iy} \leftarrow x_{ix+iy} \mbox{ (mod }\beta\mbox{)}$ \\ -\\ -Divide by $\beta^k$ and fix up as required. \\ -6. $x \leftarrow \lfloor x / \beta^k \rfloor$ \\ -7. If $x \ge n$ then \\ -\hspace{3mm}7.1 $x \leftarrow x - n$ \\ -8. Return(\textit{MP\_OKAY}). \\ -\hline -\end{tabular} -\end{center} -\end{small} -\caption{Algorithm mp\_montgomery\_reduce} -\end{figure} - -\textbf{Algorithm mp\_montgomery\_reduce.} -This algorithm reduces the input $x$ modulo $n$ in place using the Montgomery reduction algorithm. The algorithm is loosely based -on algorithm 14.32 of \cite[pp.601]{HAC} except it merges the multiplication of $\mu n \beta^t$ with the addition in the inner loop. The -restrictions on this algorithm are fairly easy to adapt to. First $0 \le x < n^2$ bounds the input to numbers in the same range as -for the Barrett algorithm. Additionally $n > 1$ will ensure a modular inverse $\rho$ exists. $\rho$ must be calculated in -advance of this algorithm. Finally the variable $k$ is fixed and a pseudonym for $n.used$. - -Step 2 decides whether a faster Montgomery algorithm can be used. It is based on the Comba technique meaning that there are limits on -the size of the input. This algorithm is discussed in ~COMBARED~. - -Step 5 is the main reduction loop of the algorithm. The value of $\mu$ is calculated once per iteration in the outer loop. The inner loop -calculates $x + \mu n \beta^{ix}$ by multiplying $\mu n$ and adding the result to $x$ shifted by $ix$ digits. Both the addition and -multiplication are performed in the same loop to save time and memory. Step 5.4 will handle any additional carries that escape the inner loop. - -Using a quick inspection this algorithm requires $n$ single precision multiplications for the outer loop and $n^2$ single precision multiplications -in the inner loop. In total $n^2 + n$ single precision multiplications which compares favourably to Barrett at $n^2 + 2n - 1$ single precision -multiplications. - -EXAM,bn_mp_montgomery_reduce.c - -This is the baseline implementation of the Montgomery reduction algorithm. Lines @30,digs@ to @35,}@ determine if the Comba based -routine can be used instead. Line @47,mu@ computes the value of $\mu$ for that particular iteration of the outer loop. - -The multiplication $\mu n \beta^{ix}$ is performed in one step in the inner loop. The alias $tmpx$ refers to the $ix$'th digit of $x$ and -the alias $tmpn$ refers to the modulus $n$. - -\subsection{Faster ``Comba'' Montgomery Reduction} -MARK,COMBARED - -The Montgomery reduction requires fewer single precision multiplications than a Barrett reduction, however it is much slower due to the serial -nature of the inner loop. The Barrett reduction algorithm requires two slightly modified multipliers which can be implemented with the Comba -technique. The Montgomery reduction algorithm cannot directly use the Comba technique to any significant advantage since the inner loop calculates -a $k \times 1$ product $k$ times. - -The biggest obstacle is that at the $ix$'th iteration of the outer loop the value of $x_{ix}$ is required to calculate $\mu$. This means the -carries from $0$ to $ix - 1$ must have been propagated upwards to form a valid $ix$'th digit. The solution as it turns out is very simple. -Perform a Comba like multiplier and inside the outer loop just after the inner loop fix up the $ix + 1$'th digit by forwarding the carry. - -With this change in place the Montgomery reduction algorithm can be performed with a Comba style multiplication loop which substantially increases -the speed of the algorithm. - -\newpage\begin{figure}[!here] -\begin{small} -\begin{center} -\begin{tabular}{l} -\hline Algorithm \textbf{fast\_mp\_montgomery\_reduce}. \\ -\textbf{Input}. mp\_int $x$, mp\_int $n$ and a digit $\rho \equiv -1/n_0 \mbox{ (mod }n\mbox{)}$. \\ -\hspace{11.5mm}($0 \le x < n^2, n > 1, (n, \beta) = 1, \beta^k > n$) \\ -\textbf{Output}. $\beta^{-k}x \mbox{ (mod }n\mbox{)}$ \\ -\hline \\ -Place an array of \textbf{MP\_WARRAY} mp\_word variables called $\hat W$ on the stack. \\ -1. if $x.alloc < n.used + 1$ then grow $x$ to $n.used + 1$ digits. \\ -Copy the digits of $x$ into the array $\hat W$ \\ -2. For $ix$ from $0$ to $x.used - 1$ do \\ -\hspace{3mm}2.1 $\hat W_{ix} \leftarrow x_{ix}$ \\ -3. For $ix$ from $x.used$ to $2n.used - 1$ do \\ -\hspace{3mm}3.1 $\hat W_{ix} \leftarrow 0$ \\ -Elimiate the lower $k$ digits. \\ -4. for $ix$ from $0$ to $n.used - 1$ do \\ -\hspace{3mm}4.1 $\mu \leftarrow \hat W_{ix} \cdot \rho \mbox{ (mod }\beta\mbox{)}$ \\ -\hspace{3mm}4.2 For $iy$ from $0$ to $n.used - 1$ do \\ -\hspace{6mm}4.2.1 $\hat W_{iy + ix} \leftarrow \hat W_{iy + ix} + \mu \cdot n_{iy}$ \\ -\hspace{3mm}4.3 $\hat W_{ix + 1} \leftarrow \hat W_{ix + 1} + \lfloor \hat W_{ix} / \beta \rfloor$ \\ -Propagate carries upwards. \\ -5. for $ix$ from $n.used$ to $2n.used + 1$ do \\ -\hspace{3mm}5.1 $\hat W_{ix + 1} \leftarrow \hat W_{ix + 1} + \lfloor \hat W_{ix} / \beta \rfloor$ \\ -Shift right and reduce modulo $\beta$ simultaneously. \\ -6. for $ix$ from $0$ to $n.used + 1$ do \\ -\hspace{3mm}6.1 $x_{ix} \leftarrow \hat W_{ix + n.used} \mbox{ (mod }\beta\mbox{)}$ \\ -Zero excess digits and fixup $x$. \\ -7. if $x.used > n.used + 1$ then do \\ -\hspace{3mm}7.1 for $ix$ from $n.used + 1$ to $x.used - 1$ do \\ -\hspace{6mm}7.1.1 $x_{ix} \leftarrow 0$ \\ -8. $x.used \leftarrow n.used + 1$ \\ -9. Clamp excessive digits of $x$. \\ -10. If $x \ge n$ then \\ -\hspace{3mm}10.1 $x \leftarrow x - n$ \\ -11. Return(\textit{MP\_OKAY}). \\ -\hline -\end{tabular} -\end{center} -\end{small} -\caption{Algorithm fast\_mp\_montgomery\_reduce} -\end{figure} - -\textbf{Algorithm fast\_mp\_montgomery\_reduce.} -This algorithm will compute the Montgomery reduction of $x$ modulo $n$ using the Comba technique. It is on most computer platforms significantly -faster than algorithm mp\_montgomery\_reduce and algorithm mp\_reduce (\textit{Barrett reduction}). The algorithm has the same restrictions -on the input as the baseline reduction algorithm. An additional two restrictions are imposed on this algorithm. The number of digits $k$ in the -the modulus $n$ must not violate $MP\_WARRAY > 2k +1$ and $n < \delta$. When $\beta = 2^{28}$ this algorithm can be used to reduce modulo -a modulus of at most $3,556$ bits in length. - -As in the other Comba reduction algorithms there is a $\hat W$ array which stores the columns of the product. It is initially filled with the -contents of $x$ with the excess digits zeroed. The reduction loop is very similar the to the baseline loop at heart. The multiplication on step -4.1 can be single precision only since $ab \mbox{ (mod }\beta\mbox{)} \equiv (a \mbox{ mod }\beta)(b \mbox{ mod }\beta)$. Some multipliers such -as those on the ARM processors take a variable length time to complete depending on the number of bytes of result it must produce. By performing -a single precision multiplication instead half the amount of time is spent. - -Also note that digit $\hat W_{ix}$ must have the carry from the $ix - 1$'th digit propagated upwards in order for this to work. That is what step -4.3 will do. In effect over the $n.used$ iterations of the outer loop the $n.used$'th lower columns all have the their carries propagated forwards. Note -how the upper bits of those same words are not reduced modulo $\beta$. This is because those values will be discarded shortly and there is no -point. - -Step 5 will propgate the remainder of the carries upwards. On step 6 the columns are reduced modulo $\beta$ and shifted simultaneously as they are -stored in the destination $x$. - -EXAM,bn_fast_mp_montgomery_reduce.c - -The $\hat W$ array is first filled with digits of $x$ on line @49,for@ then the rest of the digits are zeroed on line @54,for@. Both loops share -the same alias variables to make the code easier to read. - -The value of $\mu$ is calculated in an interesting fashion. First the value $\hat W_{ix}$ is reduced modulo $\beta$ and cast to a mp\_digit. This -forces the compiler to use a single precision multiplication and prevents any concerns about loss of precision. Line @101,>>@ fixes the carry -for the next iteration of the loop by propagating the carry from $\hat W_{ix}$ to $\hat W_{ix+1}$. - -The for loop on line @113,for@ propagates the rest of the carries upwards through the columns. The for loop on line @126,for@ reduces the columns -modulo $\beta$ and shifts them $k$ places at the same time. The alias $\_ \hat W$ actually refers to the array $\hat W$ starting at the $n.used$'th -digit, that is $\_ \hat W_{t} = \hat W_{n.used + t}$. - -\subsection{Montgomery Setup} -To calculate the variable $\rho$ a relatively simple algorithm will be required. - -\begin{figure}[!here] -\begin{small} -\begin{center} -\begin{tabular}{l} -\hline Algorithm \textbf{mp\_montgomery\_setup}. \\ -\textbf{Input}. mp\_int $n$ ($n > 1$ and $(n, 2) = 1$) \\ -\textbf{Output}. $\rho \equiv -1/n_0 \mbox{ (mod }\beta\mbox{)}$ \\ -\hline \\ -1. $b \leftarrow n_0$ \\ -2. If $b$ is even return(\textit{MP\_VAL}) \\ -3. $x \leftarrow ((b + 2) \mbox{ AND } 4) << 1) + b$ \\ -4. for $k$ from 0 to $3$ do \\ -\hspace{3mm}4.1 $x \leftarrow x \cdot (2 - bx)$ \\ -5. $\rho \leftarrow \beta - x \mbox{ (mod }\beta\mbox{)}$ \\ -6. Return(\textit{MP\_OKAY}). \\ -\hline -\end{tabular} -\end{center} -\end{small} -\caption{Algorithm mp\_montgomery\_setup} -\end{figure} - -\textbf{Algorithm mp\_montgomery\_setup.} -This algorithm will calculate the value of $\rho$ required within the Montgomery reduction algorithms. It uses a very interesting trick -to calculate $1/n_0$ when $\beta$ is a power of two. - -EXAM,bn_mp_montgomery_setup.c - -This source code computes the value of $\rho$ required to perform Montgomery reduction. It has been modified to avoid performing excess -multiplications when $\beta$ is not the default 28-bits. - -\section{The Diminished Radix Algorithm} -The Diminished Radix method of modular reduction \cite{DRMET} is a fairly clever technique which can be more efficient than either the Barrett -or Montgomery methods for certain forms of moduli. The technique is based on the following simple congruence. - -\begin{equation} -(x \mbox{ mod } n) + k \lfloor x / n \rfloor \equiv x \mbox{ (mod }(n - k)\mbox{)} -\end{equation} - -This observation was used in the MMB \cite{MMB} block cipher to create a diffusion primitive. It used the fact that if $n = 2^{31}$ and $k=1$ that -then a x86 multiplier could produce the 62-bit product and use the ``shrd'' instruction to perform a double-precision right shift. The proof -of the above equation is very simple. First write $x$ in the product form. - -\begin{equation} -x = qn + r -\end{equation} - -Now reduce both sides modulo $(n - k)$. - -\begin{equation} -x \equiv qk + r \mbox{ (mod }(n-k)\mbox{)} -\end{equation} - -The variable $n$ reduces modulo $n - k$ to $k$. By putting $q = \lfloor x/n \rfloor$ and $r = x \mbox{ mod } n$ -into the equation the original congruence is reproduced, thus concluding the proof. The following algorithm is based on this observation. - -\begin{figure}[!here] -\begin{small} -\begin{center} -\begin{tabular}{l} -\hline Algorithm \textbf{Diminished Radix Reduction}. \\ -\textbf{Input}. Integer $x$, $n$, $k$ \\ -\textbf{Output}. $x \mbox{ mod } (n - k)$ \\ -\hline \\ -1. $q \leftarrow \lfloor x / n \rfloor$ \\ -2. $q \leftarrow k \cdot q$ \\ -3. $x \leftarrow x \mbox{ (mod }n\mbox{)}$ \\ -4. $x \leftarrow x + q$ \\ -5. If $x \ge (n - k)$ then \\ -\hspace{3mm}5.1 $x \leftarrow x - (n - k)$ \\ -\hspace{3mm}5.2 Goto step 1. \\ -6. Return $x$ \\ -\hline -\end{tabular} -\end{center} -\end{small} -\caption{Algorithm Diminished Radix Reduction} -\label{fig:DR} -\end{figure} - -This algorithm will reduce $x$ modulo $n - k$ and return the residue. If $0 \le x < (n - k)^2$ then the algorithm will loop almost always -once or twice and occasionally three times. For simplicity sake the value of $x$ is bounded by the following simple polynomial. - -\begin{equation} -0 \le x < n^2 + k^2 - 2nk -\end{equation} - -The true bound is $0 \le x < (n - k - 1)^2$ but this has quite a few more terms. The value of $q$ after step 1 is bounded by the following. - -\begin{equation} -q < n - 2k - k^2/n -\end{equation} - -Since $k^2$ is going to be considerably smaller than $n$ that term will always be zero. The value of $x$ after step 3 is bounded trivially as -$0 \le x < n$. By step four the sum $x + q$ is bounded by - -\begin{equation} -0 \le q + x < (k + 1)n - 2k^2 - 1 -\end{equation} - -With a second pass $q$ will be loosely bounded by $0 \le q < k^2$ after step 2 while $x$ will still be loosely bounded by $0 \le x < n$ after step 3. After the second pass it is highly unlike that the -sum in step 4 will exceed $n - k$. In practice fewer than three passes of the algorithm are required to reduce virtually every input in the -range $0 \le x < (n - k - 1)^2$. - -\begin{figure} -\begin{small} -\begin{center} -\begin{tabular}{|l|} -\hline -$x = 123456789, n = 256, k = 3$ \\ -\hline $q \leftarrow \lfloor x/n \rfloor = 482253$ \\ -$q \leftarrow q*k = 1446759$ \\ -$x \leftarrow x \mbox{ mod } n = 21$ \\ -$x \leftarrow x + q = 1446780$ \\ -$x \leftarrow x - (n - k) = 1446527$ \\ -\hline -$q \leftarrow \lfloor x/n \rfloor = 5650$ \\ -$q \leftarrow q*k = 16950$ \\ -$x \leftarrow x \mbox{ mod } n = 127$ \\ -$x \leftarrow x + q = 17077$ \\ -$x \leftarrow x - (n - k) = 16824$ \\ -\hline -$q \leftarrow \lfloor x/n \rfloor = 65$ \\ -$q \leftarrow q*k = 195$ \\ -$x \leftarrow x \mbox{ mod } n = 184$ \\ -$x \leftarrow x + q = 379$ \\ -$x \leftarrow x - (n - k) = 126$ \\ -\hline -\end{tabular} -\end{center} -\end{small} -\caption{Example Diminished Radix Reduction} -\label{fig:EXDR} -\end{figure} - -Figure~\ref{fig:EXDR} demonstrates the reduction of $x = 123456789$ modulo $n - k = 253$ when $n = 256$ and $k = 3$. Note that even while $x$ -is considerably larger than $(n - k - 1)^2 = 63504$ the algorithm still converges on the modular residue exceedingly fast. In this case only -three passes were required to find the residue $x \equiv 126$. - - -\subsection{Choice of Moduli} -On the surface this algorithm looks like a very expensive algorithm. It requires a couple of subtractions followed by multiplication and other -modular reductions. The usefulness of this algorithm becomes exceedingly clear when an appropriate moduli is chosen. - -Division in general is a very expensive operation to perform. The one exception is when the division is by a power of the radix of representation used. -Division by ten for example is simple for pencil and paper mathematics since it amounts to shifting the decimal place to the right. Similarly division -by two (\textit{or powers of two}) is very simple for binary computers to perform. It would therefore seem logical to choose $n$ of the form $2^p$ -which would imply that $\lfloor x / n \rfloor$ is a simple shift of $x$ right $p$ bits. - -However, there is one operation related to division of power of twos that is even faster than this. If $n = \beta^p$ then the division may be -performed by moving whole digits to the right $p$ places. In practice division by $\beta^p$ is much faster than division by $2^p$ for any $p$. -Also with the choice of $n = \beta^p$ reducing $x$ modulo $n$ requires zeroing the digits above the $p-1$'th digit of $x$. - -Throughout the next section the term ``restricted modulus'' will refer to a modulus of the form $\beta^p - k$ where as the term ``unrestricted -modulus'' will refer to a modulus of the form $2^p - k$. The word ``restricted'' in this case refers to the fact that it is based on the -$2^p$ logic except $p$ must be a multiple of $lg(\beta)$. - -\subsection{Choice of $k$} -Now that division and reduction (\textit{step 1 and 3 of figure~\ref{fig:DR}}) have been optimized to simple digit operations the multiplication by $k$ -in step 2 is the most expensive operation. Fortunately the choice of $k$ is not terribly limited. For all intents and purposes it might -as well be a single digit. The smaller the value of $k$ is the faster the algorithm will be. - -\subsection{Restricted Diminished Radix Reduction} -The restricted Diminished Radix algorithm can quickly reduce an input modulo a modulus of the form $n = \beta^p - k$. This algorithm can reduce -an input $x$ within the range $0 \le x < n^2$ using only a couple passes of the algorithm demonstrated in figure~\ref{fig:DR}. The implementation -of this algorithm has been optimized to avoid additional overhead associated with a division by $\beta^p$, the multiplication by $k$ or the addition -of $x$ and $q$. The resulting algorithm is very efficient and can lead to substantial improvements over Barrett and Montgomery reduction when modular -exponentiations are performed. - -\newpage\begin{figure}[!here] -\begin{small} -\begin{center} -\begin{tabular}{l} -\hline Algorithm \textbf{mp\_dr\_reduce}. \\ -\textbf{Input}. mp\_int $x$, $n$ and a mp\_digit $k = \beta - n_0$ \\ -\hspace{11.5mm}($0 \le x < n^2$, $n > 1$, $0 < k < \beta$) \\ -\textbf{Output}. $x \mbox{ mod } n$ \\ -\hline \\ -1. $m \leftarrow n.used$ \\ -2. If $x.alloc < 2m$ then grow $x$ to $2m$ digits. \\ -3. $\mu \leftarrow 0$ \\ -4. for $i$ from $0$ to $m - 1$ do \\ -\hspace{3mm}4.1 $\hat r \leftarrow k \cdot x_{m+i} + x_{i} + \mu$ \\ -\hspace{3mm}4.2 $x_{i} \leftarrow \hat r \mbox{ (mod }\beta\mbox{)}$ \\ -\hspace{3mm}4.3 $\mu \leftarrow \lfloor \hat r / \beta \rfloor$ \\ -5. $x_{m} \leftarrow \mu$ \\ -6. for $i$ from $m + 1$ to $x.used - 1$ do \\ -\hspace{3mm}6.1 $x_{i} \leftarrow 0$ \\ -7. Clamp excess digits of $x$. \\ -8. If $x \ge n$ then \\ -\hspace{3mm}8.1 $x \leftarrow x - n$ \\ -\hspace{3mm}8.2 Goto step 3. \\ -9. Return(\textit{MP\_OKAY}). \\ -\hline -\end{tabular} -\end{center} -\end{small} -\caption{Algorithm mp\_dr\_reduce} -\end{figure} - -\textbf{Algorithm mp\_dr\_reduce.} -This algorithm will perform the Dimished Radix reduction of $x$ modulo $n$. It has similar restrictions to that of the Barrett reduction -with the addition that $n$ must be of the form $n = \beta^m - k$ where $0 < k <\beta$. - -This algorithm essentially implements the pseudo-code in figure~\ref{fig:DR} except with a slight optimization. The division by $\beta^m$, multiplication by $k$ -and addition of $x \mbox{ mod }\beta^m$ are all performed simultaneously inside the loop on step 4. The division by $\beta^m$ is emulated by accessing -the term at the $m+i$'th position which is subsequently multiplied by $k$ and added to the term at the $i$'th position. After the loop the $m$'th -digit is set to the carry and the upper digits are zeroed. Steps 5 and 6 emulate the reduction modulo $\beta^m$ that should have happend to -$x$ before the addition of the multiple of the upper half. - -At step 8 if $x$ is still larger than $n$ another pass of the algorithm is required. First $n$ is subtracted from $x$ and then the algorithm resumes -at step 3. - -EXAM,bn_mp_dr_reduce.c - -The first step is to grow $x$ as required to $2m$ digits since the reduction is performed in place on $x$. The label on line @49,top:@ is where -the algorithm will resume if further reduction passes are required. In theory it could be placed at the top of the function however, the size of -the modulus and question of whether $x$ is large enough are invariant after the first pass meaning that it would be a waste of time. - -The aliases $tmpx1$ and $tmpx2$ refer to the digits of $x$ where the latter is offset by $m$ digits. By reading digits from $x$ offset by $m$ digits -a division by $\beta^m$ can be simulated virtually for free. The loop on line @61,for@ performs the bulk of the work (\textit{corresponds to step 4 of algorithm 7.11}) -in this algorithm. - -By line @68,mu@ the pointer $tmpx1$ points to the $m$'th digit of $x$ which is where the final carry will be placed. Similarly by line @71,for@ the -same pointer will point to the $m+1$'th digit where the zeroes will be placed. - -Since the algorithm is only valid if both $x$ and $n$ are greater than zero an unsigned comparison suffices to determine if another pass is required. -With the same logic at line @82,sub@ the value of $x$ is known to be greater than or equal to $n$ meaning that an unsigned subtraction can be used -as well. Since the destination of the subtraction is the larger of the inputs the call to algorithm s\_mp\_sub cannot fail and the return code -does not need to be checked. - -\subsubsection{Setup} -To setup the restricted Diminished Radix algorithm the value $k = \beta - n_0$ is required. This algorithm is not really complicated but provided for -completeness. - -\begin{figure}[!here] -\begin{small} -\begin{center} -\begin{tabular}{l} -\hline Algorithm \textbf{mp\_dr\_setup}. \\ -\textbf{Input}. mp\_int $n$ \\ -\textbf{Output}. $k = \beta - n_0$ \\ -\hline \\ -1. $k \leftarrow \beta - n_0$ \\ -\hline -\end{tabular} -\end{center} -\end{small} -\caption{Algorithm mp\_dr\_setup} -\end{figure} - -EXAM,bn_mp_dr_setup.c - -\subsubsection{Modulus Detection} -Another algorithm which will be useful is the ability to detect a restricted Diminished Radix modulus. An integer is said to be -of restricted Diminished Radix form if all of the digits are equal to $\beta - 1$ except the trailing digit which may be any value. - -\begin{figure}[!here] -\begin{small} -\begin{center} -\begin{tabular}{l} -\hline Algorithm \textbf{mp\_dr\_is\_modulus}. \\ -\textbf{Input}. mp\_int $n$ \\ -\textbf{Output}. $1$ if $n$ is in D.R form, $0$ otherwise \\ -\hline -1. If $n.used < 2$ then return($0$). \\ -2. for $ix$ from $1$ to $n.used - 1$ do \\ -\hspace{3mm}2.1 If $n_{ix} \ne \beta - 1$ return($0$). \\ -3. Return($1$). \\ -\hline -\end{tabular} -\end{center} -\end{small} -\caption{Algorithm mp\_dr\_is\_modulus} -\end{figure} - -\textbf{Algorithm mp\_dr\_is\_modulus.} -This algorithm determines if a value is in Diminished Radix form. Step 1 rejects obvious cases where fewer than two digits are -in the mp\_int. Step 2 tests all but the first digit to see if they are equal to $\beta - 1$. If the algorithm manages to get to -step 3 then $n$ must of Diminished Radix form. - -EXAM,bn_mp_dr_is_modulus.c - -\subsection{Unrestricted Diminished Radix Reduction} -The unrestricted Diminished Radix algorithm allows modular reductions to be performed when the modulus is of the form $2^p - k$. This algorithm -is a straightforward adaptation of algorithm~\ref{fig:DR}. - -In general the restricted Diminished Radix reduction algorithm is much faster since it has considerably lower overhead. However, this new -algorithm is much faster than either Montgomery or Barrett reduction when the moduli are of the appropriate form. - -\begin{figure}[!here] -\begin{small} -\begin{center} -\begin{tabular}{l} -\hline Algorithm \textbf{mp\_reduce\_2k}. \\ -\textbf{Input}. mp\_int $a$ and $n$. mp\_digit $k$ \\ -\hspace{11.5mm}($a \ge 0$, $n > 1$, $0 < k < \beta$, $n + k$ is a power of two) \\ -\textbf{Output}. $a \mbox{ (mod }n\mbox{)}$ \\ -\hline -1. $p \leftarrow \lceil lg(n) \rceil$ (\textit{mp\_count\_bits}) \\ -2. While $a \ge n$ do \\ -\hspace{3mm}2.1 $q \leftarrow \lfloor a / 2^p \rfloor$ (\textit{mp\_div\_2d}) \\ -\hspace{3mm}2.2 $a \leftarrow a \mbox{ (mod }2^p\mbox{)}$ (\textit{mp\_mod\_2d}) \\ -\hspace{3mm}2.3 $q \leftarrow q \cdot k$ (\textit{mp\_mul\_d}) \\ -\hspace{3mm}2.4 $a \leftarrow a - q$ (\textit{s\_mp\_sub}) \\ -\hspace{3mm}2.5 If $a \ge n$ then do \\ -\hspace{6mm}2.5.1 $a \leftarrow a - n$ \\ -3. Return(\textit{MP\_OKAY}). \\ -\hline -\end{tabular} -\end{center} -\end{small} -\caption{Algorithm mp\_reduce\_2k} -\end{figure} - -\textbf{Algorithm mp\_reduce\_2k.} -This algorithm quickly reduces an input $a$ modulo an unrestricted Diminished Radix modulus $n$. Division by $2^p$ is emulated with a right -shift which makes the algorithm fairly inexpensive to use. - -EXAM,bn_mp_reduce_2k.c - -The algorithm mp\_count\_bits calculates the number of bits in an mp\_int which is used to find the initial value of $p$. The call to mp\_div\_2d -on line @31,mp_div_2d@ calculates both the quotient $q$ and the remainder $a$ required. By doing both in a single function call the code size -is kept fairly small. The multiplication by $k$ is only performed if $k > 1$. This allows reductions modulo $2^p - 1$ to be performed without -any multiplications. - -The unsigned s\_mp\_add, mp\_cmp\_mag and s\_mp\_sub are used in place of their full sign counterparts since the inputs are only valid if they are -positive. By using the unsigned versions the overhead is kept to a minimum. - -\subsubsection{Unrestricted Setup} -To setup this reduction algorithm the value of $k = 2^p - n$ is required. - -\begin{figure}[!here] -\begin{small} -\begin{center} -\begin{tabular}{l} -\hline Algorithm \textbf{mp\_reduce\_2k\_setup}. \\ -\textbf{Input}. mp\_int $n$ \\ -\textbf{Output}. $k = 2^p - n$ \\ -\hline -1. $p \leftarrow \lceil lg(n) \rceil$ (\textit{mp\_count\_bits}) \\ -2. $x \leftarrow 2^p$ (\textit{mp\_2expt}) \\ -3. $x \leftarrow x - n$ (\textit{mp\_sub}) \\ -4. $k \leftarrow x_0$ \\ -5. Return(\textit{MP\_OKAY}). \\ -\hline -\end{tabular} -\end{center} -\end{small} -\caption{Algorithm mp\_reduce\_2k\_setup} -\end{figure} - -\textbf{Algorithm mp\_reduce\_2k\_setup.} -This algorithm computes the value of $k$ required for the algorithm mp\_reduce\_2k. By making a temporary variable $x$ equal to $2^p$ a subtraction -is sufficient to solve for $k$. Alternatively if $n$ has more than one digit the value of $k$ is simply $\beta - n_0$. - -EXAM,bn_mp_reduce_2k_setup.c - -\subsubsection{Unrestricted Detection} -An integer $n$ is a valid unrestricted Diminished Radix modulus if either of the following are true. - -\begin{enumerate} -\item The number has only one digit. -\item The number has more than one digit and every bit from the $\beta$'th to the most significant is one. -\end{enumerate} - -If either condition is true than there is a power of two namely $2^p$ such that $0 < 2^p - n < \beta$. If the input is only -one digit than it will always be of the correct form. Otherwise all of the bits above the first digit must be one. This arises from the fact -that there will be value of $k$ that when added to the modulus causes a carry in the first digit which propagates all the way to the most -significant bit. The resulting sum will be a power of two. - -\begin{figure}[!here] -\begin{small} -\begin{center} -\begin{tabular}{l} -\hline Algorithm \textbf{mp\_reduce\_is\_2k}. \\ -\textbf{Input}. mp\_int $n$ \\ -\textbf{Output}. $1$ if of proper form, $0$ otherwise \\ -\hline -1. If $n.used = 0$ then return($0$). \\ -2. If $n.used = 1$ then return($1$). \\ -3. $p \leftarrow \rceil lg(n) \lceil$ (\textit{mp\_count\_bits}) \\ -4. for $x$ from $lg(\beta)$ to $p$ do \\ -\hspace{3mm}4.1 If the ($x \mbox{ mod }lg(\beta)$)'th bit of the $\lfloor x / lg(\beta) \rfloor$ of $n$ is zero then return($0$). \\ -5. Return($1$). \\ -\hline -\end{tabular} -\end{center} -\end{small} -\caption{Algorithm mp\_reduce\_is\_2k} -\end{figure} - -\textbf{Algorithm mp\_reduce\_is\_2k.} -This algorithm quickly determines if a modulus is of the form required for algorithm mp\_reduce\_2k to function properly. - -EXAM,bn_mp_reduce_is_2k.c - - - -\section{Algorithm Comparison} -So far three very different algorithms for modular reduction have been discussed. Each of the algorithms have their own strengths and weaknesses -that makes having such a selection very useful. The following table sumarizes the three algorithms along with comparisons of work factors. Since -all three algorithms have the restriction that $0 \le x < n^2$ and $n > 1$ those limitations are not included in the table. - -\begin{center} -\begin{small} -\begin{tabular}{|c|c|c|c|c|c|} -\hline \textbf{Method} & \textbf{Work Required} & \textbf{Limitations} & \textbf{$m = 8$} & \textbf{$m = 32$} & \textbf{$m = 64$} \\ -\hline Barrett & $m^2 + 2m - 1$ & None & $79$ & $1087$ & $4223$ \\ -\hline Montgomery & $m^2 + m$ & $n$ must be odd & $72$ & $1056$ & $4160$ \\ -\hline D.R. & $2m$ & $n = \beta^m - k$ & $16$ & $64$ & $128$ \\ -\hline -\end{tabular} -\end{small} -\end{center} - -In theory Montgomery and Barrett reductions would require roughly the same amount of time to complete. However, in practice since Montgomery -reduction can be written as a single function with the Comba technique it is much faster. Barrett reduction suffers from the overhead of -calling the half precision multipliers, addition and division by $\beta$ algorithms. - -For almost every cryptographic algorithm Montgomery reduction is the algorithm of choice. The one set of algorithms where Diminished Radix reduction truly -shines are based on the discrete logarithm problem such as Diffie-Hellman \cite{DH} and ElGamal \cite{ELGAMAL}. In these algorithms -primes of the form $\beta^m - k$ can be found and shared amongst users. These primes will allow the Diminished Radix algorithm to be used in -modular exponentiation to greatly speed up the operation. - - - -\section*{Exercises} -\begin{tabular}{cl} -$\left [ 3 \right ]$ & Prove that the ``trick'' in algorithm mp\_montgomery\_setup actually \\ - & calculates the correct value of $\rho$. \\ - & \\ -$\left [ 2 \right ]$ & Devise an algorithm to reduce modulo $n + k$ for small $k$ quickly. \\ - & \\ -$\left [ 4 \right ]$ & Prove that the pseudo-code algorithm ``Diminished Radix Reduction'' \\ - & (\textit{figure~\ref{fig:DR}}) terminates. Also prove the probability that it will \\ - & terminate within $1 \le k \le 10$ iterations. \\ - & \\ -\end{tabular} - - -\chapter{Exponentiation} -Exponentiation is the operation of raising one variable to the power of another, for example, $a^b$. A variant of exponentiation, computed -in a finite field or ring, is called modular exponentiation. This latter style of operation is typically used in public key -cryptosystems such as RSA and Diffie-Hellman. The ability to quickly compute modular exponentiations is of great benefit to any -such cryptosystem and many methods have been sought to speed it up. - -\section{Exponentiation Basics} -A trivial algorithm would simply multiply $a$ against itself $b - 1$ times to compute the exponentiation desired. However, as $b$ grows in size -the number of multiplications becomes prohibitive. Imagine what would happen if $b$ $\approx$ $2^{1024}$ as is the case when computing an RSA signature -with a $1024$-bit key. Such a calculation could never be completed as it would take simply far too long. - -Fortunately there is a very simple algorithm based on the laws of exponents. Recall that $lg_a(a^b) = b$ and that $lg_a(a^ba^c) = b + c$ which -are two trivial relationships between the base and the exponent. Let $b_i$ represent the $i$'th bit of $b$ starting from the least -significant bit. If $b$ is a $k$-bit integer than the following equation is true. - -\begin{equation} -a^b = \prod_{i=0}^{k-1} a^{2^i \cdot b_i} -\end{equation} - -By taking the base $a$ logarithm of both sides of the equation the following equation is the result. - -\begin{equation} -b = \sum_{i=0}^{k-1}2^i \cdot b_i -\end{equation} - -The term $a^{2^i}$ can be found from the $i - 1$'th term by squaring the term since $\left ( a^{2^i} \right )^2$ is equal to -$a^{2^{i+1}}$. This observation forms the basis of essentially all fast exponentiation algorithms. It requires $k$ squarings and on average -$k \over 2$ multiplications to compute the result. This is indeed quite an improvement over simply multiplying by $a$ a total of $b-1$ times. - -While this current method is a considerable speed up there are further improvements to be made. For example, the $a^{2^i}$ term does not need to -be computed in an auxilary variable. Consider the following equivalent algorithm. - -\begin{figure}[!here] -\begin{small} -\begin{center} -\begin{tabular}{l} -\hline Algorithm \textbf{Left to Right Exponentiation}. \\ -\textbf{Input}. Integer $a$, $b$ and $k$ \\ -\textbf{Output}. $c = a^b$ \\ -\hline \\ -1. $c \leftarrow 1$ \\ -2. for $i$ from $k - 1$ to $0$ do \\ -\hspace{3mm}2.1 $c \leftarrow c^2$ \\ -\hspace{3mm}2.2 $c \leftarrow c \cdot a^{b_i}$ \\ -3. Return $c$. \\ -\hline -\end{tabular} -\end{center} -\end{small} -\caption{Left to Right Exponentiation} -\label{fig:LTOR} -\end{figure} - -This algorithm starts from the most significant bit and works towards the least significant bit. When the $i$'th bit of $b$ is set $a$ is -multiplied against the current product. In each iteration the product is squared which doubles the exponent of the individual terms of the -product. - -For example, let $b = 101100_2 \equiv 44_{10}$. The following chart demonstrates the actions of the algorithm. - -\newpage\begin{figure} -\begin{center} -\begin{tabular}{|c|c|} -\hline \textbf{Value of $i$} & \textbf{Value of $c$} \\ -\hline - & $1$ \\ -\hline $5$ & $a$ \\ -\hline $4$ & $a^2$ \\ -\hline $3$ & $a^4 \cdot a$ \\ -\hline $2$ & $a^8 \cdot a^2 \cdot a$ \\ -\hline $1$ & $a^{16} \cdot a^4 \cdot a^2$ \\ -\hline $0$ & $a^{32} \cdot a^8 \cdot a^4$ \\ -\hline -\end{tabular} -\end{center} -\caption{Example of Left to Right Exponentiation} -\end{figure} - -When the product $a^{32} \cdot a^8 \cdot a^4$ is simplified it is equal $a^{44}$ which is the desired exponentiation. This particular algorithm is -called ``Left to Right'' because it reads the exponent in that order. All of the exponentiation algorithms that will be presented are of this nature. - -\subsection{Single Digit Exponentiation} -The first algorithm in the series of exponentiation algorithms will be an unbounded algorithm where the exponent is a single digit. It is intended -to be used when a small power of an input is required (\textit{e.g. $a^5$}). It is faster than simply multiplying $b - 1$ times for all values of -$b$ that are greater than three. - -\newpage\begin{figure}[!here] -\begin{small} -\begin{center} -\begin{tabular}{l} -\hline Algorithm \textbf{mp\_expt\_d}. \\ -\textbf{Input}. mp\_int $a$ and mp\_digit $b$ \\ -\textbf{Output}. $c = a^b$ \\ -\hline \\ -1. $g \leftarrow a$ (\textit{mp\_init\_copy}) \\ -2. $c \leftarrow 1$ (\textit{mp\_set}) \\ -3. for $x$ from 1 to $lg(\beta)$ do \\ -\hspace{3mm}3.1 $c \leftarrow c^2$ (\textit{mp\_sqr}) \\ -\hspace{3mm}3.2 If $b$ AND $2^{lg(\beta) - 1} \ne 0$ then \\ -\hspace{6mm}3.2.1 $c \leftarrow c \cdot g$ (\textit{mp\_mul}) \\ -\hspace{3mm}3.3 $b \leftarrow b << 1$ \\ -4. Clear $g$. \\ -5. Return(\textit{MP\_OKAY}). \\ -\hline -\end{tabular} -\end{center} -\end{small} -\caption{Algorithm mp\_expt\_d} -\end{figure} - -\textbf{Algorithm mp\_expt\_d.} -This algorithm computes the value of $a$ raised to the power of a single digit $b$. It uses the left to right exponentiation algorithm to -quickly compute the exponentiation. It is loosely based on algorithm 14.79 of HAC \cite[pp. 615]{HAC} with the difference that the -exponent is a fixed width. - -A copy of $a$ is made first to allow destination variable $c$ be the same as the source variable $a$. The result is set to the initial value of -$1$ in the subsequent step. - -Inside the loop the exponent is read from the most significant bit first down to the least significant bit. First $c$ is invariably squared -on step 3.1. In the following step if the most significant bit of $b$ is one the copy of $a$ is multiplied against $c$. The value -of $b$ is shifted left one bit to make the next bit down from the most signficant bit the new most significant bit. In effect each -iteration of the loop moves the bits of the exponent $b$ upwards to the most significant location. - -EXAM,bn_mp_expt_d.c - --- Some note later. - -\section{$k$-ary Exponentiation} -When calculating an exponentiation the most time consuming bottleneck is the multiplications which are in general a small factor -slower than squaring. Recall from the previous algorithm that $b_{i}$ refers to the $i$'th bit of the exponent $b$. Suppose instead it referred to -the $i$'th $k$-bit digit of the exponent of $b$. For $k = 1$ the definitions are synonymous and for $k > 1$ algorithm~\ref{fig:KARY} -computes the same exponentiation. A group of $k$ bits from the exponent is called a \textit{window}. That is it is a small window on only a -portion of the entire exponent. Consider the following modification to the basic left to right exponentiation algorithm. - -\begin{figure}[!here] -\begin{small} -\begin{center} -\begin{tabular}{l} -\hline Algorithm \textbf{$k$-ary Exponentiation}. \\ -\textbf{Input}. Integer $a$, $b$, $k$ and $t$ \\ -\textbf{Output}. $c = a^b$ \\ -\hline \\ -1. $c \leftarrow 1$ \\ -2. for $i$ from $t - 1$ to $0$ do \\ -\hspace{3mm}2.1 $c \leftarrow c^{2^k} $ \\ -\hspace{3mm}2.2 Extract the $i$'th $k$-bit word from $b$ and store it in $g$. \\ -\hspace{3mm}2.3 $c \leftarrow c \cdot a^g$ \\ -3. Return $c$. \\ -\hline -\end{tabular} -\end{center} -\end{small} -\caption{$k$-ary Exponentiation} -\label{fig:KARY} -\end{figure} - -The squaring on step 2.1 can be calculated by squaring the value $c$ successively $k$ times. If the values of $a^g$ for $0 < g < 2^k$ have been -precomputed this algorithm requires only $t$ multiplications and $tk$ squarings. The table can be generated with $2^{k - 1} - 1$ squarings and -$2^{k - 1} + 1$ multiplications. This algorithm assumes that the number of bits in the exponent is evenly divisible by $k$. -However, when it is not the remaining $0 < x \le k - 1$ bits can be handled with algorithm~\ref{fig:LTOR}. - -Suppose $k = 4$ and $t = 100$. This modified algorithm will require $109$ multiplications and $408$ squarings to compute the exponentiation. The -original algorithm would on average have required $200$ multiplications and $400$ squrings to compute the same value. The total number of squarings -has increased slightly but the number of multiplications has nearly halved. - -\subsection{Optimal Values of $k$} -An optimal value of $k$ will minimize $2^{k} + \lceil n / k \rceil + n - 1$ for a fixed number of bits in the exponent $n$. The simplest -approach is to brute force search amongst the values $k = 2, 3, \ldots, 8$ for the lowest result. Table~\ref{fig:OPTK} lists optimal values of $k$ -for various exponent sizes and compares the number of multiplication and squarings required against algorithm~\ref{fig:LTOR}. - -\begin{figure}[here] -\begin{center} -\begin{small} -\begin{tabular}{|c|c|c|c|c|c|} -\hline \textbf{Exponent (bits)} & \textbf{Optimal $k$} & \textbf{Work at $k$} & \textbf{Work with ~\ref{fig:LTOR}} \\ -\hline $16$ & $2$ & $27$ & $24$ \\ -\hline $32$ & $3$ & $49$ & $48$ \\ -\hline $64$ & $3$ & $92$ & $96$ \\ -\hline $128$ & $4$ & $175$ & $192$ \\ -\hline $256$ & $4$ & $335$ & $384$ \\ -\hline $512$ & $5$ & $645$ & $768$ \\ -\hline $1024$ & $6$ & $1257$ & $1536$ \\ -\hline $2048$ & $6$ & $2452$ & $3072$ \\ -\hline $4096$ & $7$ & $4808$ & $6144$ \\ -\hline -\end{tabular} -\end{small} -\end{center} -\caption{Optimal Values of $k$ for $k$-ary Exponentiation} -\label{fig:OPTK} -\end{figure} - -\subsection{Sliding-Window Exponentiation} -A simple modification to the previous algorithm is only generate the upper half of the table in the range $2^{k-1} \le g < 2^k$. Essentially -this is a table for all values of $g$ where the most significant bit of $g$ is a one. However, in order for this to be allowed in the -algorithm values of $g$ in the range $0 \le g < 2^{k-1}$ must be avoided. - -Table~\ref{fig:OPTK2} lists optimal values of $k$ for various exponent sizes and compares the work required against algorithm~\ref{fig:KARY}. - -\begin{figure}[here] -\begin{center} -\begin{small} -\begin{tabular}{|c|c|c|c|c|c|} -\hline \textbf{Exponent (bits)} & \textbf{Optimal $k$} & \textbf{Work at $k$} & \textbf{Work with ~\ref{fig:KARY}} \\ -\hline $16$ & $3$ & $24$ & $27$ \\ -\hline $32$ & $3$ & $45$ & $49$ \\ -\hline $64$ & $4$ & $87$ & $92$ \\ -\hline $128$ & $4$ & $167$ & $175$ \\ -\hline $256$ & $5$ & $322$ & $335$ \\ -\hline $512$ & $6$ & $628$ & $645$ \\ -\hline $1024$ & $6$ & $1225$ & $1257$ \\ -\hline $2048$ & $7$ & $2403$ & $2452$ \\ -\hline $4096$ & $8$ & $4735$ & $4808$ \\ -\hline -\end{tabular} -\end{small} -\end{center} -\caption{Optimal Values of $k$ for Sliding Window Exponentiation} -\label{fig:OPTK2} -\end{figure} - -\newpage\begin{figure}[!here] -\begin{small} -\begin{center} -\begin{tabular}{l} -\hline Algorithm \textbf{Sliding Window $k$-ary Exponentiation}. \\ -\textbf{Input}. Integer $a$, $b$, $k$ and $t$ \\ -\textbf{Output}. $c = a^b$ \\ -\hline \\ -1. $c \leftarrow 1$ \\ -2. for $i$ from $t - 1$ to $0$ do \\ -\hspace{3mm}2.1 If the $i$'th bit of $b$ is a zero then \\ -\hspace{6mm}2.1.1 $c \leftarrow c^2$ \\ -\hspace{3mm}2.2 else do \\ -\hspace{6mm}2.2.1 $c \leftarrow c^{2^k}$ \\ -\hspace{6mm}2.2.2 Extract the $k$ bits from $(b_{i}b_{i-1}\ldots b_{i-(k-1)})$ and store it in $g$. \\ -\hspace{6mm}2.2.3 $c \leftarrow c \cdot a^g$ \\ -\hspace{6mm}2.2.4 $i \leftarrow i - k$ \\ -3. Return $c$. \\ -\hline -\end{tabular} -\end{center} -\end{small} -\caption{Sliding Window $k$-ary Exponentiation} -\end{figure} - -Similar to the previous algorithm this algorithm must have a special handler when fewer than $k$ bits are left in the exponent. While this -algorithm requires the same number of squarings it can potentially have fewer multiplications. The pre-computed table $a^g$ is also half -the size as the previous table. - -Consider the exponent $b = 111101011001000_2 \equiv 31432_{10}$ with $k = 3$ using both algorithms. The first algorithm will divide the exponent up as -the following five $3$-bit words $b \equiv \left ( 111, 101, 011, 001, 000 \right )_{2}$. The second algorithm will break the -exponent as $b \equiv \left ( 111, 101, 0, 110, 0, 100, 0 \right )_{2}$. The single digit $0$ in the second representation are where -a single squaring took place instead of a squaring and multiplication. In total the first method requires $10$ multiplications and $18$ -squarings. The second method requires $8$ multiplications and $18$ squarings. - -In general the sliding window method is never slower than the generic $k$-ary method and often it is slightly faster. - -\section{Modular Exponentiation} - -Modular exponentiation is essentially computing the power of a base within a finite field or ring. For example, computing -$d \equiv a^b \mbox{ (mod }c\mbox{)}$ is a modular exponentiation. Instead of first computing $a^b$ and then reducing it -modulo $c$ the intermediate result is reduced modulo $c$ after every squaring or multiplication operation. - -This guarantees that any intermediate result is bounded by $0 \le d \le c^2 - 2c + 1$ and can be reduced modulo $c$ quickly using -one of the algorithms presented in ~REDUCTION~. - -Before the actual modular exponentiation algorithm can be written a wrapper algorithm must be written first. This algorithm -will allow the exponent $b$ to be negative which is computed as $c \equiv \left (1 / a \right )^{\vert b \vert} \mbox{(mod }d\mbox{)}$. The -value of $(1/a) \mbox{ mod }c$ is computed using the modular inverse (\textit{see ~MODINV~}). If no inverse exists the algorithm -terminates with an error. - -\begin{figure}[!here] -\begin{small} -\begin{center} -\begin{tabular}{l} -\hline Algorithm \textbf{mp\_exptmod}. \\ -\textbf{Input}. mp\_int $a$, $b$ and $c$ \\ -\textbf{Output}. $y \equiv g^x \mbox{ (mod }p\mbox{)}$ \\ -\hline \\ -1. If $c.sign = MP\_NEG$ return(\textit{MP\_VAL}). \\ -2. If $b.sign = MP\_NEG$ then \\ -\hspace{3mm}2.1 $g' \leftarrow g^{-1} \mbox{ (mod }c\mbox{)}$ \\ -\hspace{3mm}2.2 $x' \leftarrow \vert x \vert$ \\ -\hspace{3mm}2.3 Compute $d \equiv g'^{x'} \mbox{ (mod }c\mbox{)}$ via recursion. \\ -3. if $p$ is odd \textbf{OR} $p$ is a D.R. modulus then \\ -\hspace{3mm}3.1 Compute $y \equiv g^{x} \mbox{ (mod }p\mbox{)}$ via algorithm mp\_exptmod\_fast. \\ -4. else \\ -\hspace{3mm}4.1 Compute $y \equiv g^{x} \mbox{ (mod }p\mbox{)}$ via algorithm s\_mp\_exptmod. \\ -\hline -\end{tabular} -\end{center} -\end{small} -\caption{Algorithm mp\_exptmod} -\end{figure} - -\textbf{Algorithm mp\_exptmod.} -The first algorithm which actually performs modular exponentiation is algorithm s\_mp\_exptmod. It is a sliding window $k$-ary algorithm -which uses Barrett reduction to reduce the product modulo $p$. The second algorithm mp\_exptmod\_fast performs the same operation -except it uses either Montgomery or Diminished Radix reduction. The two latter reduction algorithms are clumped in the same exponentiation -algorithm since their arguments are essentially the same (\textit{two mp\_ints and one mp\_digit}). - -EXAM,bn_mp_exptmod.c - -In order to keep the algorithms in a known state the first step on line @29,if@ is to reject any negative modulus as input. If the exponent is -negative the algorithm tries to perform a modular exponentiation with the modular inverse of the base $G$. The temporary variable $tmpG$ is assigned -the modular inverse of $G$ and $tmpX$ is assigned the absolute value of $X$. The algorithm will recuse with these new values with a positive -exponent. - -If the exponent is positive the algorithm resumes the exponentiation. Line @63,dr_@ determines if the modulus is of the restricted Diminished Radix -form. If it is not line @65,reduce@ attempts to determine if it is of a unrestricted Diminished Radix form. The integer $dr$ will take on one -of three values. - -\begin{enumerate} -\item $dr = 0$ means that the modulus is not of either restricted or unrestricted Diminished Radix form. -\item $dr = 1$ means that the modulus is of restricted Diminished Radix form. -\item $dr = 2$ means that the modulus is of unrestricted Diminished Radix form. -\end{enumerate} - -Line @69,if@ determines if the fast modular exponentiation algorithm can be used. It is allowed if $dr \ne 0$ or if the modulus is odd. Otherwise, -the slower s\_mp\_exptmod algorithm is used which uses Barrett reduction. - -\subsection{Barrett Modular Exponentiation} - -\newpage\begin{figure}[!here] -\begin{small} -\begin{center} -\begin{tabular}{l} -\hline Algorithm \textbf{s\_mp\_exptmod}. \\ -\textbf{Input}. mp\_int $a$, $b$ and $c$ \\ -\textbf{Output}. $y \equiv g^x \mbox{ (mod }p\mbox{)}$ \\ -\hline \\ -1. $k \leftarrow lg(x)$ \\ -2. $winsize \leftarrow \left \lbrace \begin{array}{ll} - 2 & \mbox{if }k \le 7 \\ - 3 & \mbox{if }7 < k \le 36 \\ - 4 & \mbox{if }36 < k \le 140 \\ - 5 & \mbox{if }140 < k \le 450 \\ - 6 & \mbox{if }450 < k \le 1303 \\ - 7 & \mbox{if }1303 < k \le 3529 \\ - 8 & \mbox{if }3529 < k \\ - \end{array} \right .$ \\ -3. Initialize $2^{winsize}$ mp\_ints in an array named $M$ and one mp\_int named $\mu$ \\ -4. Calculate the $\mu$ required for Barrett Reduction (\textit{mp\_reduce\_setup}). \\ -5. $M_1 \leftarrow g \mbox{ (mod }p\mbox{)}$ \\ -\\ -Setup the table of small powers of $g$. First find $g^{2^{winsize}}$ and then all multiples of it. \\ -6. $k \leftarrow 2^{winsize - 1}$ \\ -7. $M_{k} \leftarrow M_1$ \\ -8. for $ix$ from 0 to $winsize - 2$ do \\ -\hspace{3mm}8.1 $M_k \leftarrow \left ( M_k \right )^2$ (\textit{mp\_sqr}) \\ -\hspace{3mm}8.2 $M_k \leftarrow M_k \mbox{ (mod }p\mbox{)}$ (\textit{mp\_reduce}) \\ -9. for $ix$ from $2^{winsize - 1} + 1$ to $2^{winsize} - 1$ do \\ -\hspace{3mm}9.1 $M_{ix} \leftarrow M_{ix - 1} \cdot M_{1}$ (\textit{mp\_mul}) \\ -\hspace{3mm}9.2 $M_{ix} \leftarrow M_{ix} \mbox{ (mod }p\mbox{)}$ (\textit{mp\_reduce}) \\ -10. $res \leftarrow 1$ \\ -\\ -Start Sliding Window. \\ -11. $mode \leftarrow 0, bitcnt \leftarrow 1, buf \leftarrow 0, digidx \leftarrow x.used - 1, bitcpy \leftarrow 0, bitbuf \leftarrow 0$ \\ -12. Loop \\ -\hspace{3mm}12.1 $bitcnt \leftarrow bitcnt - 1$ \\ -\hspace{3mm}12.2 If $bitcnt = 0$ then do \\ -\hspace{6mm}12.2.1 If $digidx = -1$ goto step 13. \\ -\hspace{6mm}12.2.2 $buf \leftarrow x_{digidx}$ \\ -\hspace{6mm}12.2.3 $digidx \leftarrow digidx - 1$ \\ -\hspace{6mm}12.2.4 $bitcnt \leftarrow lg(\beta)$ \\ -Continued on next page. \\ -\hline -\end{tabular} -\end{center} -\end{small} -\caption{Algorithm s\_mp\_exptmod} -\end{figure} - -\newpage\begin{figure}[!here] -\begin{small} -\begin{center} -\begin{tabular}{l} -\hline Algorithm \textbf{s\_mp\_exptmod} (\textit{continued}). \\ -\textbf{Input}. mp\_int $a$, $b$ and $c$ \\ -\textbf{Output}. $y \equiv g^x \mbox{ (mod }p\mbox{)}$ \\ -\hline \\ -\hspace{3mm}12.3 $y \leftarrow (buf >> (lg(\beta) - 1))$ AND $1$ \\ -\hspace{3mm}12.4 $buf \leftarrow buf << 1$ \\ -\hspace{3mm}12.5 if $mode = 0$ and $y = 0$ then goto step 12. \\ -\hspace{3mm}12.6 if $mode = 1$ and $y = 0$ then do \\ -\hspace{6mm}12.6.1 $res \leftarrow res^2$ \\ -\hspace{6mm}12.6.2 $res \leftarrow res \mbox{ (mod }p\mbox{)}$ \\ -\hspace{6mm}12.6.3 Goto step 12. \\ -\hspace{3mm}12.7 $bitcpy \leftarrow bitcpy + 1$ \\ -\hspace{3mm}12.8 $bitbuf \leftarrow bitbuf + (y << (winsize - bitcpy))$ \\ -\hspace{3mm}12.9 $mode \leftarrow 2$ \\ -\hspace{3mm}12.10 If $bitcpy = winsize$ then do \\ -\hspace{6mm}Window is full so perform the squarings and single multiplication. \\ -\hspace{6mm}12.10.1 for $ix$ from $0$ to $winsize -1$ do \\ -\hspace{9mm}12.10.1.1 $res \leftarrow res^2$ \\ -\hspace{9mm}12.10.1.2 $res \leftarrow res \mbox{ (mod }p\mbox{)}$ \\ -\hspace{6mm}12.10.2 $res \leftarrow res \cdot M_{bitbuf}$ \\ -\hspace{6mm}12.10.3 $res \leftarrow res \mbox{ (mod }p\mbox{)}$ \\ -\hspace{6mm}Reset the window. \\ -\hspace{6mm}12.10.4 $bitcpy \leftarrow 0, bitbuf \leftarrow 0, mode \leftarrow 1$ \\ -\\ -No more windows left. Check for residual bits of exponent. \\ -13. If $mode = 2$ and $bitcpy > 0$ then do \\ -\hspace{3mm}13.1 for $ix$ form $0$ to $bitcpy - 1$ do \\ -\hspace{6mm}13.1.1 $res \leftarrow res^2$ \\ -\hspace{6mm}13.1.2 $res \leftarrow res \mbox{ (mod }p\mbox{)}$ \\ -\hspace{6mm}13.1.3 $bitbuf \leftarrow bitbuf << 1$ \\ -\hspace{6mm}13.1.4 If $bitbuf$ AND $2^{winsize} \ne 0$ then do \\ -\hspace{9mm}13.1.4.1 $res \leftarrow res \cdot M_{1}$ \\ -\hspace{9mm}13.1.4.2 $res \leftarrow res \mbox{ (mod }p\mbox{)}$ \\ -14. $y \leftarrow res$ \\ -15. Clear $res$, $mu$ and the $M$ array. \\ -16. Return(\textit{MP\_OKAY}). \\ -\hline -\end{tabular} -\end{center} -\end{small} -\caption{Algorithm s\_mp\_exptmod (continued)} -\end{figure} - -\textbf{Algorithm s\_mp\_exptmod.} -This algorithm computes the $x$'th power of $g$ modulo $p$ and stores the result in $y$. It takes advantage of the Barrett reduction -algorithm to keep the product small throughout the algorithm. - -The first two steps determine the optimal window size based on the number of bits in the exponent. The larger the exponent the -larger the window size becomes. After a window size $winsize$ has been chosen an array of $2^{winsize}$ mp\_int variables is allocated. This -table will hold the values of $g^x \mbox{ (mod }p\mbox{)}$ for $2^{winsize - 1} \le x < 2^{winsize}$. - -After the table is allocated the first power of $g$ is found. Since $g \ge p$ is allowed it must be first reduced modulo $p$ to make -the rest of the algorithm more efficient. The first element of the table at $2^{winsize - 1}$ is found by squaring $M_1$ successively $winsize - 2$ -times. The rest of the table elements are found by multiplying the previous element by $M_1$ modulo $p$. - -Now that the table is available the sliding window may begin. The following list describes the functions of all the variables in the window. -\begin{enumerate} -\item The variable $mode$ dictates how the bits of the exponent are interpreted. -\begin{enumerate} - \item When $mode = 0$ the bits are ignored since no non-zero bit of the exponent has been seen yet. For example, if the exponent were simply - $1$ then there would be $lg(\beta) - 1$ zero bits before the first non-zero bit. In this case bits are ignored until a non-zero bit is found. - \item When $mode = 1$ a non-zero bit has been seen before and a new $winsize$-bit window has not been formed yet. In this mode leading $0$ bits - are read and a single squaring is performed. If a non-zero bit is read a new window is created. - \item When $mode = 2$ the algorithm is in the middle of forming a window and new bits are appended to the window from the most significant bit - downwards. -\end{enumerate} -\item The variable $bitcnt$ indicates how many bits are left in the current digit of the exponent left to be read. When it reaches zero a new digit - is fetched from the exponent. -\item The variable $buf$ holds the currently read digit of the exponent. -\item The variable $digidx$ is an index into the exponents digits. It starts at the leading digit $x.used - 1$ and moves towards the trailing digit. -\item The variable $bitcpy$ indicates how many bits are in the currently formed window. When it reaches $winsize$ the window is flushed and - the appropriate operations performed. -\item The variable $bitbuf$ holds the current bits of the window being formed. -\end{enumerate} - -All of step 12 is the window processing loop. It will iterate while there are digits available form the exponent to read. The first step -inside this loop is to extract a new digit if no more bits are available in the current digit. If there are no bits left a new digit is -read and if there are no digits left than the loop terminates. - -After a digit is made available step 12.3 will extract the most significant bit of the current digit and move all other bits in the digit -upwards. In effect the digit is read from most significant bit to least significant bit and since the digits are read from leading to -trailing edges the entire exponent is read from most significant bit to least significant bit. - -At step 12.5 if the $mode$ and currently extracted bit $y$ are both zero the bit is ignored and the next bit is read. This prevents the -algorithm from having to perform trivial squaring and reduction operations before the first non-zero bit is read. Step 12.6 and 12.7-10 handle -the two cases of $mode = 1$ and $mode = 2$ respectively. - -FIGU,expt_state,Sliding Window State Diagram - -By step 13 there are no more digits left in the exponent. However, there may be partial bits in the window left. If $mode = 2$ then -a Left-to-Right algorithm is used to process the remaining few bits. - -EXAM,bn_s_mp_exptmod.c - -Lines @26,if@ through @40,}@ determine the optimal window size based on the length of the exponent in bits. The window divisions are sorted -from smallest to greatest so that in each \textbf{if} statement only one condition must be tested. For example, by the \textbf{if} statement -on line @32,if@ the value of $x$ is already known to be greater than $140$. - -The conditional piece of code beginning on line @42,define@ allows the window size to be restricted to five bits. This logic is used to ensure -the table of precomputed powers of $G$ remains relatively small. - -The for loop on line @49,for@ initializes the $M$ array while lines @59,mp_init@ and @62,mp_reduce@ compute the value of $\mu$ required for -Barrett reduction. - --- More later. - -\section{Quick Power of Two} -Calculating $b = 2^a$ can be performed much quicker than with any of the previous algorithms. Recall that a logical shift left $m << k$ is -equivalent to $m \cdot 2^k$. By this logic when $m = 1$ a quick power of two can be achieved. - -\begin{figure}[!here] -\begin{small} -\begin{center} -\begin{tabular}{l} -\hline Algorithm \textbf{mp\_2expt}. \\ -\textbf{Input}. integer $b$ \\ -\textbf{Output}. $a \leftarrow 2^b$ \\ -\hline \\ -1. $a \leftarrow 0$ \\ -2. If $a.alloc < \lfloor b / lg(\beta) \rfloor + 1$ then grow $a$ appropriately. \\ -3. $a.used \leftarrow \lfloor b / lg(\beta) \rfloor + 1$ \\ -4. $a_{\lfloor b / lg(\beta) \rfloor} \leftarrow 1 << (b \mbox{ mod } lg(\beta))$ \\ -5. Return(\textit{MP\_OKAY}). \\ -\hline -\end{tabular} -\end{center} -\end{small} -\caption{Algorithm mp\_2expt} -\end{figure} - -\textbf{Algorithm mp\_2expt.} - -EXAM,bn_mp_2expt.c - -\chapter{Higher Level Algorithms} - -This chapter discusses the various higher level algorithms that are required to complete a well rounded multiple precision integer package. These -routines are less performance oriented than the algorithms of chapters five, six and seven but are no less important. - -The first section describes a method of integer division with remainder that is universally well known. It provides the signed division logic -for the package. The subsequent section discusses a set of algorithms which allow a single digit to be the 2nd operand for a variety of operations. -These algorithms serve mostly to simplify other algorithms where small constants are required. The last two sections discuss how to manipulate -various representations of integers. For example, converting from an mp\_int to a string of character. - -\section{Integer Division with Remainder} -MARK,DIVISION - -Integer division aside from modular exponentiation is most intensive algorithm to compute. - - -\section{Single Digit Helpers} -\subsection{Single Digit Addition} -\subsection{Single Digit Subtraction} -\subsection{Single Digit Multiplication} -\subsection{Single Digit Division} -\subsection{Single Digit Modulo} -\subsection{Single Digit Root Extraction} -\section{Random Number Generation} -\section{Formatted Output} -\subsection{Getting The Output Size} -\subsection{Generating Radix-n Output} -\subsection{Reading Radix-n Input} -\section{Unformatted Output} -\subsection{Getting The Output Size} -\subsection{Generating Output} -\subsection{Reading Input} - -\chapter{Number Theoretic Algorithms} -\section{Greatest Common Divisor} -\section{Least Common Multiple} -\section{Jacobi Symbol Computation} -\section{Modular Inverse} -MARK,MODINV -\subsection{General Case} -\subsection{Odd Moduli} -\section{Primality Tests} -\subsection{Trial Division} -\subsection{The Fermat Test} -\subsection{The Miller-Rabin Test} -\subsection{Primality Test in a Bottle} -\subsection{The Next Prime} -\section{Root Extraction} - -\backmatter -\appendix -\begin{thebibliography}{ABCDEF} -\bibitem[1]{TAOCPV2} -Donald Knuth, \textit{The Art of Computer Programming}, Third Edition, Volume Two, Seminumerical Algorithms, Addison-Wesley, 1998 - -\bibitem[2]{HAC} -A. Menezes, P. van Oorschot, S. Vanstone, \textit{Handbook of Applied Cryptography}, CRC Press, 1996 - -\bibitem[3]{ROSE} -Michael Rosing, \textit{Implementing Elliptic Curve Cryptography}, Manning Publications, 1999 - -\bibitem[4]{COMBA} -Paul G. Comba, \textit{Exponentiation Cryptosystems on the IBM PC}. IBM Systems Journal 29(4): 526-538 (1990) - -\bibitem[5]{KARA} -A. Karatsuba, Doklay Akad. Nauk SSSR 145 (1962), pp.293-294 - -\bibitem[6]{KARAP} -Andre Weimerskirch and Christof Paar, \textit{Generalizations of the Karatsuba Algorithm for Polynomial Multiplication}, Submitted to Design, Codes and Cryptography, March 2002 - -\bibitem[7]{BARRETT} -Paul Barrett, \textit{Implementing the Rivest Shamir and Adleman Public Key Encryption Algorithm on a Standard Digital Signal Processor}, Advances in Cryptology, Crypto '86, Springer-Verlag. - -\bibitem[8]{MONT} -P.L.Montgomery. \textit{Modular multiplication without trial division}. Mathematics of Computation, 44(170):519-521, April 1985. - -\bibitem[9]{DRMET} -Chae Hoon Lim and Pil Joong Lee, \textit{Generating Efficient Primes for Discrete Log Cryptosystems}, POSTECH Information Research Laboratories - -\bibitem[10]{MMB} -J. Daemen and R. Govaerts and J. Vandewalle, \textit{Block ciphers based on Modular Arithmetic}, State and {P}rogress in the {R}esearch of {C}ryptography, 1993, pp. 80-89 - -\end{thebibliography} - -\input{tommath.ind} - -\chapter{Appendix} -\subsection*{Appendix A -- Source Listing of tommath.h} - -The following is the source listing of the header file ``tommath.h'' for the LibTomMath project. It contains many of -the definitions used throughout the code such as \textbf{mp\_int}, \textbf{MP\_PREC} and so on. The header is -presented here for completeness. - -LIST,tommath.h - -\end{document} \ No newline at end of file diff --git a/tommath.tex b/tommath.tex deleted file mode 100644 index cd2a97e..0000000 --- a/tommath.tex +++ /dev/null @@ -1,8141 +0,0 @@ -\documentclass[b5paper]{book} -\usepackage{hyperref} -\usepackage{makeidx} -\usepackage{amssymb} -\usepackage{color} -\usepackage{alltt} -\usepackage{graphicx} -\usepackage{layout} -\def\union{\cup} -\def\intersect{\cap} -\def\getsrandom{\stackrel{\rm R}{\gets}} -\def\cross{\times} -\def\cat{\hspace{0.5em} \| \hspace{0.5em}} -\def\catn{$\|$} -\def\divides{\hspace{0.3em} | \hspace{0.3em}} -\def\nequiv{\not\equiv} -\def\approx{\raisebox{0.2ex}{\mbox{\small $\sim$}}} -\def\lcm{{\rm lcm}} -\def\gcd{{\rm gcd}} -\def\log{{\rm log}} -\def\ord{{\rm ord}} -\def\abs{{\mathit abs}} -\def\rep{{\mathit rep}} -\def\mod{{\mathit\ mod\ }} -\renewcommand{\pmod}[1]{\ ({\rm mod\ }{#1})} -\newcommand{\floor}[1]{\left\lfloor{#1}\right\rfloor} -\newcommand{\ceil}[1]{\left\lceil{#1}\right\rceil} -\def\Or{{\rm\ or\ }} -\def\And{{\rm\ and\ }} -\def\iff{\hspace{1em}\Longleftrightarrow\hspace{1em}} -\def\implies{\Rightarrow} -\def\undefined{{\rm ``undefined"}} -\def\Proof{\vspace{1ex}\noindent {\bf Proof:}\hspace{1em}} -\let\oldphi\phi -\def\phi{\varphi} -\def\Pr{{\rm Pr}} -\newcommand{\str}[1]{{\mathbf{#1}}} -\def\F{{\mathbb F}} -\def\N{{\mathbb N}} -\def\Z{{\mathbb Z}} -\def\R{{\mathbb R}} -\def\C{{\mathbb C}} -\def\Q{{\mathbb Q}} -\definecolor{DGray}{gray}{0.5} -\newcommand{\emailaddr}[1]{\mbox{$<${#1}$>$}} -\def\twiddle{\raisebox{0.3ex}{\mbox{\tiny $\sim$}}} -\def\gap{\vspace{0.5ex}} -\makeindex -\begin{document} -\frontmatter -\pagestyle{empty} -\title{Multiple-Precision Integer Arithmetic, \\ A Case Study Involving the LibTomMath Project \\ - DRAFT - } -\author{\mbox{ -%\begin{small} -\begin{tabular}{c} -Tom St Denis \\ -Algonquin College \\ -\\ -Mads Rasmussen \\ -Open Communications Security \\ -\\ -Greg Rose \\ -QUALCOMM Australia \\ -\end{tabular} -%\end{small} -} -} -\maketitle -This text in its entirety is copyright \copyright{}2003 by Tom St Denis. It may not be redistributed -electronically or otherwise without the sole permission of the author. The text is freely redistributable as long as -it is packaged along with the LibTomMath library in a non-commercial project. Contact the -author for other redistribution rights. - -This text corresponds to the v0.17 release of the LibTomMath project. - -\begin{alltt} -Tom St Denis -111 Banning Rd -Ottawa, Ontario -K2L 1C3 -Canada - -Phone: 1-613-836-3160 -Email: tomstdenis@iahu.ca -\end{alltt} - -This text is formatted to the international B5 paper size of 176mm wide by 250mm tall using the \LaTeX{} -{\em book} macro package and the Perl {\em booker} package. - -\tableofcontents -\listoffigures -\chapter*{Preface} -Blah. - -\mainmatter -\pagestyle{headings} -\chapter{Introduction} -\section{Multiple Precision Arithmetic} -\subsection{The Need for Multiple Precision Arithmetic} -The most prevalent use for multiple precision arithmetic (\textit{often referred to as bignum math}) is within public -key cryptography. Algorithms such as RSA, Diffie-Hellman and Elliptic Curve Cryptography require large integers in order to -resist known cryptanalytic attacks. Typical modern programming languages such as C and Java only provide small -single-precision data types which are incapable of precisely representing integers which are often hundreds of bits long. - -For example, consider multiplying $1,234,567$ by $9,876,543$ in C with an ``unsigned long'' data type. With an -x86 machine the result is $4,136,875,833$ while the true result is $12,193,254,061,881$. The original inputs -were approximately $21$ and $24$ bits respectively. If the C language cannot multiply two relatively small values -together precisely how does anyone expect it to multiply two values that are considerably larger? - -Most advancements in fast multiple precision arithmetic stem from the desire for faster cryptographic primitives. However, cryptography -is not the only field of study that can benefit from fast large integer routines. Another auxiliary use for multiple precision integers is -high precision floating point data types. The basic IEEE standard floating point type is made up of an integer mantissa $q$ and an exponent $e$. -Numbers are given in the form $n = q \cdot b^e$ where $b = 2$ is specified. Since IEEE is meant to be implemented in -hardware the precision of the mantissa is often fairly small (\textit{23, 48 and 64 bits}). Since the mantissa is merely an -integer a large multiple precision integer could be used. In effect very high precision floating point arithmetic -could be performed. This would be useful where scientific applications must minimize the total output error over long simulations. - -\subsection{Multiple Precision Arithmetic} -\index{multiple precision} -Multiple precision arithmetic attempts to the solve the shortcomings of single precision data types such as those from -the C and Java programming languages. In essence multiple precision arithmetic is a set of operations that can be -performed on members of an algebraic group whose precision is not fixed. The algorithms when implemented to be multiple -precision can allow a developer to work with any practical precision required. - -Typically the arithmetic over the ring of integers denoted by $\Z$ is performed by routines that are collectively and -casually referred to as ``bignum'' routines. However, it is possible to have rings of polynomials as well typically -denoted by $\Z/p\Z \left [ X \right ]$ which could have variable precision (\textit{or degree}). This text will -discuss implementation of the former, however implementing polynomial basis routines should be relatively easy after reading this text. - -\subsection{Benefits of Multiple Precision Arithmetic} -\index{precision} \index{accuracy} -Precision of the real value to a given precision is defined loosely as the proximity of the real value to a given representation. -Accuracy is defined as the reproducibility of the result. For example, the calculation $1/3 = 0.25$ is imprecise but can be accurate provided -it is reproducible. - -The benefit of multiple precision representations over single precision representations is that -often no precision is lost while representing the result of an operation which requires excess precision. For example, -the multiplication of two $n$-bit integers requires at least $2n$ bits to represent the result. A multiple precision -system would augment the precision of the destination to accomodate the result while a single precision system would -truncate excess bits to maintain a fixed level of precision. - -Multiple precision representations allow for the precision to be very high (\textit{if not exacting}) but at a cost of -modest computer resources. The only reasonable case where a multiple precision system will lose precision is when -emulating a floating point data type. However, with multiple precision integer arithmetic no precision is lost. - -\subsection{Basis of Operations} -At the heart of all multiple precision integer operations are the ``long-hand'' algorithms we all learned as children -in grade school. For example, to multiply $1,234$ by $981$ the student is not taught to memorize the times table for -$1,234$, instead they are taught how to long-multiply. That is to multiply each column using simple single digit -multiplications, line up the partial results, and add the resulting products by column. The representation that most -are familiar with is known as decimal or formally as radix-10. A radix-$n$ representation simply means there are -$n$ possible values per digit. For example, binary would be a radix-2 representation. - -In essence computer based multiple precision arithmetic is very much the same. The most notable difference is the usage -of a binary friendly radix. That is to use a radix of the form $2^k$ where $k$ is typically the size of a machine -register. Also occasionally more optimal algorithms are used to perform certain operations such as multiplication and -squaring instead of traditional long-hand algorithms. - -\section{Purpose of This Text} -The purpose of this text is to instruct the reader regarding how to implement multiple precision algorithms. That is -to not only explain the core theoretical algorithms but also the various ``house keeping'' tasks that are neglected by -authors of other texts on the subject. Texts such as \cite[HAC]{HAC} and \cite{TAOCPV2} give considerably detailed -explanations of the theoretical aspects of the algorithms and very little regarding the practical aspects. - -How an algorithm is explained and how it is actually implemented are two very different -realities. For example, algorithm 14.7 on page 594 of HAC lists a relatively simple algorithm for performing multiple -precision integer addition. However, what the description lacks is any discussion concerning the fact that the two -integer inputs may be of differing magnitudes. Similarly the division routine (\textit{Algorithm 14.20, pp. 598}) -does not discuss how to handle sign or handle the dividend's decreasing magnitude in the main loop (\textit{Step \#3}). - -As well as the numerous practical oversights both of the texts do not discuss several key optimal algorithms required -such as ``Comba'' and Karatsuba multipliers and fast modular inversion. These optimal algorithms are vital to achieve -any form of useful performance in non-trivial applications. - -To solve this problem the focus of this text is on the practical aspects of implementing the algorithms that -constitute a multiple precision integer package with light discussions on the theoretical aspects. As a case -study the ``LibTomMath''\footnote{Available freely at http://math.libtomcrypt.org} package is used to demonstrate -algorithms with implementations that have been field tested and work very well. - -\section{Discussion and Notation} -\subsection{Notation} -A multiple precision integer of $n$-digits shall be denoted as $x = (x_n ... x_1 x_0)_{ \beta }$ to be the -multiple precision notation for the integer $x \equiv \sum_{i=0}^{n} x_i\beta^i$. The elements of the array $x$ are -said to be the radix $\beta$ digits of the integer. For example, $x = (1,2,3)_{10}$ would represent the -integer $1\cdot 10^2 + 2\cdot10^1 + 3\cdot10^0 = 123$. - -A ``mp\_int'' shall refer to a composite structure which contains the digits of the integer as well as auxilary data -required to manipulate the data. These additional members are discussed in chapter three. For the purposes of this text -a ``multiple precision integer'' and a ``mp\_int'' are synonymous. - -\index{single-precision} \index{double-precision} \index{mp\_digit} \index{mp\_word} -For the purposes of this text a single-precision variable must be able to represent integers in the range $0 \le x < 2 \beta$ while -a double-precision variable must be able to represent integers in the range $0 \le x < 2 \beta^2$. Within the source code that will be -presented the data type \textbf{mp\_digit} will represent a single-precision type while \textbf{mp\_word} will represent a -double-precision type. In several algorithms (\textit{notably the Comba routines}) temporary results -will be stored in a double-precision arrays. For the purposes of this text $x_j$ will refer to the -$j$'th digit of a single-precision array and $\hat x_j$ will refer to the $j$'th digit of a double-precision -array. - -The $\lfloor \mbox{ } \rfloor$ brackets represent a value truncated and rounded down to the nearest integer. The $\lceil \mbox{ } \rceil$ brackets -represent a value truncated and rounded up to the nearest integer. Typically when the $/$ division symbol is used the intention is to perform an integer -division. For example, $5/2 = 2$ which will often be written as $\lfloor 5/2 \rfloor = 2$ for clarity. When a value is presented as a fraction -such as $5 \over 2$ a real value division is implied. - -\subsection{Work Effort} -\index{big-O} -To measure the efficiency of various algorithms a modified big-O notation is used. In this system all -single precision operations are considered to have the same cost\footnote{Except where explicitly noted.}. -That is a single precision addition, multiplication and division are assumed to take the same time to -complete. While this is generally not true in practice it will simplify the discussions considerably. - -Some algorithms have slight advantages over others which is why some constants will not be removed in -the notation. For example, a normal multiplication requires $O(n^2)$ work while a squaring requires -$O({{n^2 + n}\over 2})$ work. In standard big-O notation these would be said to be equivalent. However, in the -context of the this text the magnitude of the inputs will not approach an infinite size. This means the conventional limit -notation wisdom does not apply to the cancellation of constants. - -Throughout the discussions various ``work levels'' will be discussed. These levels are the $O(1)$, -$O(n)$, $O(n^2)$, ..., $O(n^k)$ work efforts. For example, operations at the $O(n^k)$ ``level'' are said to be -executed more frequently than operations at the $O(n^m)$ ``level'' when $k > m$. Obviously most optimizations will pay -off the most at the higher levels since they represent the bulk of the effort required. - -\section{Exercises} -Within the more advanced chapters a section will be set aside to give the reader some challenging exercises. These exercises are not -designed to be prize winning problems, but to be thought provoking. Wherever possible the problems are forward minded stating -problems that will be answered in subsequent chapters. The reader is encouraged to finish the exercises as they appear to get a -better understanding of the subject material. - -Similar to the exercises of \cite{TAOCPV2} as explained on pp.\textit{ix} these exercises are given a scoring system. However, unlike -\cite{TAOCPV2} the problems do not get nearly as hard as often. The scoring of these exercises ranges from one (\textit{the easiest}) to -five (\textit{the hardest}). The following table sumarizes the scoring. - -\vspace{5mm} -\begin{tabular}{cl} -$\left [ 1 \right ]$ & An easy problem that should only take the reader a manner of \\ - & minutes to solve. Usually does not involve much computer time. \\ - & \\ -$\left [ 2 \right ]$ & An easy problem that involves a marginal amount of computer \\ - & time usage. Usually requires a program to be written to \\ - & solve the problem. \\ - & \\ -$\left [ 3 \right ]$ & A moderately hard problem that requires a non-trivial amount \\ - & of work. Usually involves trivial research and development of \\ - & new theory from the perspective of a student. \\ - & \\ -$\left [ 4 \right ]$ & A moderately hard problem that involves a non-trivial amount \\ - & of work and research. The solution to which will demonstrate \\ - & a higher mastery of the subject matter. \\ - & \\ -$\left [ 5 \right ]$ & A hard problem that involves concepts that are non-trivial. \\ - & Solutions to these problems will demonstrate a complete mastery \\ - & of the given subject. \\ - & \\ -\end{tabular} - -Essentially problems at the first level are meant to be simple questions that the reader can answer quickly without programming a solution or -devising new theory. These problems are quick tests to see if the material is understood. Problems at the second level are also -designed to be easy but will require a program or algorithm to be implemented to arrive at the answer. - -Problems at the third level are meant to be a bit more difficult. Often the answer is fairly obvious but arriving at an exacting solution -requires some thought and skill. These problems will almost always involve devising a new algorithm or implementing a variation of -another algorithm. - -Problems at the fourth level are meant to be even more difficult as well as involve some research. The reader will most likely not know -the answer right away nor will this text provide the exact details of the answer (\textit{or at least not until a subsequent chapter}). Problems -at the fifth level are meant to be the hardest problems relative to all the other problems in the chapter. People who can correctly -answer fifth level problems have a mastery of the subject matter at hand. - -Often problems will be tied together. The purpose of this is to start a chain of thought that will be discussed in future chapters. The reader -is encouraged to answer the follow-up problems and try to draw the relevence of problems. - -\chapter{Introduction to LibTomMath} - -\section{What is LibTomMath?} -LibTomMath is a free and open source multiple precision library written in portable ISO C source code. By portable it is -meant that the library does not contain any code that is computer platform dependent or otherwise problematic to use on any -given platform. The library has been successfully tested under numerous operating systems including Solaris, MacOS, Windows, -Linux, PalmOS and on standalone hardware such as the Gameboy Advance. The library is designed to contain enough -functionality to be able to develop applications such as public key cryptosystems. - -\section{Goals of LibTomMath} - -Even though the library is written entirely in portable ISO C considerable care has been taken to -optimize the algorithm implementations within the library. Specifically the code has been written to work well with -the GNU C Compiler (\textit{GCC}) on both x86 and ARMv4 processors. Wherever possible highly efficient -algorithms (\textit{such as Karatsuba multiplication, sliding window exponentiation and Montgomery reduction}) have -been provided to make the library as efficient as possible. Even with the optimal and sometimes specialized -algorithms that have been included the Application Programing Interface (\textit{API}) has been kept as simple as possible. -Often generic place holder routines will make use of specialized algorithms automatically without the developer's -attention. One such example is the generic multiplication algorithm \textbf{mp\_mul()} which will automatically use -Karatsuba multiplication if the inputs are of a specific size. - -Making LibTomMath as efficient as possible is not the only goal of the LibTomMath project. Ideally the library should -be source compatible with another popular library which makes it more attractive for developers to use. In this case the -MPI library was used as a API template for all the basic functions. - -The project is also meant to act as a learning tool for students. The logic being that no easy-to-follow ``bignum'' -library exists which can be used to teach computer science students how to perform fast and reliable multiple precision -arithmetic. To this end the source code has been given quite a few comments and algorithm discussion points. Often routines have -more comments than lines of code. - -\section{Choice of LibTomMath} -LibTomMath was chosen as the case study of this text not only because the author of both projects is one and the same but -for more worthy reasons. Other libraries such as GMP, MPI, LIP and OpenSSL have multiple precision -integer arithmetic routines but would not be ideal for this text for reasons as will be explained in the -following sub-sections. - -\subsection{Code Base} -The LibTomMath code base is all portable ISO C source code. This means that there are no platform dependent conditional -segments of code littered throughout the source. This clean and uncluttered approach to the library means that a -developer can more readily ascertain the true intent of a given section of source code without trying to keep track of -what conditional code will be used. - -The code base of LibTomMath is also well organized. Each function is in its own separate source code file -which allows the reader to find a given function very fast. When compiled with GCC for the x86 processor the entire -library is a mere 87,760 bytes (\textit{$116,182$ bytes for ARMv4 processors}). This includes every single function -LibTomMath provides from basic arithmetic to various number theoretic functions such as modular exponentiation, various -reduction algorithms and Jacobi symbol computation. - -By comparison MPI which has fewer functions than LibTomMath compiled with the same conditions is 45,429 bytes -(\textit{$54,536$ for ARMv4}). GMP which has rather large collection of functions with the default configuration on an -x86 Athlon is 2,950,688 bytes. Note that while LibTomMath has fewer functions than GMP it has been used as the sole basis -for several public key cryptosystems without having to seek additional outside functions to supplement the library. - -\subsection{API Simplicity} -LibTomMath is designed after the MPI library and shares the API design. Quite often programs that use MPI will build -with LibTomMath without change. The function names are relatively straight forward as to what they perform. Almost all of the -functions except for a few minor exceptions which as will be discussed are for good reasons share the same parameter passing -convention. The learning curve is fairly shallow with the API provided which is an extremely valuable benefit for the -student and developer alike. - -The LIP library is an example of a library with an API that is awkward to work with. LIP uses function names that are often ``compressed'' to -illegible short hand. LibTomMath does not share this fault. - -\subsection{Optimizations} -While LibTomMath is certainly not the fastest library (\textit{GMP often beats LibTomMath by a factor of two}) it does -feature a set of optimal algorithms for tasks ranging from modular reduction to squaring. GMP and LIP also feature -such optimizations while MPI only uses baseline algorithms with no optimizations. - -LibTomMath is almost always an order of magnitude faster than the MPI library at computationally expensive tasks such as modular -exponentiation. In the grand scheme of ``bignum'' libraries LibTomMath is faster than the average library and usually -slower than the best libraries such as GMP and OpenSSL by a small factor. - -\subsection{Portability and Stability} -LibTomMath will build ``out of the box'' on any platform equipped with a modern version of the GNU C Compiler -(\textit{GCC}). This means that without changes the library will build without configuration or setting up any -variables. LIP and MPI will build ``out of the box'' as well but have numerous known bugs. Most notably the author of -MPI is not working on his library anymore. - -GMP requires a configuration script to run and will not build out of the box. GMP and LibTomMath are still in active -development and are very stable across a variety of platforms. - -\subsection{Choice} -LibTomMath is a relatively compact, well documented, highly optimized and portable library which seems only natural for -the case study of this text. Various source files from the LibTomMath project will be included within the text. However, the -reader is encouraged to download their own copy of the library to actually be able to work with the library. - -\chapter{Getting Started} -\section{Library Basics} -To begin the design of a multiple precision integer library a primitive data type and a series of primitive algorithms must be established. A data -type that will hold the information required to maintain a multiple precision integer must be designed. With this basic data type of a series -of low level algorithms for initializing, clearing, growing and optimizing multiple precision integers can be developed to form the basis of -the entire library of algorithms. - -\section{What is a Multiple Precision Integer?} -Recall that most programming languages (\textit{in particular C}) only have fixed precision data types that on their own cannot be used -to represent values larger than their precision alone will allow. The purpose of multiple precision algorithms is to use these fixed precision -data types to create multiple precision integers which may represent values that are much larger. - -As a well known analogy, school children are taught how to form numbers larger than nine by prepending more radix ten digits. In the decimal system -the largest value is only $9$ since the digits may only have values from $0$ to $9$. However, by concatenating digits together larger numbers -may be represented. Computer based multiple precision arithmetic is essentially the same concept except with a different radix. - -What most people probably do not think about explicitly are the various other attributes that describe a multiple precision integer. For example, -the integer $154_{10}$ has two immediately obvious properties. First, the integer is positive, that is the sign of this particular integer -is positive as oppose to negative. Second, the integer has three digits in its representation. There is an additional property that the integer -posesses that does not concern pencil-and-paper arithmetic. The third property is how many digits are allowed for the integer. - -The human analogy of this third property is ensuring there is enough space on the paper to right the integer. Computers must maintain a -strict control on memory usage with respect to the digits of a multiple precision integer. These three properties make up what is known -as a multiple precision integer or mp\_int for short. - -\subsection{The mp\_int structure} -The mp\_int structure is the ISO C based manifestation of what represents a multiple precision integer. The ISO C standard does not provide for -any such data type but it does provide for making composite data types known as structures. The following is the structure definition -used within LibTomMath. - -\index{mp\_int} -\begin{verbatim} -typedef struct { - int used, alloc, sign; - mp_digit *dp; -} mp_int; -\end{verbatim} - -The mp\_int structure can be broken down as follows. - -\begin{enumerate} -\item The \textbf{used} parameter denotes how many digits of the array \textbf{dp} contain the digits used to represent -a given integer. The \textbf{used} count must not exceed the \textbf{alloc} count. - -\item The array \textbf{dp} holds the digits that represent the given integer. It is padded with $\textbf{alloc} - \textbf{used}$ zero -digits. - -\item The \textbf{alloc} parameter denotes how -many digits are available in the array to use by functions before it has to increase in size. When the \textbf{used} count -of a result would exceed the \textbf{alloc} count all of the algorithms will automatically increase the size of the -array to accommodate the precision of the result. - -\item The \textbf{sign} parameter denotes the sign as either zero/positive (\textbf{MP\_ZPOS}) or negative (\textbf{MP\_NEG}). -\end{enumerate} - -\section{Argument Passing} -A convention of argument passing must be adopted early on in the development of any library. Making the function prototypes -consistent will help eliminate many headaches in the future as the library grows to significant complexity. In LibTomMath the multiple precision -integer functions accept parameters from left to right as pointers to mp\_int structures. That means that the source operands are -placed on the left and the destination on the right. Consider the following examples. - -\begin{verbatim} - mp_mul(&a, &b, &c); /* c = a * b */ - mp_add(&a, &b, &a); /* a = a + b */ - mp_sqr(&a, &b); /* b = a * a */ -\end{verbatim} - -The left to right order is a fairly natural way to implement the functions since it lets the developer read aloud the -functions and make sense of them. For example, the first function would read ``multiply a and b and store in c''. - -Certain libraries (\textit{LIP by Lenstra for instance}) accept parameters the other way around. That is the destination -on the left and arguments on the right. In truth it is entirely a matter of preference. In the case of LibTomMath the -convention from the MPI library has been adopted. - -Another very useful design consideration is whether to allow argument sources to also be a destination. For example, the -second example (\textit{mp\_add}) adds $a$ to $b$ and stores in $a$. This is an important feature to implement since it -allows the higher up functions to cut down on the number of variables. However, to implement this feature specific -care has to be given to ensure the destination is not modified before the source is fully read. - -\section{Return Values} -A well implemented library, no matter what its purpose, should trap as many runtime errors as possible and return them to the -caller. By catching runtime errors a library can be guaranteed to prevent undefined behaviour. In a multiple precision -library the only errors that can occur occur are related to inappropriate inputs (\textit{division by zero for instance}) or -memory allocation errors. - -In LibTomMath any function that can cause a runtime error will return an error as an \textbf{int} data type with one of the -following values. - -\index{MP\_OKAY} \index{MP\_VAL} \index{MP\_MEM} -\begin{center} -\begin{tabular}{|l|l|} -\hline \textbf{Value} & \textbf{Meaning} \\ -\hline \textbf{MP\_OKAY} & The function was successful \\ -\hline \textbf{MP\_VAL} & One of the input value(s) was invalid \\ -\hline \textbf{MP\_MEM} & The function ran out of heap memory \\ -\hline -\end{tabular} -\end{center} - -When an error is detected within a function it should free any memory it allocated and return as soon as possible. The goal -is to leave the system in the same state the system was when the function was called. Error checking with this style of API is fairly simple. - -\begin{verbatim} - int err; - if ((err = mp_add(&a, &b, &c)) != MP_OKAY) { - printf("Error: %d\n", err); - exit(EXIT_FAILURE); - } -\end{verbatim} - -The GMP library uses C style \textit{signals} to flag errors which is of questionable use. Not all errors are fatal -and it was not deemed ideal by the author of LibTomMath to force developers to have signal handlers for such cases. - -\section{Initialization and Clearing} -The logical starting point when actually writing multiple precision integer functions is the initialization and -clearing of the integers. These two functions will be used by far the most throughout the algorithms whenever -temporary integers are required. - -Given the basic mp\_int structure an initialization routine must first allocate memory to hold the digits of -the integer. Often it is optimal to allocate a sufficiently large pre-set number of digits even considering -the initial integer will represent zero. If only a single digit were allocated quite a few re-allocations -would occur for the majority of inputs. There is a tradeoff between how many default digits to allocate -and how many re-allocations are tolerable. - -If the memory for the digits has been successfully allocated then the rest of the members of the structure must -be initialized. Since the initial state is to represent a zero integer the digits allocated must all be zeroed. The -\textbf{used} count set to zero and \textbf{sign} set to \textbf{MP\_ZPOS}. - -\subsection{Initializing an mp\_int} -To initialize an mp\_int the mp\_init algorithm shall be used. The purpose of this algorithm is to allocate -the memory required and initialize the integer to a default representation of zero. - -\begin{figure}[here] -\begin{center} -\begin{tabular}{l} -\hline Algorithm \textbf{mp\_init}. \\ -\textbf{Input}. An mp\_int $a$ \\ -\textbf{Output}. Allocate memory for the digits and set to a zero state. \\ -\hline \\ -1. Allocate memory for \textbf{MP\_PREC} digits. \\ -2. If the allocation failed then return(\textit{MP\_MEM}) \\ -3. for $n$ from $0$ to $MP\_PREC - 1$ do \\ -\hspace{3mm}3.1 $a_n \leftarrow 0$\\ -4. $a.sign \leftarrow MP\_ZPOS$\\ -5. $a.used \leftarrow 0$\\ -6. $a.alloc \leftarrow MP\_PREC$\\ -7. Return(\textit{MP\_OKAY})\\ -\hline -\end{tabular} -\end{center} -\caption{Algorithm mp\_init} -\end{figure} - -\textbf{Algorithm mp\_init.} -The \textbf{MP\_PREC} variable is a simple constant used to dictate minimal precision of allocated integers. It is ideally at least equal to $32$ but -can be any reasonable power of two. Steps one and two allocate the memory and account for it. If the allocation fails the algorithm returns -immediately to signal the failure. Step three will ensure that all the digits are in the default state of zero. Finally steps -four through six set the default settings of the \textbf{sign}, \textbf{used} and \textbf{alloc} members of the mp\_int structure. - -\index{bn\_mp\_init.c} -\vspace{+3mm}\begin{small} -\hspace{-5.1mm}{\bf File}: bn\_mp\_init.c -\vspace{-3mm} -\begin{alltt} -016 -017 /* init a new bigint */ -018 int -019 mp_init (mp_int * a) -020 \{ -021 /* allocate ram required and clear it */ -022 a->dp = OPT_CAST calloc (sizeof (mp_digit), MP_PREC); -023 if (a->dp == NULL) \{ -024 return MP_MEM; -025 \} -026 -027 /* set the used to zero, allocated digits to the default precision -028 * and sign to positive */ -029 a->used = 0; -030 a->alloc = MP_PREC; -031 a->sign = MP_ZPOS; -032 -033 return MP_OKAY; -034 \} -\end{alltt} -\end{small} - -The \textbf{OPT\_CAST} type cast on line 22 is designed to allow C++ compilers to build the code out of -the box. Microsoft C V5.00 is known to cause problems without the cast. Also note that if the memory -allocation fails the other members of the mp\_int will be in an undefined state. The code from -line 29 to line 31 sets the default state for a mp\_int which is zero, positive and no used digits. - -\subsection{Clearing an mp\_int} -When an mp\_int is no longer required the memory allocated for it can be cleared from the heap with -the mp\_clear algorithm. - -\begin{figure}[here] -\begin{center} -\begin{tabular}{l} -\hline Algorithm \textbf{mp\_clear}. \\ -\textbf{Input}. An mp\_int $a$ \\ -\textbf{Output}. The memory for $a$ is cleared. \\ -\hline \\ -1. If $a$ has been previously freed then return(\textit{MP\_OKAY}). \\ -2. Free the digits of $a$ and mark $a$ as freed. \\ -3. $a.used \leftarrow 0$ \\ -4. $a.alloc \leftarrow 0$ \\ -5. Return(\textit{MP\_OKAY}). \\ -\hline -\end{tabular} -\end{center} -\caption{Algorithm mp\_clear} -\end{figure} - -\textbf{Algorithm mp\_clear.} -In steps one and two the memory for the digits are only free'd if they had not been previously released before. -This is more of concern for the implementation since it is used to prevent ``double-free'' errors. It also helps catch -code errors where mp\_ints are used after being cleared. Similarly steps three and four set the -\textbf{used} and \textbf{alloc} to known values which would be easy to spot during debugging. For example, if an mp\_int is expected -to be non-zero and its \textbf{used} member is observed to be zero (\textit{due to being cleared}) then an obvious bug in the code has been -spotted. - -\index{bn\_mp\_clear.c} -\vspace{+3mm}\begin{small} -\hspace{-5.1mm}{\bf File}: bn\_mp\_clear.c -\vspace{-3mm} -\begin{alltt} -016 -017 /* clear one (frees) */ -018 void -019 mp_clear (mp_int * a) -020 \{ -021 if (a->dp != NULL) \{ -022 -023 /* first zero the digits */ -024 memset (a->dp, 0, sizeof (mp_digit) * a->used); -025 -026 /* free ram */ -027 free (a->dp); -028 -029 /* reset members to make debugging easier */ -030 a->dp = NULL; -031 a->alloc = a->used = 0; -032 \} -033 \} -\end{alltt} -\end{small} - -The \textbf{if} statement on line 21 prevents the heap from being corrupted if a user double-frees an -mp\_int. For example, a trivial case of this bug would be as follows. - -\begin{verbatim} -mp_int a; -mp_init(&a); -mp_clear(&a); -mp_clear(&a); -\end{verbatim} - -Without that check the code would try to free the memory allocated for the digits twice which will cause most standard C -libraries to cause a fault. Also by setting the pointer to \textbf{NULL} it helps debug code that may inadvertently -free the mp\_int before it is truly not needed. The allocated digits are set to zero before being freed on line 24. -This is ideal for cryptographic situations where the mp\_int is a secret parameter. - -The following snippet is an example of using both the init and clear functions. - -\begin{small} -\begin{verbatim} -#include -#include -#include -int main(void) -{ - mp_int num; - int err; - - /* init the bignum */ - if ((err = mp_init(&num)) != MP_OKAY) { - printf("Error: %d\n", err); - return EXIT_FAILURE; - } - - /* do work with it ... */ - - /* clear up */ - mp_clear(&num); - - return EXIT_SUCCESS; -} -\end{verbatim} -\end{small} - -\section{Other Initialization Routines} - -It is often helpful to have specialized initialization algorithms to simplify the design of other algorithms. For example, an -initialization followed by a copy is a common operation when temporary copies of integers are required. It is quite -beneficial to have a series of simple helper functions available. - -\subsection{Initializing Variable Sized mp\_int Structures} -Occasionally the number of digits required will be known in advance of an initialization. In these -cases the mp\_init\_size algorithm can be of use. The purpose of this algorithm is similar to mp\_init except that -it will allocate \textit{at least} a specified number of digits. This is ideal to prevent re-allocations when the -input size is known. - -\newpage\begin{figure}[here] -\begin{center} -\begin{tabular}{l} -\hline Algorithm \textbf{mp\_init\_size}. \\ -\textbf{Input}. An mp\_int $a$ and the requested number of digits $b$\\ -\textbf{Output}. $a$ is initialized to hold at least $b$ digits. \\ -\hline \\ -1. $u \leftarrow b\mbox{ (mod }MP\_PREC\mbox{)}$ \\ -2. $v \leftarrow b + 2 \cdot MP\_PREC - u$ \\ -3. Allocate $v$ digits. \\ -4. If the allocation failed then return(\textit{MP\_MEM}). \\ -5. for $n$ from $0$ to $v - 1$ do \\ -\hspace{3mm}5.1 $a_n \leftarrow 0$ \\ -6. $a.sign \leftarrow MP\_ZPOS$\\ -7. $a.used \leftarrow 0$\\ -8. $a.alloc \leftarrow v$\\ -9. Return(\textit{MP\_OKAY})\\ -\hline -\end{tabular} -\end{center} -\caption{Algorithm mp\_init\_size} -\end{figure} - -\textbf{Algorithm mp\_init\_size.} -The value of $v$ is calculated to be at least the requested amount of digits $b$ plus additional padding. The padding is calculated -to be at least \textbf{MP\_PREC} digits plus enough digits to make the digit count a multiple of \textbf{MP\_PREC}. This padding is used to -prevent trivial allocations from becoming a bottleneck in the rest of the algorithms that depend on this. - -\index{bn\_mp\_init\_size.c} -\vspace{+3mm}\begin{small} -\hspace{-5.1mm}{\bf File}: bn\_mp\_init\_size.c -\vspace{-3mm} -\begin{alltt} -016 -017 /* init a mp_init and grow it to a given size */ -018 int -019 mp_init_size (mp_int * a, int size) -020 \{ -021 -022 /* pad size so there are always extra digits */ -023 size += (MP_PREC * 2) - (size & (MP_PREC - 1)); -024 -025 /* alloc mem */ -026 a->dp = OPT_CAST calloc (sizeof (mp_digit), size); -027 if (a->dp == NULL) \{ -028 return MP_MEM; -029 \} -030 a->used = 0; -031 a->alloc = size; -032 a->sign = MP_ZPOS; -033 -034 return MP_OKAY; -035 \} -\end{alltt} -\end{small} - -Line 23 will ensure that the number of digits actually allocated is padded up to the next multiple of -\textbf{MP\_PREC} plus an additional \textbf{MP\_PREC}. This ensures that the number of allocated digit is -always greater than the amount requested. As a result it prevents many trivial memory allocations. The value of -\textbf{MP\_PREC} is defined in ``tommath.h'' and must be a power of two. - -\subsection{Creating a Clone} -Another common sequence of operations is to make a local temporary copy of an argument. To initialize then copy a mp\_int will be known as -creating a clone. This is useful within functions that need to modify an integer argument but do not wish to actually modify the original copy. -The mp\_init\_copy algorithm will perform this very task. - -\begin{figure}[here] -\begin{center} -\begin{tabular}{l} -\hline Algorithm \textbf{mp\_init\_copy}. \\ -\textbf{Input}. An mp\_int $a$ and $b$\\ -\textbf{Output}. $a$ is initialized to be a copy of $b$. \\ -\hline \\ -1. Init $a$. (\textit{mp\_init}) \\ -2. If the init of $a$ was unsuccessful return(\textit{MP\_MEM}) \\ -3. Copy $b$ to $a$. (\textit{mp\_copy}) \\ -4. Return the status of the copy operation. \\ -\hline -\end{tabular} -\end{center} -\caption{Algorithm mp\_init\_copy} -\end{figure} - -\textbf{Algorithm mp\_init\_copy.} -This algorithm will initialize a mp\_int variable and copy another previously initialized mp\_int variable into it. The algorithm will -detect when the initialization fails and returns the error to the calling algorithm. As such this algorithm will perform two operations -in one step. - -\index{bn\_mp\_init\_copy.c} -\vspace{+3mm}\begin{small} -\hspace{-5.1mm}{\bf File}: bn\_mp\_init\_copy.c -\vspace{-3mm} -\begin{alltt} -016 -017 /* creates "a" then copies b into it */ -018 int -019 mp_init_copy (mp_int * a, mp_int * b) -020 \{ -021 int res; -022 -023 if ((res = mp_init (a)) != MP_OKAY) \{ -024 return res; -025 \} -026 return mp_copy (b, a); -027 \} -\end{alltt} -\end{small} - -This will initialize \textbf{a} and make it a verbatim copy of the contents of \textbf{b}. Note that -\textbf{a} will have its own memory allocated which means that \textbf{b} may be cleared after the call -and \textbf{a} will be left intact. - -\subsection{Multiple Integer Initializations And Clearings} -Occasionally a function will require a series of mp\_int data types to be made available. The mp\_init\_multi algorithm -is provided to simplify such cases. The purpose of this algorithm is to initialize a variable length array of mp\_int -structures at once. As a result algorithms that require multiple integers only has to use -one algorithm to initialize all the mp\_int variables. - -\begin{figure}[here] -\begin{center} -\begin{tabular}{l} -\hline Algorithm \textbf{mp\_init\_multi}. \\ -\textbf{Input}. Variable length array of mp\_int variables of length $k$. \\ -\textbf{Output}. The array is initialized such that each each mp\_int is ready to use. \\ -\hline \\ -1. for $n$ from 0 to $k - 1$ do \\ -\hspace{+3mm}1.1. Initialize the $n$'th mp\_int (\textit{mp\_init}) \\ -\hspace{+3mm}1.2. If initialization failed then do \\ -\hspace{+6mm}1.2.1. for $j$ from $0$ to $n$ do \\ -\hspace{+9mm}1.2.1.1. Free the $j$'th mp\_int (\textit{mp\_clear}) \\ -\hspace{+6mm}1.2.2. Return(\textit{MP\_MEM}) \\ -2. Return(\textit{MP\_OKAY}) \\ -\hline -\end{tabular} -\end{center} -\caption{Algorithm mp\_init\_multi} -\end{figure} - -\textbf{Algorithm mp\_init\_multi.} -The algorithm will initialize the array of mp\_int variables one at a time. As soon as an runtime error is detected (\textit{step 1.2}) all of -the previously initialized variables are cleared. The goal is an ``all or nothing'' initialization which allows for quick recovery from runtime -errors. - -Similarly to clear a variable length array of mp\_int structures the mp\_clear\_multi algorithm will be used. - -Consider the following snippet which demonstrates how to use both routines. -\begin{small} -\begin{verbatim} -#include -#include -#include -int main(void) -{ - mp_int num1, num2, num3; - int err; - - if ((err = mp_init_multi(&num1, &num2, &num3, NULL)) !- MP_OKAY) { - printf("Error: %d\n", err); - return EXIT_FAILURE; - } - - /* at this point num1/num2/num3 are ready */ - - /* free them */ - mp_clear_multi(&num1, &num2, &num3, NULL); - - return EXIT_SUCCESS; -} -\end{verbatim} -\end{small} - -Note how both lists are terminated with the \textbf{NULL} variable. This indicates to the algorithms to stop fetching parameters off -of the stack. If it is not present the functions will most likely cause a segmentation fault. - -\index{bn\_mp\_multi.c} -\vspace{+3mm}\begin{small} -\hspace{-5.1mm}{\bf File}: bn\_mp\_multi.c -\vspace{-3mm} -\begin{alltt} -016 #include -017 -018 int mp_init_multi(mp_int *mp, ...) -019 \{ -020 mp_err res = MP_OKAY; /* Assume ok until proven otherwise */ -021 int n = 0; /* Number of ok inits */ -022 mp_int* cur_arg = mp; -023 va_list args; -024 -025 va_start(args, mp); /* init args to next argument from caller */ -026 while (cur_arg != NULL) \{ -027 if (mp_init(cur_arg) != MP_OKAY) \{ -028 /* Oops - error! Back-track and mp_clear what we already -029 succeeded in init-ing, then return error. -030 */ -031 va_list clean_args; -032 -033 /* end the current list */ -034 va_end(args); -035 -036 /* now start cleaning up */ -037 cur_arg = mp; -038 va_start(clean_args, mp); -039 while (n--) \{ -040 mp_clear(cur_arg); -041 cur_arg = va_arg(clean_args, mp_int*); -042 \} -043 va_end(clean_args); -044 res = MP_MEM; -045 break; -046 \} -047 n++; -048 cur_arg = va_arg(args, mp_int*); -049 \} -050 va_end(args); -051 return res; /* Assumed ok, if error flagged above. */ -052 \} -053 -054 void mp_clear_multi(mp_int *mp, ...) -055 \{ -056 mp_int* next_mp = mp; -057 va_list args; -058 va_start(args, mp); -059 while (next_mp != NULL) \{ -060 mp_clear(next_mp); -061 next_mp = va_arg(args, mp_int*); -062 \} -063 va_end(args); -064 \} -\end{alltt} -\end{small} - -Both routines are implemented in the same source file since they are typically used in conjunction with each other. - -\section{Maintenance} -A small useful collection of mp\_int maintenance functions will also prove useful. - -\subsection{Augmenting Integer Precision} -When storing a value in an mp\_int sufficient digits must be available to accomodate the entire value without -loss of precision. Quite often the size of the array given by the \textbf{alloc} member is large enough to simply -increase the \textbf{used} digit count. However, when the size of the array is too small it must be re-sized -appropriately to accomodate the result. The mp\_grow algorithm will provide this functionality. - -\begin{figure}[here] -\begin{center} -\begin{tabular}{l} -\hline Algorithm \textbf{mp\_grow}. \\ -\textbf{Input}. An mp\_int $a$ and an integer $b$. \\ -\textbf{Output}. $a$ is expanded to accomodate $b$ digits. \\ -\hline \\ -1. if $a.alloc \ge b$ then return(\textit{MP\_OKAY}) \\ -2. $u \leftarrow b\mbox{ (mod }MP\_PREC\mbox{)}$ \\ -3. $v \leftarrow b + 2 \cdot MP\_PREC - u$ \\ -4. Re-Allocate the array of digits $a$ to size $v$ \\ -5. If the allocation failed then return(\textit{MP\_MEM}). \\ -6. for n from a.alloc to $v - 1$ do \\ -\hspace{+3mm}6.1 $a_n \leftarrow 0$ \\ -7. $a.alloc \leftarrow v$ \\ -8. Return(\textit{MP\_OKAY}) \\ -\hline -\end{tabular} -\end{center} -\caption{Algorithm mp\_grow} -\end{figure} - -\textbf{Algorithm mp\_grow.} -Step one will prevent a re-allocation from being performed if it was not required. This is useful to prevent mp\_ints -from growing excessively in code that erroneously calls mp\_grow. Similar to mp\_init\_size the requested digit count -is padded to provide more digits than requested. - -In step four it is assumed that the reallocation leaves the lower $a.alloc$ digits intact. This is much akin to how the -\textit{realloc} function from the standard C library works. Since the newly allocated digits are assumed to contain -undefined values they are also initially zeroed. - -\index{bn\_mp\_grow.c} -\vspace{+3mm}\begin{small} -\hspace{-5.1mm}{\bf File}: bn\_mp\_grow.c -\vspace{-3mm} -\begin{alltt} -016 -017 /* grow as required */ -018 int -019 mp_grow (mp_int * a, int size) -020 \{ -021 int i; -022 -023 /* if the alloc size is smaller alloc more ram */ -024 if (a->alloc < size) \{ -025 /* ensure there are always at least MP_PREC digits extra on top */ -026 size += (MP_PREC * 2) - (size & (MP_PREC - 1)); -027 -028 a->dp = OPT_CAST realloc (a->dp, sizeof (mp_digit) * size); -029 if (a->dp == NULL) \{ -030 return MP_MEM; -031 \} -032 -033 /* zero excess digits */ -034 i = a->alloc; -035 a->alloc = size; -036 for (; i < a->alloc; i++) \{ -037 a->dp[i] = 0; -038 \} -039 \} -040 return MP_OKAY; -041 \} -\end{alltt} -\end{small} - -The first step is to see if we actually need to perform a re-allocation at all. This is tested for on line -24. Similar to mp\_init\_size the same code on line 26 was used to resize the -digits requested. A simple for loop from line 34 to line 38 will zero all digits that were above the -old \textbf{alloc} limit to make sure the integer is in a known state. - -\subsection{Clamping Excess Digits} -When a function anticipates a result will be $n$ digits it is simpler to assume this is true within the body of -the function. For example, a multiplication of a $i$ digit number by a $j$ digit produces a result of at most -$i + j$ digits. It is entirely possible that the result is $i + j - 1$ though, with no final carry into the last -position. However, suppose the destination had to be first expanded (\textit{via mp\_grow}) to accomodate $i + j - 1$ -digits than further expanded to accomodate the final carry. That would be a considerable waste of time since heap -operations are relatively slow. - -The ideal solution is to always assume the result is $i + j$ and fix up the \textbf{used} count after the function -terminates. This way a single heap operation (\textit{at most}) is required. However, if the result was not checked -there would be an excess high order zero digit. - -For example, suppose the product of two integers was $x_n = (0x_{n-1}x_{n-2}...x_0)_{\beta}$. The leading zero digit -will not contribute to the precision of the result. In fact, through subsequent operations more leading zero digits would -accumulate to the point the size of the integer would be prohibitive. As a result even though the precision is very -low the representation is excessively large. - -The mp\_clamp algorithm is designed to solve this very problem. It will trim leading zeros by decrementing the -\textbf{used} count until a non-zero leading digit is found. Also in this system, zero is considered to be a positive -number which means that if the \textbf{used} count is decremented to zero the sign must be set to \textbf{MP\_ZPOS}. - -\begin{figure}[here] -\begin{center} -\begin{tabular}{l} -\hline Algorithm \textbf{mp\_clamp}. \\ -\textbf{Input}. An mp\_int $a$ \\ -\textbf{Output}. Any excess leading zero digits of $a$ are removed \\ -\hline \\ -1. while $a.used > 0$ and $a_{a.used - 1} = 0$ do \\ -\hspace{+3mm}1.1 $a.used \leftarrow a.used - 1$ \\ -2. if $a.used = 0$ then do \\ -\hspace{+3mm}2.1 $a.sign \leftarrow MP\_ZPOS$ \\ -\hline \\ -\end{tabular} -\end{center} -\caption{Algorithm mp\_clamp} -\end{figure} - -\textbf{Algorithm mp\_clamp.} -As can be expected this algorithm is very simple. The loop on step one is expected to iterate only once or twice at -the most. For example, this will happen in cases where there is not a carry to fill the last position. Step two fixes the sign for -when all of the digits are zero to ensure that the mp\_int is valid at all times. - -\index{bn\_mp\_clamp.c} -\vspace{+3mm}\begin{small} -\hspace{-5.1mm}{\bf File}: bn\_mp\_clamp.c -\vspace{-3mm} -\begin{alltt} -016 -017 /* trim unused digits -018 * -019 * This is used to ensure that leading zero digits are -020 * trimed and the leading "used" digit will be non-zero -021 * Typically very fast. Also fixes the sign if there -022 * are no more leading digits -023 */ -024 void -025 mp_clamp (mp_int * a) -026 \{ -027 while (a->used > 0 && a->dp[a->used - 1] == 0) \{ -028 --(a->used); -029 \} -030 if (a->used == 0) \{ -031 a->sign = MP_ZPOS; -032 \} -033 \} -\end{alltt} -\end{small} - -Note on line 27 how to test for the \textbf{used} count is made on the left of the \&\& operator. In the C programming -language the terms to \&\& are evaluated left to right with a boolean short-circuit if any condition fails. This is -important since if the \textbf{used} is zero the test on the right would fetch below the array. That is obviously -undesirable. The parenthesis on line 28 is used to make sure the \textbf{used} count is decremented and not -the pointer ``a''. - -\section*{Exercises} -\begin{tabular}{cl} -$\left [ 1 \right ]$ & Discuss the relevance of the \textbf{used} member of the mp\_int structure. \\ - & \\ -$\left [ 1 \right ]$ & Discuss the consequences of not using padding when performing allocations. \\ - & \\ -$\left [ 2 \right ]$ & Estimate an ideal value for \textbf{MP\_PREC} when performing 1024-bit RSA \\ - & encryption when $\beta = 2^{28}$. \\ - & \\ -$\left [ 1 \right ]$ & Discuss the relevance of the algorithm mp\_clamp. What does it prevent? \\ - & \\ -$\left [ 1 \right ]$ & Give an example of when the algorithm mp\_init\_copy might be useful. \\ - & \\ -\end{tabular} - - -\chapter{Basic Operations} -\section{Copying an Integer} -After the various house-keeping routines are in place, simple algorithms can be designed to take advantage of them. Being able -to make a verbatim copy of an integer is a very useful function to have. To copy an integer the mp\_copy algorithm will be used. - -\newpage\begin{figure}[here] -\begin{center} -\begin{tabular}{l} -\hline Algorithm \textbf{mp\_copy}. \\ -\textbf{Input}. An mp\_int $a$ and $b$. \\ -\textbf{Output}. Store a copy of $a$ in $b$. \\ -\hline \\ -1. Check if $a$ and $b$ point to the same location in memory. \\ -2. If true then return(\textit{MP\_OKAY}). \\ -3. If $b.alloc < a.used$ then grow $b$ to $a.used$ digits. (\textit{mp\_grow}) \\ -4. If failed to grow then return(\textit{MP\_MEM}). \\ -5. for $n$ from 0 to $a.used - 1$ do \\ -\hspace{3mm}5.1 $b_{n} \leftarrow a_{n}$ \\ -6. if $a.used < b.used - 1$ then \\ -\hspace{3mm}6.1. for $n$ from $a.used$ to $b.used - 1$ do \\ -\hspace{6mm}6.1.1 $b_{n} \leftarrow 0$ \\ -7. $b.used \leftarrow a.used$ \\ -8. $b.sign \leftarrow a.sign$ \\ -9. return(\textit{MP\_OKAY}) \\ -\hline -\end{tabular} -\end{center} -\caption{Algorithm mp\_copy} -\end{figure} - -\textbf{Algorithm mp\_copy.} -Step 1 and 2 make sure that the two mp\_ints are unique. This allows the user to call the copy function with -potentially the same input and not waste time. Step 3 and 4 ensure that the destination is large enough to -hold a copy of the input $a$. Note that the \textbf{used} member of $b$ may be smaller than the \textbf{used} -member of $a$ but a memory re-allocation is only required if the \textbf{alloc} member of $b$ is smaller. This -prevents trivial memory reallocations. - -Step 5 copies the digits from $a$ to $b$ while step 6 ensures that if initially $\vert b \vert > \vert a \vert$, -the more significant digits of $b$ will be zeroed. Finally steps 7 and 8 copies the \textbf{used} and \textbf{sign} members over -which completes the copy operation. - -\index{bn\_mp\_copy.c} -\vspace{+3mm}\begin{small} -\hspace{-5.1mm}{\bf File}: bn\_mp\_copy.c -\vspace{-3mm} -\begin{alltt} -016 -017 /* copy, b = a */ -018 int -019 mp_copy (mp_int * a, mp_int * b) -020 \{ -021 int res, n; -022 -023 /* if dst == src do nothing */ -024 if (a == b) \{ -025 return MP_OKAY; -026 \} -027 -028 /* grow dest */ -029 if ((res = mp_grow (b, a->used)) != MP_OKAY) \{ -030 return res; -031 \} -032 -033 /* zero b and copy the parameters over */ -034 \{ -035 register mp_digit *tmpa, *tmpb; -036 -037 /* pointer aliases */ -038 tmpa = a->dp; -039 tmpb = b->dp; -040 -041 /* copy all the digits */ -042 for (n = 0; n < a->used; n++) \{ -043 *tmpb++ = *tmpa++; -044 \} -045 -046 /* clear high digits */ -047 for (; n < b->used; n++) \{ -048 *tmpb++ = 0; -049 \} -050 \} -051 b->used = a->used; -052 b->sign = a->sign; -053 return MP_OKAY; -054 \} -\end{alltt} -\end{small} - -Source lines 23-31 do the initial house keeping. That is to see if the input is unique and if so to -make sure there is enough room. If not enough space is available it returns the error and leaves the destination variable -intact. - -The inner loop of the copy operation is contained between lines 34 and 50. Many LibTomMath routines are designed with this source code style -in mind, making aliases to shorten lengthy pointers (\textit{see line 38 and 39}) for rapid use. Also the -use of nested braces creates a simple way to denote various portions of code that reside on various work levels. Here, the copy loop is at the -$O(n)$ level. - -\section{Zeroing an Integer} -Reseting an mp\_int to the default state is a common step in many algorithms. The mp\_zero algorithm will be the algorithm used to -perform this task. - -\begin{figure}[here] -\begin{center} -\begin{tabular}{l} -\hline Algorithm \textbf{mp\_zero}. \\ -\textbf{Input}. An mp\_int $a$ \\ -\textbf{Output}. Zero the contents of $a$ \\ -\hline \\ -1. $a.used \leftarrow 0$ \\ -2. $a.sign \leftarrow$ MP\_ZPOS \\ -3. for $n$ from 0 to $a.alloc - 1$ do \\ -\hspace{3mm}3.1 $a_n \leftarrow 0$ \\ -\hline -\end{tabular} -\end{center} -\caption{Algorithm mp\_zero} -\end{figure} - -\textbf{Algorithm mp\_zero.} -This algorithm simply resets a mp\_int to the default state. - -\index{bn\_mp\_zero.c} -\vspace{+3mm}\begin{small} -\hspace{-5.1mm}{\bf File}: bn\_mp\_zero.c -\vspace{-3mm} -\begin{alltt} -016 -017 /* set to zero */ -018 void -019 mp_zero (mp_int * a) -020 \{ -021 a->sign = MP_ZPOS; -022 a->used = 0; -023 memset (a->dp, 0, sizeof (mp_digit) * a->alloc); -024 \} -\end{alltt} -\end{small} - -After the function is completed, all of the digits are zeroed, the \textbf{used} count is zeroed and the -\textbf{sign} variable is set to \textbf{MP\_ZPOS}. - -\section{Sign Manipulation} -\subsection{Absolute Value} -With the mp\_int representation of an integer, calculating the absolute value is trivial. The mp\_abs algorithm will compute -the absolute value of an mp\_int. - -\begin{figure}[here] -\begin{center} -\begin{tabular}{l} -\hline Algorithm \textbf{mp\_abs}. \\ -\textbf{Input}. An mp\_int $a$ \\ -\textbf{Output}. Computes $b = \vert a \vert$ \\ -\hline \\ -1. Copy $a$ to $b$. (\textit{mp\_copy}) \\ -2. If the copy failed return(\textit{MP\_MEM}). \\ -3. $b.sign \leftarrow MP\_ZPOS$ \\ -4. Return(\textit{MP\_OKAY}) \\ -\hline -\end{tabular} -\end{center} -\caption{Algorithm mp\_abs} -\end{figure} - -\textbf{Algorithm mp\_abs.} -This algorithm computes the absolute of an mp\_int input. As can be expected the algorithm is very trivial. - -\index{bn\_mp\_abs.c} -\vspace{+3mm}\begin{small} -\hspace{-5.1mm}{\bf File}: bn\_mp\_abs.c -\vspace{-3mm} -\begin{alltt} -016 -017 /* b = |a| -018 * -019 * Simple function copies the input and fixes the sign to positive -020 */ -021 int -022 mp_abs (mp_int * a, mp_int * b) -023 \{ -024 int res; -025 if ((res = mp_copy (a, b)) != MP_OKAY) \{ -026 return res; -027 \} -028 b->sign = MP_ZPOS; -029 return MP_OKAY; -030 \} -\end{alltt} -\end{small} - -\subsection{Integer Negation} -With the mp\_int representation of an integer, calculating the negation is also trivial. The mp\_neg algorithm will compute -the negative of an mp\_int input. - -\newpage\begin{figure}[here] -\begin{center} -\begin{tabular}{l} -\hline Algorithm \textbf{mp\_neg}. \\ -\textbf{Input}. An mp\_int $a$ \\ -\textbf{Output}. Computes $b = -a$ \\ -\hline \\ -1. Copy $a$ to $b$. (\textit{mp\_copy}) \\ -2. If the copy failed return(\textit{MP\_MEM}). \\ -3. If $a.sign = MP\_ZPOS$ then do \\ -\hspace{3mm}3.1 $b.sign = MP\_NEG$. \\ -4. else do \\ -\hspace{3mm}4.1 $b.sign = MP\_ZPOS$. \\ -5. Return(\textit{MP\_OKAY}) \\ -\hline -\end{tabular} -\end{center} -\caption{Algorithm mp\_neg} -\end{figure} - -\textbf{Algorithm mp\_neg.} -This algorithm computes the negation of an input. - -\index{bn\_mp\_neg.c} -\vspace{+3mm}\begin{small} -\hspace{-5.1mm}{\bf File}: bn\_mp\_neg.c -\vspace{-3mm} -\begin{alltt} -016 -017 /* b = -a */ -018 int -019 mp_neg (mp_int * a, mp_int * b) -020 \{ -021 int res; -022 if ((res = mp_copy (a, b)) != MP_OKAY) \{ -023 return res; -024 \} -025 b->sign = (a->sign == MP_ZPOS) ? MP_NEG : MP_ZPOS; -026 return MP_OKAY; -027 \} -\end{alltt} -\end{small} - -\section{Small Constants} -\subsection{Setting Small Constants} -Often a mp\_int must be set to a relatively small value such as $1$ or $2$. For these cases the mp\_set algorithm is useful. - -\newpage\begin{figure} -\begin{center} -\begin{tabular}{l} -\hline Algorithm \textbf{mp\_set}. \\ -\textbf{Input}. An mp\_int $a$ and a digit $b$ \\ -\textbf{Output}. Make $a$ equivalent to $b$ \\ -\hline \\ -1. Zero $a$ (\textit{mp\_zero}). \\ -2. $a_0 \leftarrow b \mbox{ (mod }\beta\mbox{)}$ \\ -3. $a.used \leftarrow \left \lbrace \begin{array}{ll} - 1 & \mbox{if }a_0 > 0 \\ - 0 & \mbox{if }a_0 = 0 - \end{array} \right .$ \\ -\hline -\end{tabular} -\end{center} -\caption{Algorithm mp\_set} -\end{figure} - -\textbf{Algorithm mp\_set.} -This algorithm sets a mp\_int to a small single digit value. Step number 1 ensures that the integer is reset to the default state. The -single digit is set (\textit{modulo $\beta$}) and the \textbf{used} count is adjusted accordingly. - -\index{bn\_mp\_set.c} -\vspace{+3mm}\begin{small} -\hspace{-5.1mm}{\bf File}: bn\_mp\_set.c -\vspace{-3mm} -\begin{alltt} -016 -017 /* set to a digit */ -018 void -019 mp_set (mp_int * a, mp_digit b) -020 \{ -021 mp_zero (a); -022 a->dp[0] = b & MP_MASK; -023 a->used = (a->dp[0] != 0) ? 1 : 0; -024 \} -\end{alltt} -\end{small} - -Line 21 calls mp\_zero() to clear the mp\_int and reset the sign. Line 22 copies the digit -into the least significant location. Note the usage of a new constant \textbf{MP\_MASK}. This constant is used to quickly -reduce an integer modulo $\beta$. Since $\beta$ is of the form $2^k$ for any suitable $k$ it suffices to perform a binary AND with -$MP\_MASK = 2^k - 1$ to perform the reduction. Finally line 23 will set the \textbf{used} member with respect to the -digit actually set. This function will always make the integer positive. - -One important limitation of this function is that it will only set one digit. The size of a digit is not fixed, meaning source that uses -this function should take that into account. Meaning that only trivially small constants can be set using this function. - -\subsection{Setting Large Constants} -To overcome the limitations of the mp\_set algorithm the mp\_set\_int algorithm is provided. It accepts a ``long'' -data type as input and will always treat it as a 32-bit integer. - -\begin{figure}[here] -\begin{center} -\begin{tabular}{l} -\hline Algorithm \textbf{mp\_set\_int}. \\ -\textbf{Input}. An mp\_int $a$ and a ``long'' integer $b$ \\ -\textbf{Output}. Make $a$ equivalent to $b$ \\ -\hline \\ -1. Zero $a$ (\textit{mp\_zero}) \\ -2. for $n$ from 0 to 7 do \\ -\hspace{3mm}2.1 $a \leftarrow a \cdot 16$ (\textit{mp\_mul2d}) \\ -\hspace{3mm}2.2 $u \leftarrow \lfloor b / 2^{4(7 - n)} \rfloor \mbox{ (mod }16\mbox{)}$\\ -\hspace{3mm}2.3 $a_0 \leftarrow a_0 + u$ \\ -\hspace{3mm}2.4 $a.used \leftarrow a.used + 1$ \\ -3. Clamp excess used digits (\textit{mp\_clamp}) \\ -\hline -\end{tabular} -\end{center} -\caption{Algorithm mp\_set\_int} -\end{figure} - -\textbf{Algorithm mp\_set\_int.} -The algorithm performs eight iterations of a simple loop where in each iteration four bits from the source are added to the -mp\_int. Step 2.1 will multiply the current result by sixteen making room for four more bits in the less significant positions. In step 2.2 the -next four bits from the source are extracted and are added to the mp\_int. The \textbf{used} digit count is -incremented to reflect the addition. The \textbf{used} digit counter is incremented since if any of the leading digits were zero the mp\_int would have -zero digits used and the newly added four bits would be ignored. - -Excess zero digits are trimmed in steps 2.1 and 3 by using higher level algorithms mp\_mul2d and mp\_clamp. - -\index{bn\_mp\_set\_int.c} -\vspace{+3mm}\begin{small} -\hspace{-5.1mm}{\bf File}: bn\_mp\_set\_int.c -\vspace{-3mm} -\begin{alltt} -016 -017 /* set a 32-bit const */ -018 int -019 mp_set_int (mp_int * a, unsigned int b) -020 \{ -021 int x, res; -022 -023 mp_zero (a); -024 /* set four bits at a time */ -025 for (x = 0; x < 8; x++) \{ -026 /* shift the number up four bits */ -027 if ((res = mp_mul_2d (a, 4, a)) != MP_OKAY) \{ -028 return res; -029 \} -030 -031 /* OR in the top four bits of the source */ -032 a->dp[0] |= (b >> 28) & 15; -033 -034 /* shift the source up to the next four bits */ -035 b <<= 4; -036 -037 /* ensure that digits are not clamped off */ -038 a->used += 1; -039 \} -040 mp_clamp (a); -041 return MP_OKAY; -042 \} -\end{alltt} -\end{small} - -This function sets four bits of the number at a time to handle all practical \textbf{DIGIT\_BIT} sizes. The weird -addition on line 38 ensures that the newly added in bits are added to the number of digits. While it may not -seem obvious as to why the digit counter does not grow exceedingly large it is because of the shift on line 27 -as well as the call to mp\_clamp() on line 40. Both functions will clamp excess leading digits which keeps -the number of used digits low. - -\section{Comparisons} -\subsection{Unsigned Comparisions} -Comparing a multiple precision integer is performed with the exact same algorithm used to compare two decimal numbers. For example, -to compare $1,234$ to $1,264$ the digits are extracted by their positions. That is we compare $1 \cdot 10^3 + 2 \cdot 10^2 + 3 \cdot 10^1 + 4 \cdot 10^0$ -to $1 \cdot 10^3 + 2 \cdot 10^2 + 6 \cdot 10^1 + 4 \cdot 10^0$ by comparing single digits at a time starting with the highest magnitude -positions. If any leading digit of one integer is greater than a digit in the same position of another integer then obviously it must be greater. - -The first comparision routine that will be developed is the unsigned magnitude compare which will perform a comparison based on the digits of two -mp\_int variables alone. It will ignore the sign of the two inputs. Such a function is useful when an absolute comparison is required or if the -signs are known to agree in advance. - -To facilitate working with the results of the comparison functions three constants are required. - -\begin{figure}[here] -\begin{center} -\begin{tabular}{|r|l|} -\hline \textbf{Constant} & \textbf{Meaning} \\ -\hline \textbf{MP\_GT} & Greater Than \\ -\hline \textbf{MP\_EQ} & Equal To \\ -\hline \textbf{MP\_LT} & Less Than \\ -\hline -\end{tabular} -\end{center} -\caption{Comparison Return Codes} -\end{figure} - -\begin{figure}[here] -\begin{center} -\begin{tabular}{l} -\hline Algorithm \textbf{mp\_cmp\_mag}. \\ -\textbf{Input}. Two mp\_ints $a$ and $b$. \\ -\textbf{Output}. Unsigned comparison results ($a$ to the left of $b$). \\ -\hline \\ -1. If $a.used > b.used$ then return(\textit{MP\_GT}) \\ -2. If $a.used < b.used$ then return(\textit{MP\_LT}) \\ -3. for n from $a.used - 1$ to 0 do \\ -\hspace{+3mm}3.1 if $a_n > b_n$ then return(\textit{MP\_GT}) \\ -\hspace{+3mm}3.2 if $a_n < b_n$ then return(\textit{MP\_LT}) \\ -4. Return(\textit{MP\_EQ}) \\ -\hline -\end{tabular} -\end{center} -\caption{Algorithm mp\_cmp\_mag} -\end{figure} - -\textbf{Algorithm mp\_cmp\_mag.} -By saying ``$a$ to the left of $b$'' it is meant that the comparison is with respect to $a$, that is if $a$ is greater than $b$ it will return -\textbf{MP\_GT} and similar with respect to when $a = b$ and $a < b$. The first two steps compare the number of digits used in both $a$ and $b$. -Obviously if the digit counts differ there would be an imaginary zero digit in the smaller number where the leading digit of the larger number is. -If both have the same number of digits than the actual digits themselves must be compared starting at the leading digit. - -By step three both inputs must have the same number of digits so its safe to start from either $a.used - 1$ or $b.used - 1$ and count down to -the zero'th digit. If after all of the digits have been compared, no difference is found, the algorithm returns \textbf{MP\_EQ}. - -\index{bn\_mp\_cmp\_mag.c} -\vspace{+3mm}\begin{small} -\hspace{-5.1mm}{\bf File}: bn\_mp\_cmp\_mag.c -\vspace{-3mm} -\begin{alltt} -016 -017 /* compare maginitude of two ints (unsigned) */ -018 int -019 mp_cmp_mag (mp_int * a, mp_int * b) -020 \{ -021 int n; -022 -023 /* compare based on # of non-zero digits */ -024 if (a->used > b->used) \{ -025 return MP_GT; -026 \} -027 -028 if (a->used < b->used) \{ -029 return MP_LT; -030 \} -031 -032 /* compare based on digits */ -033 for (n = a->used - 1; n >= 0; n--) \{ -034 if (a->dp[n] > b->dp[n]) \{ -035 return MP_GT; -036 \} -037 -038 if (a->dp[n] < b->dp[n]) \{ -039 return MP_LT; -040 \} -041 \} -042 return MP_EQ; -043 \} -\end{alltt} -\end{small} - -The two if statements on lines 24 and 28 compare the number of digits in the two inputs. These two are performed before all of the digits -are compared since it is a very cheap test to perform and can potentially save considerable time. The implementation given is also not valid -without those two statements. $b.alloc$ may be smaller than $a.used$, meaning that undefined values will be read from $b$ past the end of the -array of digits. - -\subsection{Signed Comparisons} -Comparing with sign considerations is also fairly critical in several routines (\textit{division for example}). Based on an unsigned magnitude -comparison a trivial signed comparison algorithm can be written. - -\begin{figure}[here] -\begin{center} -\begin{tabular}{l} -\hline Algorithm \textbf{mp\_cmp}. \\ -\textbf{Input}. Two mp\_ints $a$ and $b$ \\ -\textbf{Output}. Signed Comparison Results ($a$ to the left of $b$) \\ -\hline \\ -1. if $a.sign = MP\_NEG$ and $b.sign = MP\_ZPOS$ then return(\textit{MP\_LT}) \\ -2. if $a.sign = MP\_ZPOS$ and $b.sign = MP\_NEG$ then return(\textit{MP\_GT}) \\ -3. if $a.sign = MP\_NEG$ then \\ -\hspace{+3mm}3.1 Return the unsigned comparison of $b$ and $a$ (\textit{mp\_cmp\_mag}) \\ -4 Otherwise \\ -\hspace{+3mm}4.1 Return the unsigned comparison of $a$ and $b$ \\ -\hline -\end{tabular} -\end{center} -\caption{Algorithm mp\_cmp} -\end{figure} - -\textbf{Algorithm mp\_cmp.} -The first two steps compare the signs of the two inputs. If the signs do not agree then it can return right away with the appropriate -comparison code. When the signs are equal the digits of the inputs must be compared to determine the correct result. In step -three the unsigned comparision flips the order of the arguments since they are both negative. For instance, if $-a > -b$ then -$\vert a \vert < \vert b \vert$. Step number four will compare the two when they are both positive. - -\index{bn\_mp\_cmp.c} -\vspace{+3mm}\begin{small} -\hspace{-5.1mm}{\bf File}: bn\_mp\_cmp.c -\vspace{-3mm} -\begin{alltt} -016 -017 /* compare two ints (signed)*/ -018 int -019 mp_cmp (mp_int * a, mp_int * b) -020 \{ -021 /* compare based on sign */ -022 if (a->sign == MP_NEG && b->sign == MP_ZPOS) \{ -023 return MP_LT; -024 \} -025 -026 if (a->sign == MP_ZPOS && b->sign == MP_NEG) \{ -027 return MP_GT; -028 \} -029 -030 /* compare digits */ -031 if (a->sign == MP_NEG) \{ -032 /* if negative compare opposite direction */ -033 return mp_cmp_mag(b, a); -034 \} else \{ -035 return mp_cmp_mag(a, b); -036 \} -037 \} -\end{alltt} -\end{small} - -The two if statements on lines 22 and 26 perform the initial sign comparison. If the signs are not the equal then which ever -has the positive sign is larger. At line 31, the inputs are compared based on magnitudes. If the signs were both negative then -the unsigned comparison is performed in the opposite direction (\textit{line 33}). Otherwise, the signs are assumed to -be both positive and a forward direction unsigned comparison is performed. - -\section*{Exercises} -\begin{tabular}{cl} -$\left [ 2 \right ]$ & Modify algorithm mp\_set\_int to accept as input a variable length array of bits. \\ - & \\ -$\left [ 3 \right ]$ & Give the probability that algorithm mp\_cmp\_mag will have to compare $k$ digits \\ - & of two random digits (of equal magnitude) before a difference is found. \\ - & \\ -$\left [ 1 \right ]$ & Suggest a simple method to speed up the implementation of mp\_cmp\_mag based \\ - & on the observations made in the previous problem. \\ - & -\end{tabular} - -\chapter{Basic Arithmetic} -\section{Building Blocks} -At this point algorithms for initialization, clearing, zeroing, copying, comparing and setting small constants have been -established. The next logical set of algorithms to develop are addition, subtraction and digit shifting algorithms. These -algorithms make use of the lower level algorithms and are the cruicial building block for the multiplication algorithms. It is very important -that these algorithms are highly optimized. On their own they are simple $O(n)$ algorithms but they can be called from higher level algorithms -which easily places them at $O(n^2)$ or even $O(n^3)$ work levels. - -All nine algorithms within this chapter make use of the logical bit shift operations denoted by $<<$ and $>>$ for left and right -logical shifts respectively. A logical shift is analogous to sliding the decimal point of radix-10 representations. For example, the real -number $0.9345$ is equivalent to $93.45\%$ which is found by sliding the the decimal two places to the right (\textit{multiplying by $10^2$}). -Mathematically a logical shift is equivalent to a division or multiplication by a power of two. -For example, $a << k = a \cdot 2^k$ while $a >> k = \lfloor a/2^k \rfloor$. - -One significant difference between a logical shift and the way decimals are shifted is that digits below the zero'th position are removed -from the number. For example, consider $1101_2 >> 1$ using decimal notation this would produce $110.1_2$. However, with a logical shift the -result is $110_2$. - -\section{Addition and Subtraction} -In normal fixed precision arithmetic negative numbers are easily represented by subtraction from the modulus. For example, with 32-bit integers -$a - b\mbox{ (mod }2^{32}\mbox{)}$ is the same as $a + (2^{32} - b) \mbox{ (mod }2^{32}\mbox{)}$ since $2^{32} \equiv 0 \mbox{ (mod }2^{32}\mbox{)}$. -As a result subtraction can be performed with a trivial series of logical operations and an addition. - -However, in multiple precision arithmetic negative numbers are not represented in the same way. Instead a sign flag is used to keep track of the -sign of the integer. As a result signed addition and subtraction are actually implemented as conditional usage of lower level addition or -subtraction algorithms with the sign fixed up appropriately. - -The lower level algorithms will add or subtract integers without regard to the sign flag. That is they will add or subtract the magnitude of -the integers respectively. - -\subsection{Low Level Addition} -An unsigned addition of multiple precision integers is performed with the same long-hand algorithm used to add decimal numbers. That is to add the -trailing digits first and propagate the resulting carry upwards. Since this is a lower level algorithm the name will have a ``s\_'' prefix. -Historically that convention stems from the MPI library where ``s\_'' stood for static functions that were hidden from the developer entirely. - -\newpage -\begin{figure}[!here] -\begin{center} -\begin{small} -\begin{tabular}{l} -\hline Algorithm \textbf{s\_mp\_add}. \\ -\textbf{Input}. Two mp\_ints $a$ and $b$ \\ -\textbf{Output}. The unsigned addition $c = \vert a \vert + \vert b \vert$. \\ -\hline \\ -1. if $a.used > b.used$ then \\ -\hspace{+3mm}1.1 $min \leftarrow b.used$ \\ -\hspace{+3mm}1.2 $max \leftarrow a.used$ \\ -\hspace{+3mm}1.3 $x \leftarrow a$ \\ -2. else \\ -\hspace{+3mm}2.1 $min \leftarrow a.used$ \\ -\hspace{+3mm}2.2 $max \leftarrow b.used$ \\ -\hspace{+3mm}2.3 $x \leftarrow b$ \\ -3. If $c.alloc < max + 1$ then grow $c$ to hold at least $max + 1$ digits (\textit{mp\_grow}) \\ -4. If failed to grow $c$ return(\textit{MP\_MEM}) \\ -5. $oldused \leftarrow c.used$ \\ -6. $c.used \leftarrow max + 1$ \\ -7. $u \leftarrow 0$ \\ -8. for $n$ from $0$ to $min - 1$ do \\ -\hspace{+3mm}8.1 $c_n \leftarrow a_n + b_n + u$ \\ -\hspace{+3mm}8.2 $u \leftarrow c_n >> lg(\beta)$ \\ -\hspace{+3mm}8.3 $c_n \leftarrow c_n \mbox{ (mod }\beta\mbox{)}$ \\ -9. if $min \ne max$ then do \\ -\hspace{+3mm}9.1 for $n$ from $min$ to $max - 1$ do \\ -\hspace{+6mm}9.1.1 $c_n \leftarrow x_n + u$ \\ -\hspace{+6mm}9.1.2 $u \leftarrow c_n >> lg(\beta)$ \\ -\hspace{+6mm}9.1.3 $c_n \leftarrow c_n \mbox{ (mod }\beta\mbox{)}$ \\ -10. $c_{max} \leftarrow u$ \\ -11. if $olduse > max$ then \\ -\hspace{+3mm}11.1 for $n$ from $max + 1$ to $olduse - 1$ do \\ -\hspace{+6mm}11.1.1 $c_n \leftarrow 0$ \\ -12. Clamp excess digits in $c$. (\textit{mp\_clamp}) \\ -13. Return(\textit{MP\_OKAY}) \\ -\hline -\end{tabular} -\end{small} -\end{center} -\caption{Algorithm s\_mp\_add} -\end{figure} - -\textbf{Algorithm s\_mp\_add.} -This algorithm is loosely based on algorithm 14.7 of HAC \cite[pp. 594]{HAC} but has been extended to allow the inputs to have different magnitudes. -Coincidentally the description of algorithm A in Knuth \cite[pp. 266]{TAOCPV2} shares the same deficiency as the algorithm from \cite{HAC}. Even the -MIX pseudo machine code presented by Knuth \cite[pp. 266-267]{TAOCPV2} is incapable of handling inputs which are of different magnitudes. - -Steps 1 and 2 will sort the two inputs based on their \textbf{used} digit count. This allows the inputs to have varying magnitudes which not -only makes it more efficient than the trivial algorithm presented in the references but more flexible. The variable $min$ is given the lowest -digit count while $max$ is given the highest digit count. If both inputs have the same \textbf{used} digit count both $min$ and $max$ are -set to the same value. The variable $x$ is an \textit{alias} for the largest input and not meant to be a copy of it. After the inputs are sorted, -steps 3 and 4 will ensure that the destination $c$ can accommodate the result. The old \textbf{used} count from $c$ is copied to -$oldused$ so that excess digits can be cleared later, and the new \textbf{used} count is set to $max+1$, so that a carry from the most significant -word can be handled. - -At step 7 the carry variable $u$ is set to zero and the first part of the addition loop can begin. The first step of the loop (\textit{8.1}) adds -digits from the two inputs together along with the carry variable $u$. The following step extracts the carry bit by shifting the result of the -preceding step right by $lg(\beta)$ positions. The shift to extract the carry is similar to how carry extraction works with decimal addition. - -Consider adding $77$ to $65$, the first addition of the first column is $7 + 5$ which produces the result $12$. The trailing digit of the result -is $2 \equiv 12 \mbox{ (mod }10\mbox{)}$ and the carry is found by dividing (\textit{and ignoring the remainder}) $12$ by the radix or in this case $10$. The -division and multiplication of $10$ is simply a logical right or left shift, respectively, of the digits. In otherwords the carry can be extracted -by shifting one digit to the right. - -Note that $lg()$ is simply the base two logarithm such that $lg(2^k) = k$. This implies that $lg(\beta)$ is the number of bits in a radix-$\beta$ -digit. Therefore, a logical shift right of the summand by $lg(\beta)$ will extract the carry. The final step of the loop reduces the digit -modulo the radix $\beta$ to ensure it is in range. - -After step 8 the smallest input (\textit{or both if they are the same magnitude}) has been exhausted. Step 9 decides whether -the inputs were of equal magnitude. If not than another loop similar to that in step 8, must be executed. The loop at step -number 9.1 differs from the previous loop since it only adds the mp\_int $x$ along with the carry. - -Step 10 finishes the addition phase by copying the final carry to the highest location in the result $c_{max}$. Step 11 ensures that -leading digits that were originally present in $c$ are cleared. Finally excess leading digits are clamped and the algorithm returns success. - -\index{bn\_s\_mp\_add.c} -\vspace{+3mm}\begin{small} -\hspace{-5.1mm}{\bf File}: bn\_s\_mp\_add.c -\vspace{-3mm} -\begin{alltt} -016 -017 /* low level addition, based on HAC pp.594, Algorithm 14.7 */ -018 int -019 s_mp_add (mp_int * a, mp_int * b, mp_int * c) -020 \{ -021 mp_int *x; -022 int olduse, res, min, max; -023 -024 /* find sizes, we let |a| <= |b| which means we have to sort -025 * them. "x" will point to the input with the most digits -026 */ -027 if (a->used > b->used) \{ -028 min = b->used; -029 max = a->used; -030 x = a; -031 \} else \{ -032 min = a->used; -033 max = b->used; -034 x = b; -035 \} -036 -037 /* init result */ -038 if (c->alloc < max + 1) \{ -039 if ((res = mp_grow (c, max + 1)) != MP_OKAY) \{ -040 return res; -041 \} -042 \} -043 -044 /* get old used digit count and set new one */ -045 olduse = c->used; -046 c->used = max + 1; -047 -048 \{ -049 register mp_digit u, *tmpa, *tmpb, *tmpc; -050 register int i; -051 -052 /* alias for digit pointers */ -053 -054 /* first input */ -055 tmpa = a->dp; -056 -057 /* second input */ -058 tmpb = b->dp; -059 -060 /* destination */ -061 tmpc = c->dp; -062 -063 /* zero the carry */ -064 u = 0; -065 for (i = 0; i < min; i++) \{ -066 /* Compute the sum at one digit, T[i] = A[i] + B[i] + U */ -067 *tmpc = *tmpa++ + *tmpb++ + u; -068 -069 /* U = carry bit of T[i] */ -070 u = *tmpc >> ((mp_digit)DIGIT_BIT); -071 -072 /* take away carry bit from T[i] */ -073 *tmpc++ &= MP_MASK; -074 \} -075 -076 /* now copy higher words if any, that is in A+B -077 * if A or B has more digits add those in -078 */ -079 if (min != max) \{ -080 for (; i < max; i++) \{ -081 /* T[i] = X[i] + U */ -082 *tmpc = x->dp[i] + u; -083 -084 /* U = carry bit of T[i] */ -085 u = *tmpc >> ((mp_digit)DIGIT_BIT); -086 -087 /* take away carry bit from T[i] */ -088 *tmpc++ &= MP_MASK; -089 \} -090 \} -091 -092 /* add carry */ -093 *tmpc++ = u; -094 -095 /* clear digits above oldused */ -096 for (i = c->used; i < olduse; i++) \{ -097 *tmpc++ = 0; -098 \} -099 \} -100 -101 mp_clamp (c); -102 return MP_OKAY; -103 \} -\end{alltt} -\end{small} - -Lines 27 to 35 perform the initial sorting of the inputs and determine the $min$ and $max$ variables. Note that $x$ is a pointer to a -mp\_int assigned to the largest input, in effect it is a local alias. Lines 37 to 42 ensure that the destination is grown to -accomodate the result of the addition. - -Similar to the implementation of mp\_copy this function uses the braced code and local aliases coding style. The three aliases that are on -lines 55, 58 and 61 represent the two inputs and destination variables respectively. These aliases are used to ensure the -compiler does not have to dereference $a$, $b$ or $c$ (respectively) to access the digits of the respective mp\_int. - -The initial carry $u$ is cleared on line 64, note that $u$ is of type mp\_digit which ensures type compatibility within the -implementation. The initial addition loop begins on line 65 and ends on line 74. Similarly the conditional addition loop -begins on line 80 and ends on line 90. The addition is finished with the final carry being stored in $tmpc$ on line 93. -Note the ``++'' operator on the same line. After line 93 $tmpc$ will point to the $c.used$'th digit of the mp\_int $c$. This is useful -for the next loop on lines 96 to 99 which set any old upper digits to zero. - -\subsection{Low Level Subtraction} -The low level unsigned subtraction algorithm is very similar to the low level unsigned addition algorithm. The principle difference is that the -unsigned subtraction algorithm requires the result to be positive. That is when computing $a - b$ the condition $\vert a \vert \ge \vert b\vert$ must -be met for this algorithm to function properly. Keep in mind this low level algorithm is not meant to be used in higher level algorithms directly. -This algorithm as will be shown can be used to create functional signed addition and subtraction algorithms. - - -For this algorithm a new variable is required to make the description simpler. Recall from section 1.3.1 that a mp\_digit must be able to represent -the range $0 \le x < 2\beta$ for the algorithms to work correctly. However, it is allowable that a mp\_digit represent a larger range of values. For -this algorithm we will assume that the variable $\gamma$ represents the number of bits available in a -mp\_digit (\textit{this implies $2^{\gamma} > \beta$}). - -For example, the default for LibTomMath is to use a ``unsigned long'' for the mp\_digit ``type'' while $\beta = 2^{28}$. In ISO C an ``unsigned long'' -data type must be able to represent $0 \le x < 2^{32}$ meaning that in this case $\gamma = 32$. - -\newpage\begin{figure}[!here] -\begin{center} -\begin{small} -\begin{tabular}{l} -\hline Algorithm \textbf{s\_mp\_sub}. \\ -\textbf{Input}. Two mp\_ints $a$ and $b$ ($\vert a \vert \ge \vert b \vert$) \\ -\textbf{Output}. The unsigned subtraction $c = \vert a \vert - \vert b \vert$. \\ -\hline \\ -1. $min \leftarrow b.used$ \\ -2. $max \leftarrow a.used$ \\ -3. If $c.alloc < max$ then grow $c$ to hold at least $max$ digits. (\textit{mp\_grow}) \\ -4. If the reallocation failed return(\textit{MP\_MEM}). \\ -5. $oldused \leftarrow c.used$ \\ -6. $c.used \leftarrow max$ \\ -7. $u \leftarrow 0$ \\ -8. for $n$ from $0$ to $min - 1$ do \\ -\hspace{3mm}8.1 $c_n \leftarrow a_n - b_n - u$ \\ -\hspace{3mm}8.2 $u \leftarrow c_n >> (\gamma - 1)$ \\ -\hspace{3mm}8.3 $c_n \leftarrow c_n \mbox{ (mod }\beta\mbox{)}$ \\ -9. if $min < max$ then do \\ -\hspace{3mm}9.1 for $n$ from $min$ to $max - 1$ do \\ -\hspace{6mm}9.1.1 $c_n \leftarrow a_n - u$ \\ -\hspace{6mm}9.1.2 $u \leftarrow c_n >> (\gamma - 1)$ \\ -\hspace{6mm}9.1.3 $c_n \leftarrow c_n \mbox{ (mod }\beta\mbox{)}$ \\ -10. if $oldused > max$ then do \\ -\hspace{3mm}10.1 for $n$ from $max$ to $oldused - 1$ do \\ -\hspace{6mm}10.1.1 $c_n \leftarrow 0$ \\ -11. Clamp excess digits of $c$. (\textit{mp\_clamp}). \\ -12. Return(\textit{MP\_OKAY}). \\ -\hline -\end{tabular} -\end{small} -\end{center} -\caption{Algorithm s\_mp\_sub} -\end{figure} - -\textbf{Algorithm s\_mp\_sub.} -This algorithm performs the unsigned subtraction of two mp\_int variables under the restriction that the result must be positive. That is when -passing variables $a$ and $b$ the condition that $\vert a \vert \ge \vert b \vert$ must be met for the algorithm to function correctly. This -algorithm is loosely based on algorithm 14.9 \cite[pp. 595]{HAC} and is similar to algorithm S in \cite[pp. 267]{TAOCPV2} as well. As was the case -of the algorithm s\_mp\_add both other references lack discussion concerning various practical details such as when the inputs differ in magnitude. - -The initial sorting of the inputs is trivial in this algorithm since $a$ is guaranteed to have at least the same magnitude of $b$. Steps 1 and 2 -set the $min$ and $max$ variables. Unlike the addition routine there is guaranteed to be no carry which means that the final result can be at -most $max$ digits in length as opposed to $max + 1$. Similar to the addition algorithm the \textbf{used} count of $c$ is copied locally and -set to the maximal count for the operation. - -The subtraction loop that begins on step 8 is essentially the same as the addition loop of algorithm s\_mp\_add except single precision -subtraction is used instead. Note the use of the $\gamma$ variable to extract the carry (\textit{also known as the borrow}) within the subtraction -loops. Under the assumption that two's complement single precision arithmetic is used this will successfully extract the desired carry. - -For example, consider subtracting $0101_2$ from $0100_2$ where $\gamma = 4$ and $\beta = 2$. The least significant bit will force a carry upwards to -the third bit which will be set to zero after the borrow. After the very first bit has been subtracted $4 - 1 \equiv 0011_2$ will remain, When the -third bit of $0101_2$ is subtracted from the result it will cause another carry. In this case though the carry will be forced to propagate all the -way to the most significant bit. - -Recall that $\beta < 2^{\gamma}$. This means that if a carry does occur just before the $lg(\beta)$'th bit it will propagate all the way to the most -significant bit. Thus, the high order bits of the mp\_digit that are not part of the actual digit will either be all zero, or all one. All that -is needed is a single zero or one bit for the carry. Therefore a single logical shift right by $\gamma - 1$ positions is sufficient to extract the -carry. This method of carry extraction may seem awkward but the reason for it becomes apparent when the implementation is discussed. - -If $b$ has a smaller magnitude than $a$ then step 9 will force the carry and copy operation to propagate through the larger input $a$ into $c$. Step -10 will ensure that any leading digits of $c$ above the $max$'th position are zeroed. - -\index{bn\_s\_mp\_sub.c} -\vspace{+3mm}\begin{small} -\hspace{-5.1mm}{\bf File}: bn\_s\_mp\_sub.c -\vspace{-3mm} -\begin{alltt} -016 -017 /* low level subtraction (assumes |a| > |b|), HAC pp.595 Algorithm 14.9 */ -018 int -019 s_mp_sub (mp_int * a, mp_int * b, mp_int * c) -020 \{ -021 int olduse, res, min, max; -022 -023 /* find sizes */ -024 min = b->used; -025 max = a->used; -026 -027 /* init result */ -028 if (c->alloc < max) \{ -029 if ((res = mp_grow (c, max)) != MP_OKAY) \{ -030 return res; -031 \} -032 \} -033 olduse = c->used; -034 c->used = max; -035 -036 \{ -037 register mp_digit u, *tmpa, *tmpb, *tmpc; -038 register int i; -039 -040 /* alias for digit pointers */ -041 tmpa = a->dp; -042 tmpb = b->dp; -043 tmpc = c->dp; -044 -045 /* set carry to zero */ -046 u = 0; -047 for (i = 0; i < min; i++) \{ -048 /* T[i] = A[i] - B[i] - U */ -049 *tmpc = *tmpa++ - *tmpb++ - u; -050 -051 /* U = carry bit of T[i] -052 * Note this saves performing an AND operation since -053 * if a carry does occur it will propagate all the way to the -054 * MSB. As a result a single shift is enough to get the carry -055 */ -056 u = *tmpc >> ((mp_digit)(CHAR_BIT * sizeof (mp_digit) - 1)); -057 -058 /* Clear carry from T[i] */ -059 *tmpc++ &= MP_MASK; -060 \} -061 -062 /* now copy higher words if any, e.g. if A has more digits than B */ -063 for (; i < max; i++) \{ -064 /* T[i] = A[i] - U */ -065 *tmpc = *tmpa++ - u; -066 -067 /* U = carry bit of T[i] */ -068 u = *tmpc >> ((mp_digit)(CHAR_BIT * sizeof (mp_digit) - 1)); -069 -070 /* Clear carry from T[i] */ -071 *tmpc++ &= MP_MASK; -072 \} -073 -074 /* clear digits above used (since we may not have grown result above) */ - -075 for (i = c->used; i < olduse; i++) \{ -076 *tmpc++ = 0; -077 \} -078 \} -079 -080 mp_clamp (c); -081 return MP_OKAY; -082 \} -083 -\end{alltt} -\end{small} - -Line 24 and 25 perform the initial hardcoded sorting of the inputs. In reality the $min$ and $max$ variables are only aliases and are only -used to make the source code easier to read. Again the pointer alias optimization is used within this algorithm. Lines 41, 42 and 43 initialize the aliases for -$a$, $b$ and $c$ respectively. - -The first subtraction loop occurs on lines 46 through 60. The theory behind the subtraction loop is exactly the same as that for -the addition loop. As remarked earlier there is an implementation reason for using the ``awkward'' method of extracting the carry -(\textit{see line 56}). The traditional method for extracting the carry would be to shift by $lg(\beta)$ positions and logically AND -the least significant bit. The AND operation is required because all of the bits above the $\lg(\beta)$'th bit will be set to one after a carry -occurs from subtraction. This carry extraction requires two relatively cheap operations to extract the carry. The other method is to simply -shift the most significant bit to the least significant bit thus extracting the carry with a single cheap operation. This optimization only works on -twos compliment machines which is a safe assumption to make. - -If $a$ has a larger magnitude than $b$ an additional loop (\textit{see lines 63 through 72}) is required to propagate the carry through -$a$ and copy the result to $c$. - -\subsection{High Level Addition} -Now that both lower level addition and subtraction algorithms have been established an effective high level signed addition algorithm can be -established. This high level addition algorithm will be what other algorithms and developers will use to perform addition of mp\_int data -types. - -Recall from section 5.2 that an mp\_int represents an integer with an unsigned mantissa (\textit{the array of digits}) and a \textbf{sign} -flag. A high level addition is actually performed as a series of eight separate cases which can be optimized down to three unique cases. - -\begin{figure}[!here] -\begin{center} -\begin{tabular}{l} -\hline Algorithm \textbf{mp\_add}. \\ -\textbf{Input}. Two mp\_ints $a$ and $b$ \\ -\textbf{Output}. The signed addition $c = a + b$. \\ -\hline \\ -1. if $a.sign = b.sign$ then do \\ -\hspace{3mm}1.1 $c.sign \leftarrow a.sign$ \\ -\hspace{3mm}1.2 $c \leftarrow \vert a \vert + \vert b \vert$ (\textit{s\_mp\_add})\\ -2. else do \\ -\hspace{3mm}2.1 if $\vert a \vert < \vert b \vert$ then do (\textit{mp\_cmp\_mag}) \\ -\hspace{6mm}2.1.1 $c.sign \leftarrow b.sign$ \\ -\hspace{6mm}2.1.2 $c \leftarrow \vert b \vert - \vert a \vert$ (\textit{s\_mp\_sub}) \\ -\hspace{3mm}2.2 else do \\ -\hspace{6mm}2.2.1 $c.sign \leftarrow a.sign$ \\ -\hspace{6mm}2.2.2 $c \leftarrow \vert a \vert - \vert b \vert$ \\ -3. If any of the lower level operations failed return(\textit{MP\_MEM}) \\ -4. Return(\textit{MP\_OKAY}). \\ -\hline -\end{tabular} -\end{center} -\caption{Algorithm mp\_add} -\end{figure} - -\textbf{Algorithm mp\_add.} -This algorithm performs the signed addition of two mp\_int variables. There is no reference algorithm to draw upon from either \cite{TAOCPV2} or -\cite{HAC} since they both only provide unsigned operations. The algorithm is fairly straightforward but restricted since subtraction can only -produce positive results. - -\begin{figure}[here] -\begin{small} -\begin{center} -\begin{tabular}{|c|c|c|c|c|} -\hline \textbf{Sign of $a$} & \textbf{Sign of $b$} & \textbf{$\vert a \vert > \vert b \vert $} & \textbf{Unsigned Operation} & \textbf{Result Sign Flag} \\ -\hline $+$ & $+$ & Yes & $c = a + b$ & $a.sign$ \\ -\hline $+$ & $+$ & No & $c = a + b$ & $a.sign$ \\ -\hline $-$ & $-$ & Yes & $c = a + b$ & $a.sign$ \\ -\hline $-$ & $-$ & No & $c = a + b$ & $a.sign$ \\ -\hline &&&&\\ - -\hline $+$ & $-$ & No & $c = b - a$ & $b.sign$ \\ -\hline $-$ & $+$ & No & $c = b - a$ & $b.sign$ \\ - -\hline &&&&\\ - -\hline $+$ & $-$ & Yes & $c = a - b$ & $a.sign$ \\ -\hline $-$ & $+$ & Yes & $c = a - b$ & $a.sign$ \\ - -\hline -\end{tabular} -\end{center} -\end{small} -\caption{Addition Guide Chart} -\label{fig:AddChart} -\end{figure} - -Figure~\ref{fig:AddChart} lists all of the eight possible input combinations and is sorted to show that only three specific cases need to be handled. The -return code of the unsigned operations at step 1.2, 2.1.2 and 2.2.2 are forwarded to step 3 to check for errors. This simplifies the description -of the algorithm considerably and best follows how the implementation actually was achieved. - -Also note how the \textbf{sign} is set before the unsigned addition or subtraction is performed. Recall from the descriptions of algorithms -s\_mp\_add and s\_mp\_sub that the mp\_clamp function is used at the end to trim excess digits. The mp\_clamp algorithm will set the \textbf{sign} -to \textbf{MP\_ZPOS} when the \textbf{used} digit count reaches zero. - -For example, consider performing $-a + a$ with algorithm mp\_add. By the description of the algorithm the sign is set to \textbf{MP\_NEG} which would -produce a result of $-0$. However, since the sign is set first then the unsigned addition is performed the subsequent usage of algorithm mp\_clamp -within algorithm s\_mp\_add will force $-0$ to become $0$. - -\index{bn\_mp\_add.c} -\vspace{+3mm}\begin{small} -\hspace{-5.1mm}{\bf File}: bn\_mp\_add.c -\vspace{-3mm} -\begin{alltt} -016 -017 /* high level addition (handles signs) */ -018 int -019 mp_add (mp_int * a, mp_int * b, mp_int * c) -020 \{ -021 int sa, sb, res; -022 -023 /* get sign of both inputs */ -024 sa = a->sign; -025 sb = b->sign; -026 -027 /* handle two cases, not four */ -028 if (sa == sb) \{ -029 /* both positive or both negative */ -030 /* add their magnitudes, copy the sign */ -031 c->sign = sa; -032 res = s_mp_add (a, b, c); -033 \} else \{ -034 /* one positive, the other negative */ -035 /* subtract the one with the greater magnitude from */ -036 /* the one of the lesser magnitude. The result gets */ -037 /* the sign of the one with the greater magnitude. */ -038 if (mp_cmp_mag (a, b) == MP_LT) \{ -039 c->sign = sb; -040 res = s_mp_sub (b, a, c); -041 \} else \{ -042 c->sign = sa; -043 res = s_mp_sub (a, b, c); -044 \} -045 \} -046 return res; -047 \} -048 -\end{alltt} -\end{small} - -The source code follows the algorithm fairly closely. The most notable new source code addition is the usage of the $res$ integer variable which -is used to pass result of the unsigned operations forward. Unlike in the algorithm, the variable $res$ is merely returned as is without -explicitly checking it and returning the constant \textbf{MP\_OKAY}. The observation is this algorithm will succeed or fail only if the lower -level functions do so. Returning their return code is sufficient. - -\subsection{High Level Subtraction} -The high level signed subtraction algorithm is essentially the same as the high level signed addition algorithm. - -\newpage\begin{figure}[!here] -\begin{center} -\begin{tabular}{l} -\hline Algorithm \textbf{mp\_sub}. \\ -\textbf{Input}. Two mp\_ints $a$ and $b$ \\ -\textbf{Output}. The signed subtraction $c = a - b$. \\ -\hline \\ -1. if $a.sign \ne b.sign$ then do \\ -\hspace{3mm}1.1 $c.sign \leftarrow a.sign$ \\ -\hspace{3mm}1.2 $c \leftarrow \vert a \vert + \vert b \vert$ (\textit{s\_mp\_add}) \\ -2. else do \\ -\hspace{3mm}2.1 if $\vert a \vert \ge \vert b \vert$ then do (\textit{mp\_cmp\_mag}) \\ -\hspace{6mm}2.1.1 $c.sign \leftarrow a.sign$ \\ -\hspace{6mm}2.1.2 $c \leftarrow \vert a \vert - \vert b \vert$ (\textit{s\_mp\_sub}) \\ -\hspace{3mm}2.2 else do \\ -\hspace{6mm}2.2.1 $c.sign \leftarrow \left \lbrace \begin{array}{ll} - MP\_ZPOS & \mbox{if }a.sign = MP\_NEG \\ - MP\_NEG & \mbox{otherwise} \\ - \end{array} \right .$ \\ -\hspace{6mm}2.2.2 $c \leftarrow \vert b \vert - \vert a \vert$ \\ -3. If any of the lower level operations failed return(\textit{MP\_MEM}). \\ -4. Return(\textit{MP\_OKAY}). \\ -\hline -\end{tabular} -\end{center} -\caption{Algorithm mp\_sub} -\end{figure} - -\textbf{Algorithm mp\_sub.} -This algorithm performs the signed subtraction of two inputs. Similar to algorithm mp\_add there is no reference in either \cite{TAOCPV2} or -\cite{HAC}. Also this algorithm is restricted by algorithm s\_mp\_sub. The following chart lists the eight possible inputs and -the operations required. - -\begin{figure}[!here] -\begin{small} -\begin{center} -\begin{tabular}{|c|c|c|c|c|} -\hline \textbf{Sign of $a$} & \textbf{Sign of $b$} & \textbf{$\vert a \vert \ge \vert b \vert $} & \textbf{Unsigned Operation} & \textbf{Result Sign Flag} \\ -\hline $+$ & $-$ & Yes & $c = a + b$ & $a.sign$ \\ -\hline $+$ & $-$ & No & $c = a + b$ & $a.sign$ \\ -\hline $-$ & $+$ & Yes & $c = a + b$ & $a.sign$ \\ -\hline $-$ & $+$ & No & $c = a + b$ & $a.sign$ \\ -\hline &&&& \\ -\hline $+$ & $+$ & Yes & $c = a - b$ & $a.sign$ \\ -\hline $-$ & $-$ & Yes & $c = a - b$ & $a.sign$ \\ -\hline &&&& \\ -\hline $+$ & $+$ & No & $c = b - a$ & $\mbox{opposite of }a.sign$ \\ -\hline $-$ & $-$ & No & $c = b - a$ & $\mbox{opposite of }a.sign$ \\ -\hline -\end{tabular} -\end{center} -\end{small} -\caption{Subtraction Guide Chart} -\end{figure} - -Similar to the case of algorithm mp\_add the \textbf{sign} is set first before the unsigned addition or subtraction. That is to prevent the -algorithm from producing $-a - -a = -0$ as a result. - -\index{bn\_mp\_sub.c} -\vspace{+3mm}\begin{small} -\hspace{-5.1mm}{\bf File}: bn\_mp\_sub.c -\vspace{-3mm} -\begin{alltt} -016 -017 /* high level subtraction (handles signs) */ -018 int -019 mp_sub (mp_int * a, mp_int * b, mp_int * c) -020 \{ -021 int sa, sb, res; -022 -023 sa = a->sign; -024 sb = b->sign; -025 -026 if (sa != sb) \{ -027 /* subtract a negative from a positive, OR */ -028 /* subtract a positive from a negative. */ -029 /* In either case, ADD their magnitudes, */ -030 /* and use the sign of the first number. */ -031 c->sign = sa; -032 res = s_mp_add (a, b, c); -033 \} else \{ -034 /* subtract a positive from a positive, OR */ -035 /* subtract a negative from a negative. */ -036 /* First, take the difference between their */ -037 /* magnitudes, then... */ -038 if (mp_cmp_mag (a, b) != MP_LT) \{ -039 /* Copy the sign from the first */ -040 c->sign = sa; -041 /* The first has a larger or equal magnitude */ -042 res = s_mp_sub (a, b, c); -043 \} else \{ -044 /* The result has the *opposite* sign from */ -045 /* the first number. */ -046 c->sign = (sa == MP_ZPOS) ? MP_NEG : MP_ZPOS; -047 /* The second has a larger magnitude */ -048 res = s_mp_sub (b, a, c); -049 \} -050 \} -051 return res; -052 \} -053 -\end{alltt} -\end{small} - -Much like the implementation of algorithm mp\_add the variable $res$ is used to catch the return code of the unsigned addition or subtraction operations -and forward it to the end of the function. On line 38 the ``not equal to'' \textbf{MP\_LT} expression is used to emulate a -``greater than or equal to'' comparison. - -\section{Bit and Digit Shifting} -It is quite common to think of a multiple precision integer as a polynomial in $x$, that is $y = f(\beta)$ where $f(x) = \sum_{i=0}^{n-1} a_i x^i$. -This notation arises within discussion of Montgomery and Diminished Radix Reduction as well as Karatsuba multiplication and squaring. - -In order to facilitate operations on polynomials in $x$ as above a series of simple ``digit'' algorithms have to be established. That is to shift -the digits left or right as well to shift individual bits of the digits left and right. It is important to note that not all ``shift'' operations -are on radix-$\beta$ digits. - -\subsection{Multiplication by Two} - -In a binary system where the radix is a power of two multiplication by two not only arises often in other algorithms it is a fairly efficient -operation to perform. A single precision logical shift left is sufficient to multiply a single digit by two. - -\newpage\begin{figure}[!here] -\begin{small} -\begin{center} -\begin{tabular}{l} -\hline Algorithm \textbf{mp\_mul\_2}. \\ -\textbf{Input}. One mp\_int $a$ \\ -\textbf{Output}. $b = 2a$. \\ -\hline \\ -1. If $b.alloc < a.used + 1$ then grow $b$ to hold $a.used + 1$ digits. (\textit{mp\_grow}) \\ -2. If the reallocation failed return(\textit{MP\_MEM}). \\ -3. $oldused \leftarrow b.used$ \\ -4. $b.used \leftarrow a.used$ \\ -5. $r \leftarrow 0$ \\ -6. for $n$ from 0 to $a.used - 1$ do \\ -\hspace{3mm}6.1 $rr \leftarrow a_n >> (lg(\beta) - 1)$ \\ -\hspace{3mm}6.2 $b_n \leftarrow (a_n << 1) + r \mbox{ (mod }\beta\mbox{)}$ \\ -\hspace{3mm}6.3 $r \leftarrow rr$ \\ -7. If $r \ne 0$ then do \\ -\hspace{3mm}7.1 $b_{n + 1} \leftarrow r$ \\ -\hspace{3mm}7.2 $b.used \leftarrow b.used + 1$ \\ -8. If $b.used < oldused - 1$ then do \\ -\hspace{3mm}8.1 for $n$ from $b.used$ to $oldused - 1$ do \\ -\hspace{6mm}8.1.1 $b_n \leftarrow 0$ \\ -9. $b.sign \leftarrow a.sign$ \\ -10. Return(\textit{MP\_OKAY}).\\ -\hline -\end{tabular} -\end{center} -\end{small} -\caption{Algorithm mp\_mul\_2} -\end{figure} - -\textbf{Algorithm mp\_mul\_2.} -This algorithm will quickly multiply a mp\_int by two provided $\beta$ is a power of two. Neither \cite{TAOCPV2} nor \cite{HAC} describe such -an algorithm despite the fact it arises often in other algorithms. The algorithm is setup much like the lower level algorithm s\_mp\_add since -it is for all intents and purposes equivalent to the operation $b = \vert a \vert + \vert a \vert$. - -Step 1 and 2 grow the input as required to accomodate the maximum number of \textbf{used} digits in the result. The initial \textbf{used} count -is set to $a.used$ at step 4. Only if there is a final carry will the \textbf{used} count require adjustment. - -Step 6 is an optimization implementation of the addition loop for this specific case. That is since the two values being added together -are the same there is no need to perform two reads from the digits of $a$. Step 6.1 performs a single precision shift on the current digit $a_n$ to -obtain what will be the carry for the next iteration. Step 6.2 calculates the $n$'th digit of the result as single precision shift of $a_n$ plus -the previous carry. Recall from section 5.1 that $a_n << 1$ is equivalent to $a_n \cdot 2$. An iteration of the addition loop is finished with -forwarding the carry to the next iteration. - -Step 7 takes care of any final carry by setting the $a.used$'th digit of the result to the carry and augmenting the \textbf{used} count of $b$. -Step 8 clears any leading digits of $b$ in case it originally had a larger magnitude than $a$. - -\index{bn\_mp\_mul\_2.c} -\vspace{+3mm}\begin{small} -\hspace{-5.1mm}{\bf File}: bn\_mp\_mul\_2.c -\vspace{-3mm} -\begin{alltt} -016 -017 /* b = a*2 */ -018 int -019 mp_mul_2 (mp_int * a, mp_int * b) -020 \{ -021 int x, res, oldused; -022 -023 /* grow to accomodate result */ -024 if (b->alloc < a->used + 1) \{ -025 if ((res = mp_grow (b, a->used + 1)) != MP_OKAY) \{ -026 return res; -027 \} -028 \} -029 -030 oldused = b->used; -031 b->used = a->used; -032 -033 \{ -034 register mp_digit r, rr, *tmpa, *tmpb; -035 -036 /* alias for source */ -037 tmpa = a->dp; -038 -039 /* alias for dest */ -040 tmpb = b->dp; -041 -042 /* carry */ -043 r = 0; -044 for (x = 0; x < a->used; x++) \{ -045 -046 /* get what will be the *next* carry bit from the -047 * MSB of the current digit -048 */ -049 rr = *tmpa >> ((mp_digit)(DIGIT_BIT - 1)); -050 -051 /* now shift up this digit, add in the carry [from the previous] */ -052 *tmpb++ = ((*tmpa++ << ((mp_digit)1)) | r) & MP_MASK; -053 -054 /* copy the carry that would be from the source -055 * digit into the next iteration -056 */ -057 r = rr; -058 \} -059 -060 /* new leading digit? */ -061 if (r != 0) \{ -062 /* add a MSB which is always 1 at this point */ -063 *tmpb = 1; -064 ++b->used; -065 \} -066 -067 /* now zero any excess digits on the destination -068 * that we didn't write to -069 */ -070 tmpb = b->dp + b->used; -071 for (x = b->used; x < oldused; x++) \{ -072 *tmpb++ = 0; -073 \} -074 \} -075 b->sign = a->sign; -076 return MP_OKAY; -077 \} -\end{alltt} -\end{small} - -This implementation is essentially an optimized implementation of s\_mp\_add for the case of doubling an input. The only noteworthy difference -is the use of the logical shift operator on line 52 to perform a single precision doubling. - -\subsection{Division by Two} -A division by two can just as easily be accomplished with a logical shift right as multiplication by two can be with a logical shift left. - -\newpage\begin{figure}[!here] -\begin{small} -\begin{center} -\begin{tabular}{l} -\hline Algorithm \textbf{mp\_div\_2}. \\ -\textbf{Input}. One mp\_int $a$ \\ -\textbf{Output}. $b = a/2$. \\ -\hline \\ -1. If $b.alloc < a.used$ then grow $b$ to hold $a.used$ digits. (\textit{mp\_grow}) \\ -2. If the reallocation failed return(\textit{MP\_MEM}). \\ -3. $oldused \leftarrow b.used$ \\ -4. $b.used \leftarrow a.used$ \\ -5. $r \leftarrow 0$ \\ -6. for $n$ from $b.used - 1$ to $0$ do \\ -\hspace{3mm}6.1 $rr \leftarrow a_n \mbox{ (mod }2\mbox{)}$\\ -\hspace{3mm}6.2 $b_n \leftarrow (a_n >> 1) + (r << (lg(\beta) - 1)) \mbox{ (mod }\beta\mbox{)}$ \\ -\hspace{3mm}6.3 $r \leftarrow rr$ \\ -7. If $b.used < oldused - 1$ then do \\ -\hspace{3mm}7.1 for $n$ from $b.used$ to $oldused - 1$ do \\ -\hspace{6mm}7.1.1 $b_n \leftarrow 0$ \\ -8. $b.sign \leftarrow a.sign$ \\ -9. Clamp excess digits of $b$. (\textit{mp\_clamp}) \\ -10. Return(\textit{MP\_OKAY}).\\ -\hline -\end{tabular} -\end{center} -\end{small} -\caption{Algorithm mp\_div\_2} -\end{figure} - -\textbf{Algorithm mp\_div\_2.} -This algorithm will divide an mp\_int by two using logical shifts to the right. Like mp\_mul\_2 it uses a modified low level addition -core as the basis of the algorithm. Unlike mp\_mul\_2 the shift operations work from the leading digit to the trailing digit. The algorithm -could be written to work from the trailing digit to the leading digit however, it would have to stop one short of $a.used - 1$ digits to prevent -reading past the end of the array of digits. - -Essentially the loop at step 6 is similar to that of mp\_mul\_2 except the logical shifts go in the opposite direction and the carry is at the -least significant bit not the most significant bit. - -\index{bn\_mp\_div\_2.c} -\vspace{+3mm}\begin{small} -\hspace{-5.1mm}{\bf File}: bn\_mp\_div\_2.c -\vspace{-3mm} -\begin{alltt} -016 -017 /* b = a/2 */ -018 int -019 mp_div_2 (mp_int * a, mp_int * b) -020 \{ -021 int x, res, oldused; -022 -023 /* copy */ -024 if (b->alloc < a->used) \{ -025 if ((res = mp_grow (b, a->used)) != MP_OKAY) \{ -026 return res; -027 \} -028 \} -029 -030 oldused = b->used; -031 b->used = a->used; -032 \{ -033 register mp_digit r, rr, *tmpa, *tmpb; -034 -035 /* source alias */ -036 tmpa = a->dp + b->used - 1; -037 -038 /* dest alias */ -039 tmpb = b->dp + b->used - 1; -040 -041 /* carry */ -042 r = 0; -043 for (x = b->used - 1; x >= 0; x--) \{ -044 /* get the carry for the next iteration */ -045 rr = *tmpa & 1; -046 -047 /* shift the current digit, add in carry and store */ -048 *tmpb-- = (*tmpa-- >> 1) | (r << (DIGIT_BIT - 1)); -049 -050 /* forward carry to next iteration */ -051 r = rr; -052 \} -053 -054 /* zero excess digits */ -055 tmpb = b->dp + b->used; -056 for (x = b->used; x < oldused; x++) \{ -057 *tmpb++ = 0; -058 \} -059 \} -060 b->sign = a->sign; -061 mp_clamp (b); -062 return MP_OKAY; -063 \} -\end{alltt} -\end{small} - -\section{Polynomial Basis Operations} -Recall from section 5.3 that any integer can be represented as a polynomial in $x$ as $y = f(\beta)$. Such a representation is also known as -the polynomial basis \cite[pp. 48]{ROSE}. Given such a notation a multiplication or division by $x$ amounts to shifting whole digits a single -place. The need for such operations arises in several other higher level algorithms such as Barrett and Montgomery reduction, integer -division and Karatsuba multiplication. - -Converting from an array of digits to polynomial basis is very simple. Consider the integer $y \equiv (a_2, a_1, a_0)_{\beta}$ and recall that -$y = \sum_{i=0}^{2} a_i \beta^i$. Simply replace $\beta$ with $x$ and the expression is in polynomial basis. For example, $f(x) = 8x + 9$ is the -polynomial basis representation for $89$ using radix ten. That is, $f(10) = 8(10) + 9 = 89$. - -\subsection{Multiplication by $x$} - -Given a polynomial in $x$ such as $f(x) = a_n x^n + a_{n-1} x^{n-1} + ... + a_0$ multiplying by $x$ amounts to shifting the coefficients up one -degree. In this case $f(x) \cdot x = a_n x^{n+1} + a_{n-1} x^n + ... + a_0 x$. From a scalar basis point of view multiplying by $x$ is equivalent to -multiplying by the integer $\beta$. - -\newpage\begin{figure}[!here] -\begin{small} -\begin{center} -\begin{tabular}{l} -\hline Algorithm \textbf{mp\_lshd}. \\ -\textbf{Input}. One mp\_int $a$ and an integer $b$ \\ -\textbf{Output}. $a \leftarrow a \cdot \beta^b$ (equivalent to multiplication by $x^b$). \\ -\hline \\ -1. If $b \le 0$ then return(\textit{MP\_OKAY}). \\ -2. If $a.alloc < a.used + b$ then grow $a$ to at least $a.used + b$ digits. (\textit{mp\_grow}). \\ -3. If the reallocation failed return(\textit{MP\_MEM}). \\ -4. $a.used \leftarrow a.used + b$ \\ -5. $i \leftarrow a.used - 1$ \\ -6. $j \leftarrow a.used - 1 - b$ \\ -7. for $n$ from $a.used - 1$ to $b$ do \\ -\hspace{3mm}7.1 $a_{i} \leftarrow a_{j}$ \\ -\hspace{3mm}7.2 $i \leftarrow i - 1$ \\ -\hspace{3mm}7.3 $j \leftarrow j - 1$ \\ -8. for $n$ from 0 to $b - 1$ do \\ -\hspace{3mm}8.1 $a_n \leftarrow 0$ \\ -9. Return(\textit{MP\_OKAY}). \\ -\hline -\end{tabular} -\end{center} -\end{small} -\caption{Algorithm mp\_lshd} -\end{figure} - -\textbf{Algorithm mp\_lshd.} -This algorithm multiplies an mp\_int by the $b$'th power of $x$. This is equivalent to multiplying by $\beta^b$. The algorithm differs -from the other algorithms presented so far as it performs the operation in place instead storing the result in a separate location. The -motivation behind this change is due to the way this function is typically used. Algorithms such as mp\_add store the result in an optionally -different third mp\_int because the original inputs are often still required. Algorithm mp\_lshd (\textit{and similarly algorithm mp\_rshd}) is -typically used on values where the original value is no longer required. The algorithm will return success immediately if -$b \le 0$ since the rest of algorithm is only valid when $b > 0$. - -First the destination $a$ is grown as required to accomodate the result. The counters $i$ and $j$ are used to form a \textit{sliding window} over -the digits of $a$ of length $b$. The head of the sliding window is at $i$ (\textit{the leading digit}) and the tail at $j$ (\textit{the trailing digit}). -The loop on step 7 copies the digit from the tail to the head. In each iteration the window is moved down one digit. The last loop on -step 8 sets the lower $b$ digits to zero. - -\newpage -\begin{center} -\begin{figure}[here] -\includegraphics{pics/sliding_window} -\caption{Sliding Window Movement} -\end{figure} -\end{center} - -\index{bn\_mp\_lshd.c} -\vspace{+3mm}\begin{small} -\hspace{-5.1mm}{\bf File}: bn\_mp\_lshd.c -\vspace{-3mm} -\begin{alltt} -016 -017 /* shift left a certain amount of digits */ -018 int -019 mp_lshd (mp_int * a, int b) -020 \{ -021 int x, res; -022 -023 /* if its less than zero return */ -024 if (b <= 0) \{ -025 return MP_OKAY; -026 \} -027 -028 /* grow to fit the new digits */ -029 if (a->alloc < a->used + b) \{ -030 if ((res = mp_grow (a, a->used + b)) != MP_OKAY) \{ -031 return res; -032 \} -033 \} -034 -035 \{ -036 register mp_digit *top, *bottom; -037 -038 /* increment the used by the shift amount then copy upwards */ -039 a->used += b; -040 -041 /* top */ -042 top = a->dp + a->used - 1; -043 -044 /* base */ -045 bottom = a->dp + a->used - 1 - b; -046 -047 /* much like mp_rshd this is implemented using a sliding window -048 * except the window goes the otherway around. Copying from -049 * the bottom to the top. see bn_mp_rshd.c for more info. -050 */ -051 for (x = a->used - 1; x >= b; x--) \{ -052 *top-- = *bottom--; -053 \} -054 -055 /* zero the lower digits */ -056 top = a->dp; -057 for (x = 0; x < b; x++) \{ -058 *top++ = 0; -059 \} -060 \} -061 return MP_OKAY; -062 \} -\end{alltt} -\end{small} - -The if statement on line 24 ensures that the $b$ variable is greater than zero. The \textbf{used} count is incremented by $b$ before -the copy loop begins. This elminates the need for an additional variable in the for loop. The variable $top$ on line 42 is an alias -for the leading digit while $bottom$ on line 45 is an alias for the trailing edge. The aliases form a window of exactly $b$ digits -over the input. - -\subsection{Division by $x$} - -Division by powers of $x$ is easily achieved by shifting the digits right and removing any that will end up to the right of the zero'th digit. - -\newpage\begin{figure}[!here] -\begin{small} -\begin{center} -\begin{tabular}{l} -\hline Algorithm \textbf{mp\_rshd}. \\ -\textbf{Input}. One mp\_int $a$ and an integer $b$ \\ -\textbf{Output}. $a \leftarrow a / \beta^b$ (Divide by $x^b$). \\ -\hline \\ -1. If $b \le 0$ then return. \\ -2. If $a.used \le b$ then do \\ -\hspace{3mm}2.1 Zero $a$. (\textit{mp\_zero}). \\ -\hspace{3mm}2.2 Return. \\ -3. $i \leftarrow 0$ \\ -4. $j \leftarrow b$ \\ -5. for $n$ from 0 to $a.used - b - 1$ do \\ -\hspace{3mm}5.1 $a_i \leftarrow a_j$ \\ -\hspace{3mm}5.2 $i \leftarrow i + 1$ \\ -\hspace{3mm}5.3 $j \leftarrow j + 1$ \\ -6. for $n$ from $a.used - b$ to $a.used - 1$ do \\ -\hspace{3mm}6.1 $a_n \leftarrow 0$ \\ -7. $a.used \leftarrow a.used - b$ \\ -8. Return. \\ -\hline -\end{tabular} -\end{center} -\end{small} -\caption{Algorithm mp\_rshd} -\end{figure} - -\textbf{Algorithm mp\_rshd.} -This algorithm divides the input in place by the $b$'th power of $x$. It is analogous to dividing by a $\beta^b$ but much quicker since -it does not require single precision division. This algorithm does not actually return an error code as it cannot fail. - -If the input $b$ is less than one the algorithm quickly returns without performing any work. If the \textbf{used} count is less than or equal -to the shift count $b$ then it will simply zero the input and return. - -After the trivial cases of inputs have been handled the sliding window is setup. Much like the case of algorithm mp\_lshd a sliding window that -is $b$ digits wide is used to copy the digits. Unlike mp\_lshd the window slides in the opposite direction from the trailing to the leading digit. -Also the digits are copied from the leading to the trailing edge. - -Once the window copy is complete the upper digits must be zeroed and the \textbf{used} count decremented. - -\index{bn\_mp\_rshd.c} -\vspace{+3mm}\begin{small} -\hspace{-5.1mm}{\bf File}: bn\_mp\_rshd.c -\vspace{-3mm} -\begin{alltt} -016 -017 /* shift right a certain amount of digits */ -018 void -019 mp_rshd (mp_int * a, int b) -020 \{ -021 int x; -022 -023 /* if b <= 0 then ignore it */ -024 if (b <= 0) \{ -025 return; -026 \} -027 -028 /* if b > used then simply zero it and return */ -029 if (a->used <= b) \{ -030 mp_zero (a); -031 return; -032 \} -033 -034 \{ -035 register mp_digit *bottom, *top; -036 -037 /* shift the digits down */ -038 -039 /* bottom */ -040 bottom = a->dp; -041 -042 /* top [offset into digits] */ -043 top = a->dp + b; -044 -045 /* this is implemented as a sliding window where -046 * the window is b-digits long and digits from -047 * the top of the window are copied to the bottom -048 * -049 * e.g. -050 -051 b-2 | b-1 | b0 | b1 | b2 | ... | bb | ----> -052 /\symbol{92} | ----> -053 \symbol{92}-------------------/ ----> -054 */ -055 for (x = 0; x < (a->used - b); x++) \{ -056 *bottom++ = *top++; -057 \} -058 -059 /* zero the top digits */ -060 for (; x < a->used; x++) \{ -061 *bottom++ = 0; -062 \} -063 \} -064 -065 /* remove excess digits */ -066 a->used -= b; -067 \} -\end{alltt} -\end{small} - -The only noteworthy element of this routine is the lack of a return type. - --- Will update later to give it a return type...Tom - -\section{Powers of Two} - -Now that algorithms for moving single bits as well as whole digits exist algorithms for moving the ``in between'' distances are required. For -example, to quickly multiply by $2^k$ for any $k$ without using a full multiplier algorithm would prove useful. Instead of performing single -shifts $k$ times to achieve a multiplication by $2^{\pm k}$ a mixture of whole digit shifting and partial digit shifting is employed. - -\subsection{Multiplication by Power of Two} - -\newpage\begin{figure}[!here] -\begin{small} -\begin{center} -\begin{tabular}{l} -\hline Algorithm \textbf{mp\_mul\_2d}. \\ -\textbf{Input}. One mp\_int $a$ and an integer $b$ \\ -\textbf{Output}. $c \leftarrow a \cdot 2^b$. \\ -\hline \\ -1. $c \leftarrow a$. (\textit{mp\_copy}) \\ -2. If $c.alloc < c.used + \lfloor b / lg(\beta) \rfloor + 2$ then grow $c$ accordingly. \\ -3. If the reallocation failed return(\textit{MP\_MEM}). \\ -4. If $b \ge lg(\beta)$ then \\ -\hspace{3mm}4.1 $c \leftarrow c \cdot \beta^{\lfloor b / lg(\beta) \rfloor}$ (\textit{mp\_lshd}). \\ -\hspace{3mm}4.2 If step 4.1 failed return(\textit{MP\_MEM}). \\ -5. $d \leftarrow b \mbox{ (mod }lg(\beta)\mbox{)}$ \\ -6. If $d \ne 0$ then do \\ -\hspace{3mm}6.1 $mask \leftarrow 2^d$ \\ -\hspace{3mm}6.2 $r \leftarrow 0$ \\ -\hspace{3mm}6.3 for $n$ from $0$ to $c.used - 1$ do \\ -\hspace{6mm}6.3.1 $rr \leftarrow c_n >> (lg(\beta) - d) \mbox{ (mod }mask\mbox{)}$ \\ -\hspace{6mm}6.3.2 $c_n \leftarrow (c_n << d) + r \mbox{ (mod }\beta\mbox{)}$ \\ -\hspace{6mm}6.3.3 $r \leftarrow rr$ \\ -\hspace{3mm}6.4 If $r > 0$ then do \\ -\hspace{6mm}6.4.1 $c_{c.used} \leftarrow r$ \\ -\hspace{6mm}6.4.2 $c.used \leftarrow c.used + 1$ \\ -7. Return(\textit{MP\_OKAY}). \\ -\hline -\end{tabular} -\end{center} -\end{small} -\caption{Algorithm mp\_mul\_2d} -\end{figure} - -\textbf{Algorithm mp\_mul\_2d.} -This algorithm multiplies $a$ by $2^b$ and stores the result in $c$. The algorithm uses algorithm mp\_lshd and a derivative of algorithm mp\_mul\_2 to -quickly compute the product. - -First the algorithm will multiply $a$ by $x^{\lfloor b / lg(\beta) \rfloor}$ which will ensure that the remainder multiplicand is less than -$\beta$. For example, if $b = 37$ and $\beta = 2^{28}$ then this step will multiply by $x$ leaving a multiplication by $2^{37 - 28} = 2^{9}$ -left. - -After the digits have been shifted appropriately at most $lg(\beta) - 1$ shifts are left to perform. Step 5 calculates the number of remaining shifts -required. If it is non-zero a modified shift loop is used to calculate the remaining product. -Essentially the loop is a generic version of algorith mp\_mul2 designed to handle any shift count in the range $1 \le x < lg(\beta)$. The $mask$ -variable is used to extract the upper $d$ bits to form the carry for the next iteration. - -This algorithm is loosely measured as a $O(2n)$ algorithm which means that if the input is $n$-digits that it takes $2n$ ``time'' to -complete. It is possible to optimize this algorithm down to a $O(n)$ algorithm at a cost of making the algorithm slightly harder to follow. - -\index{bn\_mp\_mul\_2d.c} -\vspace{+3mm}\begin{small} -\hspace{-5.1mm}{\bf File}: bn\_mp\_mul\_2d.c -\vspace{-3mm} -\begin{alltt} -016 -017 /* NOTE: This routine requires updating. For instance the c->used = c->all - oc bit -018 is wrong. We should just shift c->used digits then set the carry as c->d - p[c->used] = carry -019 -020 To be fixed for LTM 0.18 -021 */ -022 -023 /* shift left by a certain bit count */ -024 int -025 mp_mul_2d (mp_int * a, int b, mp_int * c) -026 \{ -027 mp_digit d; -028 int res; -029 -030 /* copy */ -031 if (a != c) \{ -032 if ((res = mp_copy (a, c)) != MP_OKAY) \{ -033 return res; -034 \} -035 \} -036 -037 if (c->alloc < (int)(c->used + b/DIGIT_BIT + 2)) \{ -038 if ((res = mp_grow (c, c->used + b / DIGIT_BIT + 2)) != MP_OKAY) \{ -039 return res; -040 \} -041 \} -042 -043 /* shift by as many digits in the bit count */ -044 if (b >= (int)DIGIT_BIT) \{ -045 if ((res = mp_lshd (c, b / DIGIT_BIT)) != MP_OKAY) \{ -046 return res; -047 \} -048 \} -049 c->used = c->alloc; -050 -051 /* shift any bit count < DIGIT_BIT */ -052 d = (mp_digit) (b % DIGIT_BIT); -053 if (d != 0) \{ -054 register mp_digit *tmpc, mask, r, rr; -055 register int x; -056 -057 /* bitmask for carries */ -058 mask = (((mp_digit)1) << d) - 1; -059 -060 /* alias */ -061 tmpc = c->dp; -062 -063 /* carry */ -064 r = 0; -065 for (x = 0; x < c->used; x++) \{ -066 /* get the higher bits of the current word */ -067 rr = (*tmpc >> (DIGIT_BIT - d)) & mask; -068 -069 /* shift the current word and OR in the carry */ -070 *tmpc = ((*tmpc << d) | r) & MP_MASK; -071 ++tmpc; -072 -073 /* set the carry to the carry bits of the current word */ -074 r = rr; -075 \} -076 \} -077 mp_clamp (c); -078 return MP_OKAY; -079 \} -\end{alltt} -\end{small} - -Notes to be revised when code is updated. -- Tom - -\subsection{Division by Power of Two} - -\newpage\begin{figure}[!here] -\begin{small} -\begin{center} -\begin{tabular}{l} -\hline Algorithm \textbf{mp\_div\_2d}. \\ -\textbf{Input}. One mp\_int $a$ and an integer $b$ \\ -\textbf{Output}. $c \leftarrow \lfloor a / 2^b \rfloor, d \leftarrow a \mbox{ (mod }2^b\mbox{)}$. \\ -\hline \\ -1. If $b \le 0$ then do \\ -\hspace{3mm}1.1 $c \leftarrow a$ (\textit{mp\_copy}) \\ -\hspace{3mm}1.2 $d \leftarrow 0$ (\textit{mp\_zero}) \\ -\hspace{3mm}1.3 Return(\textit{MP\_OKAY}). \\ -2. $c \leftarrow a$ \\ -3. $d \leftarrow a \mbox{ (mod }2^b\mbox{)}$ (\textit{mp\_mod\_2d}) \\ -4. If $b \ge lg(\beta)$ then do \\ -\hspace{3mm}4.1 $c \leftarrow \lfloor c/\beta^{\lfloor b/lg(\beta) \rfloor} \rfloor$ (\textit{mp\_rshd}). \\ -5. $k \leftarrow b \mbox{ (mod }lg(\beta)\mbox{)}$ \\ -6. If $k \ne 0$ then do \\ -\hspace{3mm}6.1 $mask \leftarrow 2^k$ \\ -\hspace{3mm}6.2 $r \leftarrow 0$ \\ -\hspace{3mm}6.3 for $n$ from $c.used - 1$ to $0$ do \\ -\hspace{6mm}6.3.1 $rr \leftarrow c_n \mbox{ (mod }mask\mbox{)}$ \\ -\hspace{6mm}6.3.2 $c_n \leftarrow (c_n >> k) + (r << (lg(\beta) - k))$ \\ -\hspace{6mm}6.3.3 $r \leftarrow rr$ \\ -7. Clamp excess digits of $c$. (\textit{mp\_clamp}) \\ -8. Return(\textit{MP\_OKAY}). \\ -\hline -\end{tabular} -\end{center} -\end{small} -\caption{Algorithm mp\_div\_2d} -\end{figure} - -\textbf{Algorithm mp\_div\_2d.} -This algorithm will divide an input $a$ by $2^b$ and produce the quotient and remainder. The algorithm is designed much like algorithm -mp\_mul\_2d by first using whole digit shifts then single precision shifts. This algorithm will also produce the remainder of the division -by using algorithm mp\_mod\_2d. - -\index{bn\_mp\_div\_2d.c} -\vspace{+3mm}\begin{small} -\hspace{-5.1mm}{\bf File}: bn\_mp\_div\_2d.c -\vspace{-3mm} -\begin{alltt} -016 -017 /* shift right by a certain bit count (store quotient in c, optional remaind - er in d) */ -018 int -019 mp_div_2d (mp_int * a, int b, mp_int * c, mp_int * d) -020 \{ -021 mp_digit D, r, rr; -022 int x, res; -023 mp_int t; -024 -025 -026 /* if the shift count is <= 0 then we do no work */ -027 if (b <= 0) \{ -028 res = mp_copy (a, c); -029 if (d != NULL) \{ -030 mp_zero (d); -031 \} -032 return res; -033 \} -034 -035 if ((res = mp_init (&t)) != MP_OKAY) \{ -036 return res; -037 \} -038 -039 /* get the remainder */ -040 if (d != NULL) \{ -041 if ((res = mp_mod_2d (a, b, &t)) != MP_OKAY) \{ -042 mp_clear (&t); -043 return res; -044 \} -045 \} -046 -047 /* copy */ -048 if ((res = mp_copy (a, c)) != MP_OKAY) \{ -049 mp_clear (&t); -050 return res; -051 \} -052 -053 /* shift by as many digits in the bit count */ -054 if (b >= (int)DIGIT_BIT) \{ -055 mp_rshd (c, b / DIGIT_BIT); -056 \} -057 -058 /* shift any bit count < DIGIT_BIT */ -059 D = (mp_digit) (b % DIGIT_BIT); -060 if (D != 0) \{ -061 register mp_digit *tmpc, mask; -062 -063 /* mask */ -064 mask = (((mp_digit)1) << D) - 1; -065 -066 /* alias */ -067 tmpc = c->dp + (c->used - 1); -068 -069 /* carry */ -070 r = 0; -071 for (x = c->used - 1; x >= 0; x--) \{ -072 /* get the lower bits of this word in a temp */ -073 rr = *tmpc & mask; -074 -075 /* shift the current word and mix in the carry bits from the previous - word */ -076 *tmpc = (*tmpc >> D) | (r << (DIGIT_BIT - D)); -077 --tmpc; -078 -079 /* set the carry to the carry bits of the current word found above */ -080 r = rr; -081 \} -082 \} -083 mp_clamp (c); -084 if (d != NULL) \{ -085 mp_exch (&t, d); -086 \} -087 mp_clear (&t); -088 return MP_OKAY; -089 \} -\end{alltt} -\end{small} - -The implementation of algorithm mp\_div\_2d is slightly different than the algorithm specifies. The remainder $d$ may be optionally -ignored by passing \textbf{NULL} as the pointer to the mp\_int variable. The temporary mp\_int variable $t$ is used to hold the -result of the remainder operation until the end. This allows $d$ and $a$ to represent the same mp\_int without modifying $a$ before -the quotient is obtained. - -The remainder of the source code is essentially the same as the source code for mp\_mul\_2d. (-- Fix this paragraph up later, Tom). - -\subsection{Remainder of Division by Power of Two} - -The last algorithm in the series of polynomial basis power of two algorithms is calculating the remainder of division by $2^b$. This -algorithm benefits from the fact that in twos complement arithmetic $a \mbox{ (mod }2^b\mbox{)}$ is the same as $a$ AND $2^b - 1$. - -\begin{figure}[!here] -\begin{small} -\begin{center} -\begin{tabular}{l} -\hline Algorithm \textbf{mp\_mod\_2d}. \\ -\textbf{Input}. One mp\_int $a$ and an integer $b$ \\ -\textbf{Output}. $c \leftarrow a \mbox{ (mod }2^b\mbox{)}$. \\ -\hline \\ -1. If $b \le 0$ then do \\ -\hspace{3mm}1.1 $c \leftarrow 0$ (\textit{mp\_zero}) \\ -\hspace{3mm}1.2 Return(\textit{MP\_OKAY}). \\ -2. If $b > a.used \cdot lg(\beta)$ then do \\ -\hspace{3mm}2.1 $c \leftarrow a$ (\textit{mp\_copy}) \\ -\hspace{3mm}2.2 Return the result of step 2.1. \\ -3. $c \leftarrow a$ \\ -4. If step 3 failed return(\textit{MP\_MEM}). \\ -5. for $n$ from $\lceil b / lg(\beta) \rceil$ to $c.used$ do \\ -\hspace{3mm}5.1 $c_n \leftarrow 0$ \\ -6. $k \leftarrow b \mbox{ (mod }lg(\beta)\mbox{)}$ \\ -7. $c_{\lfloor b / lg(\beta) \rfloor} \leftarrow c_{\lfloor b / lg(\beta) \rfloor} \mbox{ (mod }2^{k}\mbox{)}$. \\ -8. Clamp excess digits of $c$. (\textit{mp\_clamp}) \\ -9. Return(\textit{MP\_OKAY}). \\ -\hline -\end{tabular} -\end{center} -\end{small} -\caption{Algorithm mp\_mod\_2d} -\end{figure} - -\textbf{Algorithm mp\_mod\_2d.} -This algorithm will quickly calculate the value of $a \mbox{ (mod }2^b\mbox{)}$. First if $b$ is less than or equal to zero the -result is set to zero. If $b$ is greater than the number of bits in $a$ then it simply copies $a$ to $c$ and returns. Otherwise, $a$ -is copied to $b$, leading digits are removed and the remaining leading digit is trimed to the exact bit count. - -\index{bn\_mp\_mod\_2d.c} -\vspace{+3mm}\begin{small} -\hspace{-5.1mm}{\bf File}: bn\_mp\_mod\_2d.c -\vspace{-3mm} -\begin{alltt} -016 -017 /* calc a value mod 2\b */ -018 int -019 mp_mod_2d (mp_int * a, int b, mp_int * c) -020 \{ -021 int x, res; -022 -023 -024 /* if b is <= 0 then zero the int */ -025 if (b <= 0) \{ -026 mp_zero (c); -027 return MP_OKAY; -028 \} -029 -030 /* if the modulus is larger than the value than return */ -031 if (b > (int) (a->used * DIGIT_BIT)) \{ -032 res = mp_copy (a, c); -033 return res; -034 \} -035 -036 /* copy */ -037 if ((res = mp_copy (a, c)) != MP_OKAY) \{ -038 return res; -039 \} -040 -041 /* zero digits above the last digit of the modulus */ -042 for (x = (b / DIGIT_BIT) + ((b % DIGIT_BIT) == 0 ? 0 : 1); x < c->used; x+ - +) \{ -043 c->dp[x] = 0; -044 \} -045 /* clear the digit that is not completely outside/inside the modulus */ -046 c->dp[b / DIGIT_BIT] &= -047 (mp_digit) ((((mp_digit) 1) << (((mp_digit) b) % DIGIT_BIT)) - ((mp_digi - t) 1)); -048 mp_clamp (c); -049 return MP_OKAY; -050 \} -\end{alltt} -\end{small} - --- Add comments later, Tom. - -\section*{Exercises} -\begin{tabular}{cl} -$\left [ 3 \right ] $ & Devise an algorithm that performs $a \cdot 2^b$ for generic values of $b$ \\ - & in $O(n)$ time. \\ - &\\ -$\left [ 3 \right ] $ & Devise an efficient algorithm to multiply by small low hamming \\ - & weight values such as $3$, $5$ and $9$. Extend it to handle all values \\ - & upto $64$ with a hamming weight less than three. \\ - &\\ -$\left [ 2 \right ] $ & Modify the preceding algorithm to handle values of the form \\ - & $2^k - 1$ as well. \\ - &\\ -$\left [ 3 \right ] $ & Using only algorithms mp\_mul\_2, mp\_div\_2 and mp\_add create an \\ - & algorithm to multiply two integers in roughly $O(2n^2)$ time for \\ - & any $n$-bit input. Note that the time of addition is ignored in the \\ - & calculation. \\ - & \\ -$\left [ 5 \right ] $ & Improve the previous algorithm to have a working time of at most \\ - & $O \left (2^{(k-1)}n + \left ({2n^2 \over k} \right ) \right )$ for an appropriate choice of $k$. Again ignore \\ - & the cost of addition. \\ - & \\ -$\left [ 2 \right ] $ & Devise a chart to find optimal values of $k$ for the previous problem \\ - & for $n = 64 \ldots 1024$ in steps of $64$. \\ - & \\ -$\left [ 2 \right ] $ & Using only algorithms mp\_abs and mp\_sub devise another method for \\ - & calculating the result of a signed comparison. \\ - & -\end{tabular} - -\chapter{Multiplication and Squaring} -\section{The Multipliers} -For most number theoretic problems including certain public key cryptographic algorithms, the ``multipliers'' form the most important subset of -algorithms of any multiple precision integer package. The set of multiplier algorithms include integer multiplication, squaring and modular reduction -where in each of the algorithms single precision multiplication is the dominant operation performed. This chapter will discuss integer multiplication -and squaring, leaving modular reductions for the subsequent chapter. - -The importance of the multiplier algorithms is for the most part driven by the fact that certain popular public key algorithms are based on modular -exponentiation, that is computing $d \equiv a^b \mbox{ (mod }c\mbox{)}$ for some arbitrary choice of $a$, $b$, $c$ and $d$. During a modular -exponentiation the majority\footnote{Roughly speaking a modular exponentiation will spend about 40\% of the time performing modular reductions, -35\% of the time performing squaring and 25\% of the time performing multiplications.} of the processor time is spent performing single precision -multiplications. - -For centuries general purpose multiplication has required a lengthly $O(n^2)$ process, whereby each digit of one multiplicand has to be multiplied -against every digit of the other multiplicand. Traditional long-hand multiplication is based on this process; while the techniques can differ the -overall algorithm used is essentially the same. Only ``recently'' have faster algorithms been studied. First Karatsuba multiplication was discovered in -1962. This algorithm can multiply two numbers with considerably fewer single precision multiplications when compared to the long-hand approach. -This technique led to the discovery of polynomial basis algorithms (\textit{good reference?}) and subquently Fourier Transform based solutions. - -\section{Multiplication} -\subsection{The Baseline Multiplication} -\index{baseline multiplication} -Computing the product of two integers in software can be achieved using a trivial adaptation of the standard $O(n^2)$ long-hand multiplication -algorithm that school children are taught. The algorithm is considered an $O(n^2)$ algoritn since for two $n$-digit inputs $n^2$ single precision -multiplications are required. More specifically for a $m$ and $n$ digit input $m \cdot n$ single precision multiplications are required. To -simplify most discussions, it will be assumed that the inputs have comparable number of digits. - -The ``baseline multiplication'' algorithm is designed to act as the ``catch-all'' algorithm, only to be used when the faster algorithms cannot be -used. This algorithm does not use any particularly interesting optimizations and should ideally be avoided if possible. One important -facet of this algorithm, is that it has been modified to only produce a certain amount of output digits as resolution. The importance of this -modification will become evident during the discussion of Barrett modular reduction. Recall that for a $n$ and $m$ digit input the product -will be at most $n + m$ digits. Therefore, this algorithm can be reduced to a full multiplier by having it produce $n + m$ digits of the product. - -Recall from sub-section 5.2.2 the definition of $\gamma$ as the number of bits in the type \textbf{mp\_digit}. We shall now extend the variable set to -include $\alpha$ which shall represent the number of bits in the type \textbf{mp\_word}. This implies that $2^{\alpha} > 2 \cdot \beta^2$. The -constant $\delta = 2^{\alpha - 2lg(\beta)}$ will represent the maximal weight of any column in a product (\textit{see sub-section 6.2.2 for more information}). - -\newpage\begin{figure}[!here] -\begin{small} -\begin{center} -\begin{tabular}{l} -\hline Algorithm \textbf{s\_mp\_mul\_digs}. \\ -\textbf{Input}. mp\_int $a$, mp\_int $b$ and an integer $digs$ \\ -\textbf{Output}. $c \leftarrow \vert a \vert \cdot \vert b \vert \mbox{ (mod }\beta^{digs}\mbox{)}$. \\ -\hline \\ -1. If min$(a.used, b.used) < \delta$ then do \\ -\hspace{3mm}1.1 Calculate $c = \vert a \vert \cdot \vert b \vert$ by the Comba method (\textit{see algorithm~\ref{fig:COMBAMULT}}). \\ -\hspace{3mm}1.2 Return the result of step 1.1 \\ -\\ -Allocate and initialize a temporary mp\_int. \\ -2. Init $t$ to be of size $digs$ \\ -3. If step 2 failed return(\textit{MP\_MEM}). \\ -4. $t.used \leftarrow digs$ \\ -\\ -Compute the product. \\ -5. for $ix$ from $0$ to $a.used - 1$ do \\ -\hspace{3mm}5.1 $u \leftarrow 0$ \\ -\hspace{3mm}5.2 $pb \leftarrow \mbox{min}(b.used, digs - ix)$ \\ -\hspace{3mm}5.3 If $pb < 1$ then goto step 6. \\ -\hspace{3mm}5.4 for $iy$ from $0$ to $pb - 1$ do \\ -\hspace{6mm}5.4.1 $\hat r \leftarrow t_{iy + ix} + a_{ix} \cdot b_{iy} + u$ \\ -\hspace{6mm}5.4.2 $t_{iy + ix} \leftarrow \hat r \mbox{ (mod }\beta\mbox{)}$ \\ -\hspace{6mm}5.4.3 $u \leftarrow \lfloor \hat r / \beta \rfloor$ \\ -\hspace{3mm}5.5 if $ix + pb < digs$ then do \\ -\hspace{6mm}5.5.1 $t_{ix + pb} \leftarrow u$ \\ -6. Clamp excess digits of $t$. \\ -7. Swap $c$ with $t$ \\ -8. Clear $t$ \\ -9. Return(\textit{MP\_OKAY}). \\ -\hline -\end{tabular} -\end{center} -\end{small} -\caption{Algorithm s\_mp\_mul\_digs} -\end{figure} - -\textbf{Algorithm s\_mp\_mul\_digs.} -This algorithm computes the unsigned product of two inputs $a$ and $b$, limited to an output precision of $digs$ digits. While it may seem -a bit awkward to modify the function from its simple $O(n^2)$ description, the usefulness of partial multipliers will arise in a subsequent -algorithm. The algorithm is loosely based on algorithm 14.12 from \cite[pp. 595]{HAC} and is similar to Algorithm M of Knuth \cite[pp. 268]{TAOCPV2}. -Algorithm s\_mp\_mul\_digs differs from these cited references since it can produce a variable output precision regardless of the precision of the -inputs. - -The first thing this algorithm checks for is whether a Comba multiplier can be used instead. If the minimum digit count of either -input is less than $\delta$, then the Comba method may be used instead. After the Comba method is ruled out, the baseline algorithm begins. A -temporary mp\_int variable $t$ is used to hold the intermediate result of the product. This allows the algorithm to be used to -compute products when either $a = c$ or $b = c$ without overwriting the inputs. - -All of step 5 is the infamous $O(n^2)$ multiplication loop slightly modified to only produce upto $digs$ digits of output. The $pb$ variable -is given the count of digits to read from $b$ inside the nested loop. If $pb \le 1$ then no more output digits can be produced and the algorithm -will exit the loop. The best way to think of the loops are as a series of $pb \times 1$ multiplications. That is, in each pass of the -innermost loop $a_{ix}$ is multiplied against $b$ and the result is added (\textit{with an appropriate shift}) to $t$. - -For example, consider multiplying $576$ by $241$. That is equivalent to computing $10^0(1)(576) + 10^1(4)(576) + 10^2(2)(576)$ which is best -visualized in the following table. - -\begin{figure}[here] -\begin{center} -\begin{tabular}{|c|c|c|c|c|c|l|} -\hline && & 5 & 7 & 6 & \\ -\hline $\times$&& & 2 & 4 & 1 & \\ -\hline &&&&&&\\ - && & 5 & 7 & 6 & $10^0(1)(576)$ \\ - &2 & 3 & 6 & 1 & 6 & $10^1(4)(576) + 10^0(1)(576)$ \\ - 1 & 3 & 8 & 8 & 1 & 6 & $10^2(2)(576) + 10^1(4)(576) + 10^0(1)(576)$ \\ -\hline -\end{tabular} -\end{center} -\caption{Long-Hand Multiplication Diagram} -\end{figure} - -Each row of the product is added to the result after being shifted to the left (\textit{multiplied by a power of the radix}) by the appropriate -count. That is in pass $ix$ of the inner loop the product is added starting at the $ix$'th digit of the reult. - -Step 5.4.1 introduces the hat symbol (\textit{e.g. $\hat r$}) which represents a double precision variable. The multiplication on that step -is assumed to be a double wide output single precision multiplication. That is, two single precision variables are multiplied to produce a -double precision result. The step is somewhat optimized from a long-hand multiplication algorithm because the carry from the addition in step -5.4.1 is propagated through the nested loop. If the carry was not propagated immediately it would overflow the single precision digit -$t_{ix+iy}$ and the result would be lost. - -At step 5.5 the nested loop is finished and any carry that was left over should be forwarded. The carry does not have to be added to the $ix+pb$'th -digit since that digit is assumed to be zero at this point. However, if $ix + pb \ge digs$ the carry is not set as it would make the result -exceed the precision requested. - -\index{bn\_s\_mp\_mul\_digs.c} -\vspace{+3mm}\begin{small} -\hspace{-5.1mm}{\bf File}: bn\_s\_mp\_mul\_digs.c -\vspace{-3mm} -\begin{alltt} -016 -017 /* multiplies |a| * |b| and only computes upto digs digits of result -018 * HAC pp. 595, Algorithm 14.12 Modified so you can control how -019 * many digits of output are created. -020 */ -021 int -022 s_mp_mul_digs (mp_int * a, mp_int * b, mp_int * c, int digs) -023 \{ -024 mp_int t; -025 int res, pa, pb, ix, iy; -026 mp_digit u; -027 mp_word r; -028 mp_digit tmpx, *tmpt, *tmpy; -029 -030 /* can we use the fast multiplier? */ -031 if (((digs) < MP_WARRAY) && -032 MIN (a->used, b->used) < -033 (1 << ((CHAR_BIT * sizeof (mp_word)) - (2 * DIGIT_BIT)))) \{ -034 return fast_s_mp_mul_digs (a, b, c, digs); -035 \} -036 -037 if ((res = mp_init_size (&t, digs)) != MP_OKAY) \{ -038 return res; -039 \} -040 t.used = digs; -041 -042 /* compute the digits of the product directly */ -043 pa = a->used; -044 for (ix = 0; ix < pa; ix++) \{ -045 /* set the carry to zero */ -046 u = 0; -047 -048 /* limit ourselves to making digs digits of output */ -049 pb = MIN (b->used, digs - ix); -050 -051 /* setup some aliases */ -052 /* copy of the digit from a used within the nested loop */ -053 tmpx = a->dp[ix]; -054 -055 /* an alias for the destination shifted ix places */ -056 tmpt = t.dp + ix; -057 -058 /* an alias for the digits of b */ -059 tmpy = b->dp; -060 -061 /* compute the columns of the output and propagate the carry */ -062 for (iy = 0; iy < pb; iy++) \{ -063 /* compute the column as a mp_word */ -064 r = ((mp_word) *tmpt) + -065 ((mp_word) tmpx) * ((mp_word) * tmpy++) + -066 ((mp_word) u); -067 -068 /* the new column is the lower part of the result */ -069 *tmpt++ = (mp_digit) (r & ((mp_word) MP_MASK)); -070 -071 /* get the carry word from the result */ -072 u = (mp_digit) (r >> ((mp_word) DIGIT_BIT)); -073 \} -074 /* set carry if it is placed below digs */ -075 if (ix + iy < digs) \{ -076 *tmpt = u; -077 \} -078 \} -079 -080 mp_clamp (&t); -081 mp_exch (&t, c); -082 -083 mp_clear (&t); -084 return MP_OKAY; -085 \} -\end{alltt} -\end{small} - -Lines 31 to 35 determine if the Comba method can be used first. The conditions for using the Comba routine are that min$(a.used, b.used) < \delta$ and -the number of digits of output is less than \textbf{MP\_WARRAY}. This new constant is used to control -the stack usage in the Comba routines. By default it is set to $\delta$ but can be reduced when memory is at a premium. - -Of particular importance is the calculation of the $ix+iy$'th column on lines 64, 65 and 66. Note how all of the -variables are cast to the type \textbf{mp\_word}, which is also the type of variable $\hat r$. That is to ensure that double precision operations -are used instead of single precision. The multiplication on line 65 makes use of a specific GCC optimizer behaviour. On the outset it looks like -the compiler will have to use a double precision multiplication to produce the result required. Such an operation would be horribly slow on most -processors and drag this to a crawl. However, GCC is smart enough to realize that double wide output single precision multipliers can be used. For -example, the instruction ``MUL'' on the x86 processor can multiply two 32-bit values and produce a 64-bit result. - -\subsection{Faster Multiplication by the ``Comba'' Method} - -One of the huge drawbacks of the ``baseline'' algorithms is that at the $O(n^2)$ level the carry must be computed and propagated upwards. This -makes the nested loop very sequential and hard to unroll and implement in parallel. The ``Comba'' \cite{COMBA} method is named after little known -(\textit{in cryptographic venues}) Paul G. Comba who described a method of implementing fast multipliers that do not require nested -carry fixup operations. As an interesting aside it seems that Paul Barrett describes a similar technique in -his 1986 paper \cite{BARRETT} written five years before. - -At the heart of the Comba technique is once again the long-hand algorithm. Except in this case a slight twist is placed on how -the columns of the result are produced. In the standard long-hand algorithm rows of products are produced then added together to form the -final result. In the baseline algorithm the columns are added together after each iteration to get the result instantaneously. - -In the Comba algorithm the columns of the result are produced entirely independently of each other. That is at the $O(n^2)$ level a -simple multiplication and addition step is performed. The carries of the columns are propagated after the nested loop to reduce the amount -of work requiored. Succintly the first step of the algorithm is to compute the product vector $\vec x$ as follows. - -\begin{equation} -\vec x_n = \sum_{i+j = n} a_ib_j, \forall n \in \lbrace 0, 1, 2, \ldots, i + j \rbrace -\end{equation} - -Where $\vec x_n$ is the $n'th$ column of the output vector. Consider the following example which computes the vector $\vec x$ for the multiplication -of $576$ and $241$. - -\newpage\begin{figure}[here] -\begin{small} -\begin{center} -\begin{tabular}{|c|c|c|c|c|c|} - \hline & & 5 & 7 & 6 & First Input\\ - \hline $\times$ & & 2 & 4 & 1 & Second Input\\ -\hline & & $1 \cdot 5 = 5$ & $1 \cdot 7 = 7$ & $1 \cdot 6 = 6$ & First pass \\ - & $4 \cdot 5 = 20$ & $4 \cdot 7+5=33$ & $4 \cdot 6+7=31$ & 6 & Second pass \\ - $2 \cdot 5 = 10$ & $2 \cdot 7 + 20 = 34$ & $2 \cdot 6+33=45$ & 31 & 6 & Third pass \\ -\hline 10 & 34 & 45 & 31 & 6 & Final Result \\ -\hline -\end{tabular} -\end{center} -\end{small} -\caption{Comba Multiplication Diagram} -\end{figure} - -At this point the vector $x = \left < 10, 34, 45, 31, 6 \right >$ is the result of the first step of the Comba multipler. -Now the columns must be fixed by propagating the carry upwards. The resultant vector will have one extra dimension over the input vector which is -congruent to adding a leading zero digit. - -\begin{figure}[!here] -\begin{small} -\begin{center} -\begin{tabular}{l} -\hline Algorithm \textbf{Comba Fixup}. \\ -\textbf{Input}. Vector $\vec x$ of dimension $k$ \\ -\textbf{Output}. Vector $\vec x$ such that the carries have been propagated. \\ -\hline \\ -1. for $n$ from $0$ to $k - 1$ do \\ -\hspace{3mm}1.1 $\vec x_{n+1} \leftarrow \vec x_{n+1} + \lfloor \vec x_{n}/\beta \rfloor$ \\ -\hspace{3mm}1.2 $\vec x_{n} \leftarrow \vec x_{n} \mbox{ (mod }\beta\mbox{)}$ \\ -2. Return($\vec x$). \\ -\hline -\end{tabular} -\end{center} -\end{small} -\caption{Algorithm Comba Fixup} -\end{figure} - -With that algorithm and $k = 5$ and $\beta = 10$ the following vector is produced $\vec x= \left < 1, 3, 8, 8, 1, 6 \right >$. In this case -$241 \cdot 576$ is in fact $138816$ and the procedure succeeded. If the algorithm is correct and as will be demonstrated shortly more -efficient than the baseline algorithm why not simply always use this algorithm? - -\subsubsection{Column Weight.} -At the nested $O(n^2)$ level the Comba method adds the product of two single precision variables to each column of the output -independently. A serious obstacle is if the carry is lost, due to lack of precision before the algorithm has a chance to fix -the carries. For example, in the multiplication of two three-digit numbers the third column of output will be the sum of -three single precision multiplications. If the precision of the accumulator for the output digits is less then $3 \cdot (\beta - 1)^2$ then -an overflow can occur and the carry information will be lost. For any $m$ and $n$ digit inputs the maximum weight of any column is -min$(m, n)$ which is fairly obvious. - -The maximum number of terms in any column of a product is known as the ``column weight'' and strictly governs when the algorithm can be used. Recall -from earlier that a double precision type has $\alpha$ bits of resolution and a single precision digit has $lg(\beta)$ bits of precision. Given these -two quantities we must not violate the following - -\begin{equation} -k \cdot \left (\beta - 1 \right )^2 < 2^{\alpha} -\end{equation} - -Which reduces to - -\begin{equation} -k \cdot \left ( \beta^2 - 2\beta + 1 \right ) < 2^{\alpha} -\end{equation} - -Let $\rho = lg(\beta)$ represent the number of bits in a single precision digit. By further re-arrangement of the equation the final solution is -found. - -\begin{equation} -k < {{2^{\alpha}} \over {\left (2^{2\rho} - 2^{\rho + 1} + 1 \right )}} -\end{equation} - -The defaults for LibTomMath are $\beta = 2^{28}$ and $\alpha = 2^{64}$ which means that $k$ is bounded by $k < 257$. In this configuration -the smaller input may not have more than $256$ digits if the Comba method is to be used. This is quite satisfactory for most applications since -$256$ digits would allow for numbers in the range of $0 \le x < 2^{7168}$ which, is much larger than most public key cryptographic algorithms require. - -\newpage\begin{figure}[!here] -\begin{small} -\begin{center} -\begin{tabular}{l} -\hline Algorithm \textbf{fast\_s\_mp\_mul\_digs}. \\ -\textbf{Input}. mp\_int $a$, mp\_int $b$ and an integer $digs$ \\ -\textbf{Output}. $c \leftarrow \vert a \vert \cdot \vert b \vert \mbox{ (mod }\beta^{digs}\mbox{)}$. \\ -\hline \\ -Place an array of \textbf{MP\_WARRAY} double precision digits named $\hat W$ on the stack. \\ -1. If $c.alloc < digs$ then grow $c$ to $digs$ digits. (\textit{mp\_grow}) \\ -2. If step 1 failed return(\textit{MP\_MEM}).\\ -\\ -Zero the temporary array $\hat W$. \\ -3. for $n$ from $0$ to $digs - 1$ do \\ -\hspace{3mm}3.1 $\hat W_n \leftarrow 0$ \\ -\\ -Compute the columns. \\ -4. for $ix$ from $0$ to $a.used - 1$ do \\ -\hspace{3mm}4.1 $pb \leftarrow \mbox{min}(b.used, digs - ix)$ \\ -\hspace{3mm}4.2 If $pb < 1$ then goto step 5. \\ -\hspace{3mm}4.3 for $iy$ from $0$ to $pb - 1$ do \\ -\hspace{6mm}4.3.1 $\hat W_{ix+iy} \leftarrow \hat W_{ix+iy} + a_{ix}b_{iy}$ \\ -\\ -Propagate the carries upwards. \\ -5. $oldused \leftarrow c.used$ \\ -6. $c.used \leftarrow digs$ \\ -7. If $digs > 1$ then do \\ -\hspace{3mm}7.1. for $ix$ from $1$ to $digs - 1$ do \\ -\hspace{6mm}7.1.1 $\hat W_{ix} \leftarrow \hat W_{ix} + \lfloor \hat W_{ix-1} / \beta \rfloor$ \\ -\hspace{6mm}7.1.2 $c_{ix - 1} \leftarrow \hat W_{ix - 1} \mbox{ (mod }\beta\mbox{)}$ \\ -8. else do \\ -\hspace{3mm}8.1 $ix \leftarrow 0$ \\ -9. $c_{ix} \leftarrow \hat W_{ix} \mbox{ (mod }\beta\mbox{)}$ \\ -\\ -Zero excess digits. \\ -10. If $digs < oldused$ then do \\ -\hspace{3mm}10.1 for $n$ from $digs$ to $oldused - 1$ do \\ -\hspace{6mm}10.1.1 $c_n \leftarrow 0$ \\ -11. Clamp excessive digits of $c$. (\textit{mp\_clamp}) \\ -12. Return(\textit{MP\_OKAY}). \\ -\hline -\end{tabular} -\end{center} -\end{small} -\caption{Algorithm fast\_s\_mp\_mul\_digs} -\label{fig:COMBAMULT} -\end{figure} - -\textbf{Algorithm fast\_s\_mp\_mul\_digs.} -This algorithm performs the unsigned multiplication of $a$ and $b$ using the Comba method limited to $digs$ digits of precision. The algorithm -essentially peforms the same calculation as algorithm s\_mp\_mul\_digs, just much faster. - -The array $\hat W$ is meant to be on the stack when the algorithm is used. The size of the array does not change which is ideal. Note also that -unlike algorithm s\_mp\_mul\_digs no temporary mp\_int is required since the result is calculated directly in $\hat W$. - -The $O(n^2)$ loop on step four is where the Comba method's advantages begin to show through in comparison to the baseline algorithm. The lack of -a carry variable or propagation in this loop allows the loop to be performed with only single precision multiplication and additions. Now that each -iteration of the inner loop can be performed independent of the others the inner loop can be performed with a high level of parallelism. - -To measure the benefits of the Comba method over the baseline method consider the number of operations that are required. If the -cost in terms of time of a multiply and addition is $p$ and the cost of a carry propagation is $q$ then a baseline multiplication would require -$O \left ((p + q)n^2 \right )$ time to multiply two $n$-digit numbers. The Comba method requires only $O(pn^2 + qn)$ time, however in practice, -the speed increase is actually much more. With $O(n)$ space the algorithm can be reduced to $O(pn + qn)$ time by implementing the $n$ multiply -and addition operations in the nested loop in parallel. - -\index{bn\_fast\_s\_mp\_mul\_digs.c} -\vspace{+3mm}\begin{small} -\hspace{-5.1mm}{\bf File}: bn\_fast\_s\_mp\_mul\_digs.c -\vspace{-3mm} -\begin{alltt} -016 -017 /* Fast (comba) multiplier -018 * -019 * This is the fast column-array [comba] multiplier. It is -020 * designed to compute the columns of the product first -021 * then handle the carries afterwards. This has the effect -022 * of making the nested loops that compute the columns very -023 * simple and schedulable on super-scalar processors. -024 * -025 * This has been modified to produce a variable number of -026 * digits of output so if say only a half-product is required -027 * you don't have to compute the upper half (a feature -028 * required for fast Barrett reduction). -029 * -030 * Based on Algorithm 14.12 on pp.595 of HAC. -031 * -032 */ -033 int -034 fast_s_mp_mul_digs (mp_int * a, mp_int * b, mp_int * c, int digs) -035 \{ -036 int olduse, res, pa, ix; -037 mp_word W[MP_WARRAY]; -038 -039 /* grow the destination as required */ -040 if (c->alloc < digs) \{ -041 if ((res = mp_grow (c, digs)) != MP_OKAY) \{ -042 return res; -043 \} -044 \} -045 -046 /* clear temp buf (the columns) */ -047 memset (W, 0, sizeof (mp_word) * digs); -048 -049 /* calculate the columns */ -050 pa = a->used; -051 for (ix = 0; ix < pa; ix++) \{ -052 /* this multiplier has been modified to allow you to -053 * control how many digits of output are produced. -054 * So at most we want to make upto "digs" digits of output. -055 * -056 * this adds products to distinct columns (at ix+iy) of W -057 * note that each step through the loop is not dependent on -058 * the previous which means the compiler can easily unroll -059 * the loop without scheduling problems -060 */ -061 \{ -062 register mp_digit tmpx, *tmpy; -063 register mp_word *_W; -064 register int iy, pb; -065 -066 /* alias for the the word on the left e.g. A[ix] * A[iy] */ -067 tmpx = a->dp[ix]; -068 -069 /* alias for the right side */ -070 tmpy = b->dp; -071 -072 /* alias for the columns, each step through the loop adds a new -073 term to each column -074 */ -075 _W = W + ix; -076 -077 /* the number of digits is limited by their placement. E.g. -078 we avoid multiplying digits that will end up above the # of -079 digits of precision requested -080 */ -081 pb = MIN (b->used, digs - ix); -082 -083 for (iy = 0; iy < pb; iy++) \{ -084 *_W++ += ((mp_word) tmpx) * ((mp_word) * tmpy++); -085 \} -086 \} -087 -088 \} -089 -090 /* setup dest */ -091 olduse = c->used; -092 c->used = digs; -093 -094 \{ -095 register mp_digit *tmpc; -096 -097 /* At this point W[] contains the sums of each column. To get the -098 * correct result we must take the extra bits from each column and -099 * carry them down -100 * -101 * Note that while this adds extra code to the multiplier it -102 * saves time since the carry propagation is removed from the -103 * above nested loop.This has the effect of reducing the work -104 * from N*(N+N*c)==N**2 + c*N**2 to N**2 + N*c where c is the -105 * cost of the shifting. On very small numbers this is slower -106 * but on most cryptographic size numbers it is faster. -107 */ -108 tmpc = c->dp; -109 for (ix = 1; ix < digs; ix++) \{ -110 W[ix] += (W[ix - 1] >> ((mp_word) DIGIT_BIT)); -111 *tmpc++ = (mp_digit) (W[ix - 1] & ((mp_word) MP_MASK)); -112 \} -113 *tmpc++ = (mp_digit) (W[digs - 1] & ((mp_word) MP_MASK)); -114 -115 /* clear unused */ -116 for (; ix < olduse; ix++) \{ -117 *tmpc++ = 0; -118 \} -119 \} -120 -121 mp_clamp (c); -122 return MP_OKAY; -123 \} -\end{alltt} -\end{small} - -The memset on line 47 clears the initial $\hat W$ array to zero in a single step. Like the slower baseline multiplication -implementation a series of aliases (\textit{lines 67, 70 and 75}) are used to simplify the inner $O(n^2)$ loop. -In this case a new alias $\_\hat W$ has been added which refers to the double precision columns offset by $ix$ in each pass. - -The inner loop on lines 83, 84 and 85 is where the algorithm will spend the majority of the time, which is why it has been -stripped to the bones of any extra baggage\footnote{Hence the pointer aliases.}. On x86 processors the multiplication and additions amount to at the -very least five instructions (\textit{two loads, two additions, one multiply}) while on the ARMv4 processors they amount to only three -(\textit{one load, one store, one multiply-add}). For both of the x86 and ARMv4 processors the GCC compiler performs a good job at unrolling the loop -and scheduling the instructions so there are very few dependency stalls. - -In theory the difference between the baseline and comba algorithms is a mere $O(qn)$ time difference. However, in the $O(n^2)$ nested loop of the -baseline method there are dependency stalls as the algorithm must wait for the multiplier to finish before propagating the carry to the next -digit. As a result fewer of the often multiple execution units\footnote{The AMD Athlon has three execution units and the Intel P4 has four.} can -be simultaneously used. - -\subsection{Polynomial Basis Multiplication} -To break the $O(n^2)$ barrier in multiplication requires a completely different look at integer multiplication. In the following algorithms -the use of polynomial basis representation for two integers $a$ and $b$ as $f(x) = \sum_{i=0}^{n} a_i x^i$ and -$g(x) = \sum_{i=0}^{n} b_i x^i$ respectively, is required. In this system both $f(x)$ and $g(x)$ have $n + 1$ terms and are of the $n$'th degree. - -The product $a \cdot b \equiv f(x)g(x)$ is the polynomial $W(x) = \sum_{i=0}^{2n} w_i x^i$. The coefficients $w_i$ will -directly yield the desired product when $\beta$ is substituted for $x$. The direct solution to solve for the $2n + 1$ coefficients -requires $O(n^2)$ time and would in practice be slower than the Comba technique. - -However, numerical analysis theory indicates that only $2n + 1$ distinct points in $W(x)$ are required to determine the values of the $2n + 1$ unknown -coefficients. This means by finding $\zeta_y = W(y)$ for $2n + 1$ small values of $y$ the coefficients of $W(x)$ can be found with -Gaussian elimination. This technique is also occasionally refered to as the \textit{interpolation technique} (\textit{references please...}) since in -effect an interpolation based on $2n + 1$ points will yield a polynomial equivalent to $W(x)$. - -The coefficients of the polynomial $W(x)$ are unknown which makes finding $W(y)$ for any value of $y$ impossible. However, since -$W(x) = f(x)g(x)$ the equivalent $\zeta_y = f(y) g(y)$ can be used in its place. The benefit of this technique stems from the -fact that $f(y)$ and $g(y)$ are much smaller than either $a$ or $b$ respectively. As a result finding the $2n + 1$ relations required -by multiplying $f(y)g(y)$ involves multiplying integers that are much smaller than either of the inputs. - -When picking points to gather relations there are always three obvious points to choose, $y = 0, 1$ and $ \infty$. The $\zeta_0$ term -is simply the product $W(0) = w_0 = a_0 \cdot b_0$. The $\zeta_1$ term is the product -$W(1) = \left (\sum_{i = 0}^{n} a_i \right ) \left (\sum_{i = 0}^{n} b_i \right )$. The third point $\zeta_{\infty}$ is less obvious but rather -simple to explain. The $2n + 1$'th coefficient of $W(x)$ is numerically equivalent to the most significant column in an integer multiplication. -The point at $\infty$ is used symbolically to represent the most significant column, that is $W(\infty) = w_{2n} = a_nb_n$. Note that the -points at $y = 0$ and $\infty$ yield the coefficients $w_0$ and $w_{2n}$ directly. - -If more points are required they should be of small values and powers of two such as $2^q$ and the related \textit{mirror points} -$\left (2^q \right )^{2n} \cdot \zeta_{2^{-q}}$ for small values of $q$. The term ``mirror point'' stems from the fact that -$\left (2^q \right )^{2n} \cdot \zeta_{2^{-q}}$ can be calculated in the exact opposite fashion as $\zeta_{2^q}$. For -example, when $n = 2$ and $q = 1$ then following two equations are equivalent to the point $\zeta_{2}$ and its mirror. - -\begin{eqnarray} -\zeta_{2} = f(2)g(2) = (4a_2 + 2a_1 + a_0)(4b_2 + 2b_1 + b_0) \nonumber \\ -16 \cdot \zeta_{1 \over 2} = 4f({1\over 2}) \cdot 4g({1 \over 2}) = (a_2 + 2a_1 + 4a_0)(b_2 + 2b_1 + 4b_0) -\end{eqnarray} - -Using such points will allow the values of $f(y)$ and $g(y)$ to be independently calculated using only left shifts. For example, when $n = 2$ the -polynomial $f(2^q)$ is equal to $2^q((2^qa_2) + a_1) + a_0$. This technique of polynomial representation is known as Horner's method. - -As a general rule of the algorithm when the inputs are split into $n$ parts each there are $2n - 1$ multiplications. Each multiplication is of -multiplicands that have $n$ times fewer digits than the inputs. The asymptotic running time of this algorithm is -$O \left ( k^{lg_n(2n - 1)} \right )$ for $k$ digit inputs (\textit{assuming they have the same number of digits}). Figure~\ref{fig:exponent} -summarizes the exponents for various values of $n$. - -\begin{figure} -\begin{center} -\begin{tabular}{|c|c|c|} -\hline \textbf{Split into $n$ Parts} & \textbf{Exponent} & \textbf{Notes}\\ -\hline $2$ & $1.584962501$ & This is Karatsuba Multiplication. \\ -\hline $3$ & $1.464973520$ & This is Toom-Cook Multiplication. \\ -\hline $4$ & $1.403677461$ &\\ -\hline $5$ & $1.365212389$ &\\ -\hline $10$ & $1.278753601$ &\\ -\hline $100$ & $1.149426538$ &\\ -\hline $1000$ & $1.100270931$ &\\ -\hline $10000$ & $1.075252070$ &\\ -\hline -\end{tabular} -\end{center} -\caption{Asymptotic Running Time of Polynomial Basis Multiplication} -\label{fig:exponent} -\end{figure} - -At first it may seem like a good idea to choose $n = 1000$ since the exponent is approximately $1.1$. However, the overhead -of solving for the 2001 terms of $W(x)$ will certainly consume any savings the algorithm could offer for all but exceedingly large -numbers. - -\subsubsection{Cutoff Point} -The polynomial basis multiplication algorithms all require fewer single precision multiplications than a straight Comba approach. However, -the algorithms incur an overhead (\textit{at the $O(n)$ work level}) since they require a system of equations to be solved. This makes the -polynomial basis approach more costly to use with small inputs. - -Let $m$ represent the number of digits in the multiplicands (\textit{assume both multiplicands have the same number of digits}). There exists a -point $y$ such that when $m < y$ the polynomial basis algorithms are more costly than Comba, when $m = y$ they are roughly the same cost and -when $m > y$ the Comba methods are slower than the polynomial basis algorithms. - -The exact location of $y$ depends on several key architectural elements of the computer platform in question. - -\begin{enumerate} -\item The ratio of clock cycles for single precision multiplication versus other simpler operations such as addition, shifting, etc. For example -on the AMD Athlon the ratio is roughly $17 : 1$ while on the Intel P4 it is $29 : 1$. The higher the ratio in favour of multiplication the lower -the cutoff point $y$ will be. - -\item The complexity of the linear system of equations (\textit{for the coefficients of $W(x)$}) is. Generally speaking as the number of splits -grows the complexity grows substantially. Ideally solving the system will only involve addition, subtraction and shifting of integers. This -directly reflects on the ratio previous mentioned. - -\item To a lesser extent memory bandwidth and function call overheads. Provided the values are in the processor cache this is less of an -influence over the cutoff point. - -\end{enumerate} - -A clean cutoff point separation occurs when a point $y$ is found such that all of the cutoff point conditions are met. For example, if the point -is too low then there will be values of $m$ such that $m > y$ and the Comba method is still faster. Finding the cutoff points is fairly simple when -a high resolution timer is available. - -\subsection{Karatsuba Multiplication} -Karatsuba \cite{KARA} multiplication when originally proposed in 1962 was among the first set of algorithms to break the $O(n^2)$ barrier for -general purpose multiplication. Given two polynomial basis representations $f(x) = ax + b$ and $g(x) = cx + d$, Karatsuba proved with -light algebra \cite{KARAP} that the following polynomial is equivalent to multiplication of the two integers the polynomials represent. - -\begin{equation} -f(x) \cdot g(x) = acx^2 + ((a - b)(c - d) + ac + bd)x + bd -\end{equation} - -Using the observation that $ac$ and $bd$ could be re-used only three half sized multiplications would be required to produce the product. Applying -this algorithm recursively, the work factor becomes $O(n^{lg(3)})$ which is substantially better than the work factor $O(n^2)$ of the Comba technique. It turns -out what Karatsuba did not know or at least did not publish was that this is simply polynomial basis multiplication with the points -$\zeta_0$, $\zeta_{\infty}$ and $-\zeta_{-1}$. Consider the resultant system of equations. - -\begin{center} -\begin{tabular}{rcrcrcrc} -$\zeta_{0}$ & $=$ & & & & & $w_0$ \\ -$-\zeta_{-1}$ & $=$ & $-w_2$ & $+$ & $w_1$ & $-$ & $w_0$ \\ -$\zeta_{\infty}$ & $=$ & $w_2$ & & & & \\ -\end{tabular} -\end{center} - -By adding the first and last equation to the equation in the middle the term $w_1$ can be isolated and all three coefficients solved for. The simplicity -of this system of equations has made Karatsuba fairly popular. In fact the cutoff point is often fairly low\footnote{With LibTomMath 0.18 it is 70 and 109 digits for the Intel P4 and AMD Athlon respectively.} -making it an ideal algorithm to speed up certain public key cryptosystems such as RSA and Diffie-Hellman. It is worth noting that the point -$\zeta_1$ could be substituted for $-\zeta_{-1}$. In this case the first and third row are subtracted instead of added to the second row. - -\newpage\begin{figure}[!here] -\begin{small} -\begin{center} -\begin{tabular}{l} -\hline Algorithm \textbf{mp\_karatsuba\_mul}. \\ -\textbf{Input}. mp\_int $a$ and mp\_int $b$ \\ -\textbf{Output}. $c \leftarrow \vert a \vert \cdot \vert b \vert$ \\ -\hline \\ -1. Init the following mp\_int variables: $x0$, $x1$, $y0$, $y1$, $t1$, $x0y0$, $x1y1$.\\ -2. If step 2 failed then return(\textit{MP\_MEM}). \\ -\\ -Split the input. e.g. $a = x1 \cdot \beta^B + x0$ \\ -3. $B \leftarrow \mbox{min}(a.used, b.used)/2$ \\ -4. $x0 \leftarrow a \mbox{ (mod }\beta^B\mbox{)}$ (\textit{mp\_mod\_2d}) \\ -5. $y0 \leftarrow b \mbox{ (mod }\beta^B\mbox{)}$ \\ -6. $x1 \leftarrow \lfloor a / \beta^B \rfloor$ (\textit{mp\_rshd}) \\ -7. $y1 \leftarrow \lfloor b / \beta^B \rfloor$ \\ -\\ -Calculate the three products. \\ -8. $x0y0 \leftarrow x0 \cdot y0$ (\textit{mp\_mul}) \\ -9. $x1y1 \leftarrow x1 \cdot y1$ \\ -10. $t1 \leftarrow x1 - x0$ (\textit{mp\_sub}) \\ -11. $x0 \leftarrow y1 - y0$ \\ -12. $t1 \leftarrow t1 \cdot x0$ \\ -\\ -Calculate the middle term. \\ -13. $x0 \leftarrow x0y0 + x1y1$ \\ -14. $t1 \leftarrow x0 - t1$ \\ -\\ -Calculate the final product. \\ -15. $t1 \leftarrow t1 \cdot \beta^B$ (\textit{mp\_lshd}) \\ -16. $x1y1 \leftarrow x1y1 \cdot \beta^{2B}$ \\ -17. $t1 \leftarrow x0y0 + t1$ \\ -18. $c \leftarrow t1 + x1y1$ \\ -19. Clear all of the temporary variables. \\ -20. Return(\textit{MP\_OKAY}).\\ -\hline -\end{tabular} -\end{center} -\end{small} -\caption{Algorithm mp\_karatsuba\_mul} -\end{figure} - -\textbf{Algorithm mp\_karatsuba\_mul.} -This algorithm computes the unsigned product of two inputs using the Karatsuba multiplication algorithm. It is loosely based on the description -from Knuth \cite[pp. 294-295]{TAOCPV2}. - -\index{radix point} -In order to split the two inputs into their respective halves, a suitable \textit{radix point} must be chosen. The radix point chosen must -be used for both of the inputs meaning that it must be smaller than the smallest input. Step 3 chooses the radix point $B$ as half of the -smallest input \textbf{used} count. After the radix point is chosen the inputs are split into lower and upper halves. Step 4 and 5 -compute the lower halves. Step 6 and 7 computer the upper halves. - -After the halves have been computed the three intermediate half-size products must be computed. Step 8 and 9 compute the trivial products -$x0 \cdot y0$ and $x1 \cdot y1$. The mp\_int $x0$ is used as a temporary variable after $x1 - x0$ has been computed. By using $x0$ instead -of an additional temporary variable, the algorithm can avoid an addition memory allocation operation. - -The remaining steps 13 through 18 compute the Karatsuba polynomial through a variety of digit shifting and addition operations. - -\index{bn\_mp\_karatsuba\_mul.c} -\vspace{+3mm}\begin{small} -\hspace{-5.1mm}{\bf File}: bn\_mp\_karatsuba\_mul.c -\vspace{-3mm} -\begin{alltt} -016 -017 /* c = |a| * |b| using Karatsuba Multiplication using -018 * three half size multiplications -019 * -020 * Let B represent the radix [e.g. 2**DIGIT_BIT] and -021 * let n represent half of the number of digits in -022 * the min(a,b) -023 * -024 * a = a1 * B**n + a0 -025 * b = b1 * B**n + b0 -026 * -027 * Then, a * b => -028 a1b1 * B**2n + ((a1 - a0)(b1 - b0) + a0b0 + a1b1) * B + a0b0 -029 * -030 * Note that a1b1 and a0b0 are used twice and only need to be -031 * computed once. So in total three half size (half # of -032 * digit) multiplications are performed, a0b0, a1b1 and -033 * (a1-b1)(a0-b0) -034 * -035 * Note that a multiplication of half the digits requires -036 * 1/4th the number of single precision multiplications so in -037 * total after one call 25% of the single precision multiplications -038 * are saved. Note also that the call to mp_mul can end up back -039 * in this function if the a0, a1, b0, or b1 are above the threshold. -040 * This is known as divide-and-conquer and leads to the famous -041 * O(N**lg(3)) or O(N**1.584) work which is asymptopically lower than -042 * the standard O(N**2) that the baseline/comba methods use. -043 * Generally though the overhead of this method doesn't pay off -044 * until a certain size (N ~ 80) is reached. -045 */ -046 int -047 mp_karatsuba_mul (mp_int * a, mp_int * b, mp_int * c) -048 \{ -049 mp_int x0, x1, y0, y1, t1, x0y0, x1y1; -050 int B, err; -051 -052 /* default the return code to an error */ -053 err = MP_MEM; -054 -055 /* min # of digits */ -056 B = MIN (a->used, b->used); -057 -058 /* now divide in two */ -059 B = B / 2; -060 -061 /* init copy all the temps */ -062 if (mp_init_size (&x0, B) != MP_OKAY) -063 goto ERR; -064 if (mp_init_size (&x1, a->used - B) != MP_OKAY) -065 goto X0; -066 if (mp_init_size (&y0, B) != MP_OKAY) -067 goto X1; -068 if (mp_init_size (&y1, b->used - B) != MP_OKAY) -069 goto Y0; -070 -071 /* init temps */ -072 if (mp_init_size (&t1, B * 2) != MP_OKAY) -073 goto Y1; -074 if (mp_init_size (&x0y0, B * 2) != MP_OKAY) -075 goto T1; -076 if (mp_init_size (&x1y1, B * 2) != MP_OKAY) -077 goto X0Y0; -078 -079 /* now shift the digits */ -080 x0.sign = x1.sign = a->sign; -081 y0.sign = y1.sign = b->sign; -082 -083 x0.used = y0.used = B; -084 x1.used = a->used - B; -085 y1.used = b->used - B; -086 -087 \{ -088 register int x; -089 register mp_digit *tmpa, *tmpb, *tmpx, *tmpy; -090 -091 /* we copy the digits directly instead of using higher level functions -092 * since we also need to shift the digits -093 */ -094 tmpa = a->dp; -095 tmpb = b->dp; -096 -097 tmpx = x0.dp; -098 tmpy = y0.dp; -099 for (x = 0; x < B; x++) \{ -100 *tmpx++ = *tmpa++; -101 *tmpy++ = *tmpb++; -102 \} -103 -104 tmpx = x1.dp; -105 for (x = B; x < a->used; x++) \{ -106 *tmpx++ = *tmpa++; -107 \} -108 -109 tmpy = y1.dp; -110 for (x = B; x < b->used; x++) \{ -111 *tmpy++ = *tmpb++; -112 \} -113 \} -114 -115 /* only need to clamp the lower words since by definition the -116 * upper words x1/y1 must have a known number of digits -117 */ -118 mp_clamp (&x0); -119 mp_clamp (&y0); -120 -121 /* now calc the products x0y0 and x1y1 */ -122 /* after this x0 is no longer required, free temp [x0==t2]! */ -123 if (mp_mul (&x0, &y0, &x0y0) != MP_OKAY) -124 goto X1Y1; /* x0y0 = x0*y0 */ -125 if (mp_mul (&x1, &y1, &x1y1) != MP_OKAY) -126 goto X1Y1; /* x1y1 = x1*y1 */ -127 -128 /* now calc x1-x0 and y1-y0 */ -129 if (mp_sub (&x1, &x0, &t1) != MP_OKAY) -130 goto X1Y1; /* t1 = x1 - x0 */ -131 if (mp_sub (&y1, &y0, &x0) != MP_OKAY) -132 goto X1Y1; /* t2 = y1 - y0 */ -133 if (mp_mul (&t1, &x0, &t1) != MP_OKAY) -134 goto X1Y1; /* t1 = (x1 - x0) * (y1 - y0) */ -135 -136 /* add x0y0 */ -137 if (mp_add (&x0y0, &x1y1, &x0) != MP_OKAY) -138 goto X1Y1; /* t2 = x0y0 + x1y1 */ -139 if (mp_sub (&x0, &t1, &t1) != MP_OKAY) -140 goto X1Y1; /* t1 = x0y0 + x1y1 - (x1-x0)*(y1-y0) */ -141 -142 /* shift by B */ -143 if (mp_lshd (&t1, B) != MP_OKAY) -144 goto X1Y1; /* t1 = (x0y0 + x1y1 - (x1-x0)*(y1-y0))<used, b->used) / 3; -033 -034 /* a = a2 * B**2 + a1 * B + a0 */ -035 if ((res = mp_mod_2d(a, DIGIT_BIT * B, &a0)) != MP_OKAY) \{ -036 goto ERR; -037 \} -038 -039 if ((res = mp_copy(a, &a1)) != MP_OKAY) \{ -040 goto ERR; -041 \} -042 mp_rshd(&a1, B); -043 mp_mod_2d(&a1, DIGIT_BIT * B, &a1); -044 -045 if ((res = mp_copy(a, &a2)) != MP_OKAY) \{ -046 goto ERR; -047 \} -048 mp_rshd(&a2, B*2); -049 -050 /* b = b2 * B**2 + b1 * B + b0 */ -051 if ((res = mp_mod_2d(b, DIGIT_BIT * B, &b0)) != MP_OKAY) \{ -052 goto ERR; -053 \} -054 -055 if ((res = mp_copy(b, &b1)) != MP_OKAY) \{ -056 goto ERR; -057 \} -058 mp_rshd(&b1, B); -059 mp_mod_2d(&b1, DIGIT_BIT * B, &b1); -060 -061 if ((res = mp_copy(b, &b2)) != MP_OKAY) \{ -062 goto ERR; -063 \} -064 mp_rshd(&b2, B*2); -065 -066 /* w0 = a0*b0 */ -067 if ((res = mp_mul(&a0, &b0, &w0)) != MP_OKAY) \{ -068 goto ERR; -069 \} -070 -071 /* w4 = a2 * b2 */ -072 if ((res = mp_mul(&a2, &b2, &w4)) != MP_OKAY) \{ -073 goto ERR; -074 \} -075 -076 /* w1 = (a2 + 2(a1 + 2a0))(b2 + 2(b1 + 2b0)) */ -077 if ((res = mp_mul_2(&a0, &tmp1)) != MP_OKAY) \{ -078 goto ERR; -079 \} -080 if ((res = mp_add(&tmp1, &a1, &tmp1)) != MP_OKAY) \{ -081 goto ERR; -082 \} -083 if ((res = mp_mul_2(&tmp1, &tmp1)) != MP_OKAY) \{ -084 goto ERR; -085 \} -086 if ((res = mp_add(&tmp1, &a2, &tmp1)) != MP_OKAY) \{ -087 goto ERR; -088 \} -089 -090 if ((res = mp_mul_2(&b0, &tmp2)) != MP_OKAY) \{ -091 goto ERR; -092 \} -093 if ((res = mp_add(&tmp2, &b1, &tmp2)) != MP_OKAY) \{ -094 goto ERR; -095 \} -096 if ((res = mp_mul_2(&tmp2, &tmp2)) != MP_OKAY) \{ -097 goto ERR; -098 \} -099 if ((res = mp_add(&tmp2, &b2, &tmp2)) != MP_OKAY) \{ -100 goto ERR; -101 \} -102 -103 if ((res = mp_mul(&tmp1, &tmp2, &w1)) != MP_OKAY) \{ -104 goto ERR; -105 \} -106 -107 /* w3 = (a0 + 2(a1 + 2a2))(b0 + 2(b1 + 2b2)) */ -108 if ((res = mp_mul_2(&a2, &tmp1)) != MP_OKAY) \{ -109 goto ERR; -110 \} -111 if ((res = mp_add(&tmp1, &a1, &tmp1)) != MP_OKAY) \{ -112 goto ERR; -113 \} -114 if ((res = mp_mul_2(&tmp1, &tmp1)) != MP_OKAY) \{ -115 goto ERR; -116 \} -117 if ((res = mp_add(&tmp1, &a0, &tmp1)) != MP_OKAY) \{ -118 goto ERR; -119 \} -120 -121 if ((res = mp_mul_2(&b2, &tmp2)) != MP_OKAY) \{ -122 goto ERR; -123 \} -124 if ((res = mp_add(&tmp2, &b1, &tmp2)) != MP_OKAY) \{ -125 goto ERR; -126 \} -127 if ((res = mp_mul_2(&tmp2, &tmp2)) != MP_OKAY) \{ -128 goto ERR; -129 \} -130 if ((res = mp_add(&tmp2, &b0, &tmp2)) != MP_OKAY) \{ -131 goto ERR; -132 \} -133 -134 if ((res = mp_mul(&tmp1, &tmp2, &w3)) != MP_OKAY) \{ -135 goto ERR; -136 \} -137 -138 -139 /* w2 = (a2 + a1 + a0)(b2 + b1 + b0) */ -140 if ((res = mp_add(&a2, &a1, &tmp1)) != MP_OKAY) \{ -141 goto ERR; -142 \} -143 if ((res = mp_add(&tmp1, &a0, &tmp1)) != MP_OKAY) \{ -144 goto ERR; -145 \} -146 if ((res = mp_add(&b2, &b1, &tmp2)) != MP_OKAY) \{ -147 goto ERR; -148 \} -149 if ((res = mp_add(&tmp2, &b0, &tmp2)) != MP_OKAY) \{ -150 goto ERR; -151 \} -152 if ((res = mp_mul(&tmp1, &tmp2, &w2)) != MP_OKAY) \{ -153 goto ERR; -154 \} -155 -156 /* now solve the matrix -157 -158 0 0 0 0 1 -159 1 2 4 8 16 -160 1 1 1 1 1 -161 16 8 4 2 1 -162 1 0 0 0 0 -163 -164 using 12 subtractions, 4 shifts, -165 2 small divisions and 1 small multiplication -166 */ -167 -168 /* r1 - r4 */ -169 if ((res = mp_sub(&w1, &w4, &w1)) != MP_OKAY) \{ -170 goto ERR; -171 \} -172 /* r3 - r0 */ -173 if ((res = mp_sub(&w3, &w0, &w3)) != MP_OKAY) \{ -174 goto ERR; -175 \} -176 /* r1/2 */ -177 if ((res = mp_div_2(&w1, &w1)) != MP_OKAY) \{ -178 goto ERR; -179 \} -180 /* r3/2 */ -181 if ((res = mp_div_2(&w3, &w3)) != MP_OKAY) \{ -182 goto ERR; -183 \} -184 /* r2 - r0 - r4 */ -185 if ((res = mp_sub(&w2, &w0, &w2)) != MP_OKAY) \{ -186 goto ERR; -187 \} -188 if ((res = mp_sub(&w2, &w4, &w2)) != MP_OKAY) \{ -189 goto ERR; -190 \} -191 /* r1 - r2 */ -192 if ((res = mp_sub(&w1, &w2, &w1)) != MP_OKAY) \{ -193 goto ERR; -194 \} -195 /* r3 - r2 */ -196 if ((res = mp_sub(&w3, &w2, &w3)) != MP_OKAY) \{ -197 goto ERR; -198 \} -199 /* r1 - 8r0 */ -200 if ((res = mp_mul_2d(&w0, 3, &tmp1)) != MP_OKAY) \{ -201 goto ERR; -202 \} -203 if ((res = mp_sub(&w1, &tmp1, &w1)) != MP_OKAY) \{ -204 goto ERR; -205 \} -206 /* r3 - 8r4 */ -207 if ((res = mp_mul_2d(&w4, 3, &tmp1)) != MP_OKAY) \{ -208 goto ERR; -209 \} -210 if ((res = mp_sub(&w3, &tmp1, &w3)) != MP_OKAY) \{ -211 goto ERR; -212 \} -213 /* 3r2 - r1 - r3 */ -214 if ((res = mp_mul_d(&w2, 3, &w2)) != MP_OKAY) \{ -215 goto ERR; -216 \} -217 if ((res = mp_sub(&w2, &w1, &w2)) != MP_OKAY) \{ -218 goto ERR; -219 \} -220 if ((res = mp_sub(&w2, &w3, &w2)) != MP_OKAY) \{ -221 goto ERR; -222 \} -223 /* r1 - r2 */ -224 if ((res = mp_sub(&w1, &w2, &w1)) != MP_OKAY) \{ -225 goto ERR; -226 \} -227 /* r3 - r2 */ -228 if ((res = mp_sub(&w3, &w2, &w3)) != MP_OKAY) \{ -229 goto ERR; -230 \} -231 /* r1/3 */ -232 if ((res = mp_div_3(&w1, &w1, NULL)) != MP_OKAY) \{ -233 goto ERR; -234 \} -235 /* r3/3 */ -236 if ((res = mp_div_3(&w3, &w3, NULL)) != MP_OKAY) \{ -237 goto ERR; -238 \} -239 -240 /* at this point shift W[n] by B*n */ -241 if ((res = mp_lshd(&w1, 1*B)) != MP_OKAY) \{ -242 goto ERR; -243 \} -244 if ((res = mp_lshd(&w2, 2*B)) != MP_OKAY) \{ -245 goto ERR; -246 \} -247 if ((res = mp_lshd(&w3, 3*B)) != MP_OKAY) \{ -248 goto ERR; -249 \} -250 if ((res = mp_lshd(&w4, 4*B)) != MP_OKAY) \{ -251 goto ERR; -252 \} -253 -254 if ((res = mp_add(&w0, &w1, c)) != MP_OKAY) \{ -255 goto ERR; -256 \} -257 if ((res = mp_add(&w2, &w3, &tmp1)) != MP_OKAY) \{ -258 goto ERR; -259 \} -260 if ((res = mp_add(&w4, &tmp1, &tmp1)) != MP_OKAY) \{ -261 goto ERR; -262 \} -263 if ((res = mp_add(&tmp1, c, c)) != MP_OKAY) \{ -264 goto ERR; -265 \} -266 -267 ERR: -268 mp_clear_multi(&w0, &w1, &w2, &w3, &w4, -269 &a0, &a1, &a2, &b0, &b1, -270 &b2, &tmp1, &tmp2, NULL); -271 return res; -272 \} -273 -\end{alltt} -\end{small} - --- Comments to be added during editing phase. - -\subsection{Signed Multiplication} -Now that algorithms to handle multiplications of every useful dimensions have been developed, a rather simple finishing touch is required. So far all -of the multiplication algorithms have been unsigned multiplications which leaves only a signed multiplication algorithm to be established. - -\newpage\begin{figure}[!here] -\begin{small} -\begin{center} -\begin{tabular}{l} -\hline Algorithm \textbf{mp\_mul}. \\ -\textbf{Input}. mp\_int $a$ and mp\_int $b$ \\ -\textbf{Output}. $c \leftarrow a \cdot b$ \\ -\hline \\ -1. If $a.sign = b.sign$ then \\ -\hspace{3mm}1.1 $sign = MP\_ZPOS$ \\ -2. else \\ -\hspace{3mm}2.1 $sign = MP\_ZNEG$ \\ -3. If min$(a.used, b.used) \ge TOOM\_MUL\_CUTOFF$ then \\ -\hspace{3mm}3.1 $c \leftarrow a \cdot b$ using algorithm mp\_toom\_mul \\ -4. else if min$(a.used, b.used) \ge KARATSUBA\_MUL\_CUTOFF$ then \\ -\hspace{3mm}4.1 $c \leftarrow a \cdot b$ using algorithm mp\_karatsuba\_mul \\ -5. else \\ -\hspace{3mm}5.1 $digs \leftarrow a.used + b.used + 1$ \\ -\hspace{3mm}5.2 If $digs < MP\_ARRAY$ and min$(a.used, b.used) \le \delta$ then \\ -\hspace{6mm}5.2.1 $c \leftarrow a \cdot b \mbox{ (mod }\beta^{digs}\mbox{)}$ using algorithm fast\_s\_mp\_mul\_digs. \\ -\hspace{3mm}5.3 else \\ -\hspace{6mm}5.3.1 $c \leftarrow a \cdot b \mbox{ (mod }\beta^{digs}\mbox{)}$ using algorithm s\_mp\_mul\_digs. \\ -6. $c.sign \leftarrow sign$ \\ -7. Return the result of the unsigned multiplication performed. \\ -\hline -\end{tabular} -\end{center} -\end{small} -\caption{Algorithm mp\_mul} -\end{figure} - -\textbf{Algorithm mp\_mul.} -This algorithm performs the signed multiplication of two inputs. It will make use of any of the three unsigned multiplication algorithms -available when the input is of appropriate size. The \textbf{sign} of the result is not set until the end of the algorithm since algorithm -s\_mp\_mul\_digs will clear it. - -\index{bn\_mp\_mul.c} -\vspace{+3mm}\begin{small} -\hspace{-5.1mm}{\bf File}: bn\_mp\_mul.c -\vspace{-3mm} -\begin{alltt} -016 -017 /* high level multiplication (handles sign) */ -018 int -019 mp_mul (mp_int * a, mp_int * b, mp_int * c) -020 \{ -021 int res, neg; -022 neg = (a->sign == b->sign) ? MP_ZPOS : MP_NEG; -023 -024 if (MIN (a->used, b->used) >= TOOM_MUL_CUTOFF) \{ -025 res = mp_toom_mul(a, b, c); -026 \} else if (MIN (a->used, b->used) >= KARATSUBA_MUL_CUTOFF) \{ -027 res = mp_karatsuba_mul (a, b, c); -028 \} else \{ -029 -030 /* can we use the fast multiplier? -031 * -032 * The fast multiplier can be used if the output will -033 * have less than MP_WARRAY digits and the number of -034 * digits won't affect carry propagation -035 */ -036 int digs = a->used + b->used + 1; -037 -038 if ((digs < MP_WARRAY) && -039 MIN(a->used, b->used) <= -040 (1 << ((CHAR_BIT * sizeof (mp_word)) - (2 * DIGIT_BIT)))) \{ -041 res = fast_s_mp_mul_digs (a, b, c, digs); -042 \} else \{ -043 res = s_mp_mul (a, b, c); -044 \} -045 -046 \} -047 c->sign = neg; -048 return res; -049 \} -\end{alltt} -\end{small} - -The implementation is rather simplistic and is not particularly noteworthy. Line 22 computes the sign of the result using the ``?'' -operator from the C programming language. Line 40 computes $\delta$ using the fact that $1 << k$ is equal to $2^k$. - -\section{Squaring} - -Squaring is a special case of multiplication where both multiplicands are equal. At first it may seem like there is no significant optimization -available but in fact there is. Consider the multiplication of $576$ against $241$. In total there will be nine single precision multiplications -performed which are $1\cdot 6$, $1 \cdot 7$, $1 \cdot 5$, $4 \cdot 6$, $4 \cdot 7$, $4 \cdot 5$, $2 \cdot 6$, $2 \cdot 7$ and $2 \cdot 5$. Now consider -the multiplication of $123$ against $123$. The nine products are $3 \cdot 3$, $3 \cdot 2$, $3 \cdot 1$, $2 \cdot 3$, $2 \cdot 2$, $2 \cdot 1$, -$1 \cdot 3$, $1 \cdot 2$ and $1 \cdot 1$. On closer inspection some of the products are equivalent. For example, $3 \cdot 2 = 2 \cdot 3$ -and $3 \cdot 1 = 1 \cdot 3$. - -For any $n$-digit input, there are ${{\left (n^2 + n \right)}\over 2}$ possible unique single precision multiplications required compared to the $n^2$ -required for multiplication. The following diagram gives an example of the operations required. - -\begin{figure}[here] -\begin{center} -\begin{tabular}{ccccc|c} -&&1&2&3&\\ -$\times$ &&1&2&3&\\ -\hline && $3 \cdot 1$ & $3 \cdot 2$ & $3 \cdot 3$ & Row 0\\ - & $2 \cdot 1$ & $2 \cdot 2$ & $2 \cdot 3$ && Row 1 \\ - $1 \cdot 1$ & $1 \cdot 2$ & $1 \cdot 3$ &&& Row 2 \\ -\end{tabular} -\end{center} -\caption{Squaring Optimization Diagram} -\end{figure} - -Starting from zero and numbering the columns from right to left a very simple pattern becomes obvious. For the purposes of this discussion let $x$ -represent the number being squared. The first observation is that in row $k$ the $2k$'th column of the product has a $\left (x_k \right)^2$ term in it. - -The second observation is that every column $j$ in row $k$ where $j \ne 2k$ is part of a double product. Every non-square term of a column will -appear twice hence the name ``double product''. Every odd column is made up entirely of double products. In fact every column is made up of double -products and at most one square (\textit{see the exercise section}). - -The third and final observation is that for row $k$ the first unique non-square term, that is, one that hasn't already appeared in an earlier row, -occurs at column $2k + 1$. For example, on row $1$ of the previous squaring, column one is part of the double product with column one from row zero. -Column two of row one is a square and column three is the first unique column. - -\subsection{The Baseline Squaring Algorithm} -The baseline squaring algorithm is meant to be a catch-all squaring algorithm. It will handle any of the input sizes that the faster routines -will not handle. - -\newpage\begin{figure}[!here] -\begin{small} -\begin{center} -\begin{tabular}{l} -\hline Algorithm \textbf{s\_mp\_sqr}. \\ -\textbf{Input}. mp\_int $a$ \\ -\textbf{Output}. $b \leftarrow a^2$ \\ -\hline \\ -1. Init a temporary mp\_int of at least $2 \cdot a.used +1$ digits. (\textit{mp\_init\_size}) \\ -2. If step 1 failed return(\textit{MP\_MEM}) \\ -3. $t.used \leftarrow 2 \cdot a.used + 1$ \\ -4. For $ix$ from 0 to $a.used - 1$ do \\ -\hspace{3mm}Calculate the square. \\ -\hspace{3mm}4.1 $\hat r \leftarrow t_{2ix} + \left (a_{ix} \right )^2$ \\ -\hspace{3mm}4.2 $t_{2ix} \leftarrow \hat r \mbox{ (mod }\beta\mbox{)}$ \\ -\hspace{3mm}Calculate the double products after the square. \\ -\hspace{3mm}4.3 $u \leftarrow \lfloor \hat r / \beta \rfloor$ \\ -\hspace{3mm}4.4 For $iy$ from $ix + 1$ to $a.used - 1$ do \\ -\hspace{6mm}4.4.1 $\hat r \leftarrow 2 \cdot a_{ix}a_{iy} + t_{ix + iy} + u$ \\ -\hspace{6mm}4.4.2 $t_{ix + iy} \leftarrow \hat r \mbox{ (mod }\beta\mbox{)}$ \\ -\hspace{6mm}4.4.3 $u \leftarrow \lfloor \hat r / \beta \rfloor$ \\ -\hspace{3mm}Set the last carry. \\ -\hspace{3mm}4.5 While $u > 0$ do \\ -\hspace{6mm}4.5.1 $iy \leftarrow iy + 1$ \\ -\hspace{6mm}4.5.2 $\hat r \leftarrow t_{ix + iy} + u$ \\ -\hspace{6mm}4.5.3 $t_{ix + iy} \leftarrow \hat r \mbox{ (mod }\beta\mbox{)}$ \\ -\hspace{6mm}4.5.4 $u \leftarrow \lfloor \hat r / \beta \rfloor$ \\ -5. Clamp excess digits of $t$. (\textit{mp\_clamp}) \\ -6. Exchange $b$ and $t$. \\ -7. Clear $t$ (\textit{mp\_clear}) \\ -8. Return(\textit{MP\_OKAY}) \\ -\hline -\end{tabular} -\end{center} -\end{small} -\caption{Algorithm s\_mp\_sqr} -\end{figure} - -\textbf{Algorithm s\_mp\_sqr.} -This algorithm computes the square of an input using the three observations on squaring. It is based fairly faithfully on algorithm 14.16 of HAC -\cite[pp.596-597]{HAC}. Similar to algorithm s\_mp\_mul\_digs, a temporary mp\_int is allocated to hold the result of the squaring. This allows the -destination mp\_int to be the same as the source mp\_int. - -The outer loop of this algorithm begins on step 4. It is best to think of the outer loop as walking down the rows of the partial results, while -the inner loop computes the columns of the partial result. Step 4.1 and 4.2 compute the square term for each row, and step 4.3 and 4.4 propagate -the carry and compute the double products. - -The requirement that a mp\_word be able to represent the range $0 \le x < 2 \beta^2$ arises from this -very algorithm. The product $a_{ix}a_{iy}$ will lie in the range $0 \le x \le \beta^2 - 2\beta + 1$ which is obviously less than $\beta^2$ meaning that -when it is multiplied by two, it can be properly represented by a mp\_word. - -Similar to algorithm s\_mp\_mul\_digs, after every pass of the inner loop, the destination is correctly set to the sum of all of the partial -results calculated so far. This involves expensive carry propagation which will be eliminated in the next algorithm. - -\index{bn\_s\_mp\_sqr.c} -\vspace{+3mm}\begin{small} -\hspace{-5.1mm}{\bf File}: bn\_s\_mp\_sqr.c -\vspace{-3mm} -\begin{alltt} -016 -017 /* low level squaring, b = a*a, HAC pp.596-597, Algorithm 14.16 */ -018 int -019 s_mp_sqr (mp_int * a, mp_int * b) -020 \{ -021 mp_int t; -022 int res, ix, iy, pa; -023 mp_word r; -024 mp_digit u, tmpx, *tmpt; -025 -026 pa = a->used; -027 if ((res = mp_init_size (&t, 2*pa + 1)) != MP_OKAY) \{ -028 return res; -029 \} -030 t.used = 2*pa + 1; -031 -032 for (ix = 0; ix < pa; ix++) \{ -033 /* first calculate the digit at 2*ix */ -034 /* calculate double precision result */ -035 r = ((mp_word) t.dp[2*ix]) + -036 ((mp_word) a->dp[ix]) * ((mp_word) a->dp[ix]); -037 -038 /* store lower part in result */ -039 t.dp[2*ix] = (mp_digit) (r & ((mp_word) MP_MASK)); -040 -041 /* get the carry */ -042 u = (r >> ((mp_word) DIGIT_BIT)); -043 -044 /* left hand side of A[ix] * A[iy] */ -045 tmpx = a->dp[ix]; -046 -047 /* alias for where to store the results */ -048 tmpt = t.dp + (2*ix + 1); -049 -050 for (iy = ix + 1; iy < pa; iy++) \{ -051 /* first calculate the product */ -052 r = ((mp_word) tmpx) * ((mp_word) a->dp[iy]); -053 -054 /* now calculate the double precision result, note we use -055 * addition instead of *2 since it's easier to optimize -056 */ -057 r = ((mp_word) * tmpt) + r + r + ((mp_word) u); -058 -059 /* store lower part */ -060 *tmpt++ = (mp_digit) (r & ((mp_word) MP_MASK)); -061 -062 /* get carry */ -063 u = (r >> ((mp_word) DIGIT_BIT)); -064 \} -065 /* propagate upwards */ -066 while (u != ((mp_digit) 0)) \{ -067 r = ((mp_word) * tmpt) + ((mp_word) u); -068 *tmpt++ = (mp_digit) (r & ((mp_word) MP_MASK)); -069 u = (r >> ((mp_word) DIGIT_BIT)); -070 \} -071 \} -072 -073 mp_clamp (&t); -074 mp_exch (&t, b); -075 mp_clear (&t); -076 return MP_OKAY; -077 \} -\end{alltt} -\end{small} - -Inside the outer loop (\textit{see line 32}) the square term is calculated on line 35. Line 42 extracts the carry from the square -term. Aliases for $a_{ix}$ and $t_{ix+iy}$ are initialized on lines 45 and 48 respectively. The doubling is performed using two -additions (\textit{see line 57}) since it is usually faster than shifting,if not at least as fast. - -\subsection{Faster Squaring by the ``Comba'' Method} -A major drawback to the baseline method is the requirement for single precision shifting inside the $O(n^2)$ nested loop. Squaring has an additional -drawback that it must double the product inside the inner loop as well. As for multiplication, the Comba technique can be used to eliminate these -performance hazards. - -The first obvious solution is to make an array of mp\_words which will hold all of the columns. This will indeed eliminate all of the carry -propagation operations from the inner loop. However, the inner product must still be doubled $O(n^2)$ times. The solution stems from the simple fact -that $2a + 2b + 2c = 2(a + b + c)$. That is the sum of all of the double products is equal to double the sum of all the products. For example, -$ab + ba + ac + ca = 2ab + 2ac = 2(ab + ac)$. - -However, we cannot simply double all of the columns, since the squares appear only once per row. The most practical solution is to have two mp\_word -arrays. One array will hold the squares and the other array will hold the double products. With both arrays the doubling and carry propagation can be -moved to a $O(n)$ work level outside the $O(n^2)$ level. - -\newpage\begin{figure}[!here] -\begin{small} -\begin{center} -\begin{tabular}{l} -\hline Algorithm \textbf{fast\_s\_mp\_sqr}. \\ -\textbf{Input}. mp\_int $a$ \\ -\textbf{Output}. $b \leftarrow a^2$ \\ -\hline \\ -Place two arrays of \textbf{MP\_WARRAY} mp\_words named $\hat W$ and $\hat {X}$ on the stack. \\ -1. If $b.alloc < 2a.used + 1$ then grow $b$ to $2a.used + 1$ digits. (\textit{mp\_grow}). \\ -2. If step 1 failed return(\textit{MP\_MEM}). \\ -3. for $ix$ from $0$ to $2a.used + 1$ do \\ -\hspace{3mm}3.1 $\hat W_{ix} \leftarrow 0$ \\ -\hspace{3mm}3.2 $\hat {X}_{ix} \leftarrow 0$ \\ -4. for $ix$ from $0$ to $a.used - 1$ do \\ -\hspace{3mm}Compute the square.\\ -\hspace{3mm}4.1 $\hat {X}_{ix+ix} \leftarrow \left ( a_ix \right )^2$ \\ -\\ -\hspace{3mm}Compute the double products.\\ -\hspace{3mm}4.2 for $iy$ from $ix + 1$ to $a.used - 1$ do \\ -\hspace{6mm}4.2.1 $\hat W_{ix+iy} \leftarrow \hat W_{ix+iy} + a_{ix}a_{iy}$ \\ -5. $oldused \leftarrow b.used$ \\ -6. $b.used \leftarrow 2a.used + 1$ \\ -\\ -Double the products and propagate the carries simultaneously. \\ -7. $\hat W_0 \leftarrow 2 \hat W_0 + \hat {X}_0$ \\ -8. for $ix$ from $1$ to $2a.used$ do \\ -\hspace{3mm}8.1 $\hat W_{ix} \leftarrow 2 \hat W_{ix} + \hat {X}_{ix}$ \\ -\hspace{3mm}8.2 $\hat W_{ix} \leftarrow \hat W_{ix} + \lfloor \hat W_{ix - 1} / \beta \rfloor$ \\ -\hspace{3mm}8.3 $b_{ix-1} \leftarrow W_{ix-1} \mbox{ (mod }\beta\mbox{)}$ \\ -9. $b_{2a.used} \leftarrow \hat W_{2a.used} \mbox{ (mod }\beta\mbox{)}$ \\ -10. if $2a.used + 1 < oldused$ then do \\ -\hspace{3mm}10.1 for $ix$ from $2a.used + 1$ to $oldused$ do \\ -\hspace{6mm}10.1.1 $b_{ix} \leftarrow 0$ \\ -11. Clamp excess digits from $b$. (\textit{mp\_clamp}) \\ -12. Return(\textit{MP\_OKAY}). \\ -\hline -\end{tabular} -\end{center} -\end{small} -\caption{Algorithm fast\_s\_mp\_sqr} -\end{figure} - -\textbf{Algorithm fast\_s\_mp\_sqr.} -This algorithm computes the square of an input using the Comba technique. It is designed to be a replacement for algorithm s\_mp\_sqr when -the number of input digits is less than \textbf{MP\_WARRAY} and less than $\delta \over 2$. - -This routine requires two arrays of mp\_words to be placed on the stack. The first array $\hat W$ will hold the double products and the second -array $\hat X$ will hold the squares. Though only at most $MP\_WARRAY \over 2$ words of $\hat X$ are used, it has proven faster on most -processors to simply make it a full size array. - -The loop on step 3 will zero the two arrays to prepare them for the squaring step. Step 4.1 computes the squares of the product. Note how -it simply assigns the value into the $\hat X$ array. The nested loop on step 4.2 computes the doubles of the products. This loop -computes the sum of the products for each column. They are not doubled until later. - -After the squaring loop, the products stored in $\hat W$ musted be doubled and the carries propagated forwards. It makes sense to do both -operations at the same time. The expression $\hat W_{ix} \leftarrow 2 \hat W_{ix} + \hat {X}_{ix}$ computes the sum of the double product and the -squares in place. - -\index{bn\_fast\_s\_mp\_sqr.c} -\vspace{+3mm}\begin{small} -\hspace{-5.1mm}{\bf File}: bn\_fast\_s\_mp\_sqr.c -\vspace{-3mm} -\begin{alltt} -016 -017 /* fast squaring -018 * -019 * This is the comba method where the columns of the product -020 * are computed first then the carries are computed. This -021 * has the effect of making a very simple inner loop that -022 * is executed the most -023 * -024 * W2 represents the outer products and W the inner. -025 * -026 * A further optimizations is made because the inner -027 * products are of the form "A * B * 2". The *2 part does -028 * not need to be computed until the end which is good -029 * because 64-bit shifts are slow! -030 * -031 * Based on Algorithm 14.16 on pp.597 of HAC. -032 * -033 */ -034 int -035 fast_s_mp_sqr (mp_int * a, mp_int * b) -036 \{ -037 int olduse, newused, res, ix, pa; -038 mp_word W2[MP_WARRAY], W[MP_WARRAY]; -039 -040 /* calculate size of product and allocate as required */ -041 pa = a->used; -042 newused = pa + pa + 1; -043 if (b->alloc < newused) \{ -044 if ((res = mp_grow (b, newused)) != MP_OKAY) \{ -045 return res; -046 \} -047 \} -048 -049 /* zero temp buffer (columns) -050 * Note that there are two buffers. Since squaring requires -051 * a outter and inner product and the inner product requires -052 * computing a product and doubling it (a relatively expensive -053 * op to perform n**2 times if you don't have to) the inner and -054 * outer products are computed in different buffers. This way -055 * the inner product can be doubled using n doublings instead of -056 * n**2 -057 */ -058 memset (W, 0, newused * sizeof (mp_word)); -059 memset (W2, 0, newused * sizeof (mp_word)); -060 -061 /* This computes the inner product. To simplify the inner N**2 loop -062 * the multiplication by two is done afterwards in the N loop. -063 */ -064 for (ix = 0; ix < pa; ix++) \{ -065 /* compute the outer product -066 * -067 * Note that every outer product is computed -068 * for a particular column only once which means that -069 * there is no need todo a double precision addition -070 */ -071 W2[ix + ix] = ((mp_word) a->dp[ix]) * ((mp_word) a->dp[ix]); -072 -073 \{ -074 register mp_digit tmpx, *tmpy; -075 register mp_word *_W; -076 register int iy; -077 -078 /* copy of left side */ -079 tmpx = a->dp[ix]; -080 -081 /* alias for right side */ -082 tmpy = a->dp + (ix + 1); -083 -084 /* the column to store the result in */ -085 _W = W + (ix + ix + 1); -086 -087 /* inner products */ -088 for (iy = ix + 1; iy < pa; iy++) \{ -089 *_W++ += ((mp_word) tmpx) * ((mp_word) * tmpy++); -090 \} -091 \} -092 \} -093 -094 /* setup dest */ -095 olduse = b->used; -096 b->used = newused; -097 -098 /* now compute digits */ -099 \{ -100 register mp_digit *tmpb; -101 -102 /* double first value, since the inner products are -103 * half of what they should be -104 */ -105 W[0] += W[0] + W2[0]; -106 -107 tmpb = b->dp; -108 for (ix = 1; ix < newused; ix++) \{ -109 /* double/add next digit */ -110 W[ix] += W[ix] + W2[ix]; -111 -112 W[ix] = W[ix] + (W[ix - 1] >> ((mp_word) DIGIT_BIT)); -113 *tmpb++ = (mp_digit) (W[ix - 1] & ((mp_word) MP_MASK)); -114 \} -115 /* set the last value. Note even if the carry is zero -116 * this is required since the next step will not zero -117 * it if b originally had a value at b->dp[2*a.used] -118 */ -119 *tmpb++ = (mp_digit) (W[(newused) - 1] & ((mp_word) MP_MASK)); -120 -121 /* clear high digits */ -122 for (; ix < olduse; ix++) \{ -123 *tmpb++ = 0; -124 \} -125 \} -126 -127 mp_clamp (b); -128 return MP_OKAY; -129 \} -\end{alltt} -\end{small} - --- Write something deep and insightful later, Tom. - -\subsection{Polynomial Basis Squaring} -The same algorithm that performs optimal polynomial basis multiplication can be used to perform polynomial basis squaring. The minor exception -is that $\zeta_y = f(y)g(y)$ is actually equivalent to $\zeta_y = f(y)^2$ since $f(y) = g(y)$. Instead of performing $2n + 1$ -multiplications to find the $\zeta$ relations, squaring operations are performed instead. - -\subsection{Karatsuba Squaring} -Let $f(x) = ax + b$ represent the polynomial basis representation of a number to square. -Let $h(x) = \left ( f(x) \right )^2$ represent the square of the polynomial. The Karatsuba equation can be modified to square a -number with the following equation. - -\begin{equation} -h(x) = a^2x^2 + \left (a^2 + b^2 - (a - b)^2 \right )x + b^2 -\end{equation} - -Upon closer inspection this equation only requires the calculation of three half-sized squares: $a^2$, $b^2$ and $(a - b)^2$. As in -Karatsuba multiplication, this algorithm can be applied recursively on the input and will achieve an asymptotic running time of -$O \left ( n^{lg(3)} \right )$. - -You might ask yourself, if the asymptotic time of Karatsuba squaring and multiplication is the same, why not simply use the multiplication algorithm -instead? The answer to this arises from the cutoff point for squaring. As in multiplication there exists a cutoff point, at which the -time required for a Comba based squaring and a Karatsuba based squaring meet. Due to the overhead inherent in the Karatsuba method, the cutoff -point is fairly high. For example, on an AMD Athlon XP processor with $\beta = 2^{28}$, the cutoff point is around 127 digits. - -Consider squaring a 200 digit number with this technique. It will be split into two 100 digit halves which are subsequently squared. -The 100 digit halves will not be squared using Karatsuba, but instead using the faster Comba based squaring algorithm. If Karatsuba multiplication -were used instead, the 100 digit numbers would be squared with a slower Comba based multiplication. - -\newpage\begin{figure}[!here] -\begin{small} -\begin{center} -\begin{tabular}{l} -\hline Algorithm \textbf{mp\_karatsuba\_sqr}. \\ -\textbf{Input}. mp\_int $a$ \\ -\textbf{Output}. $b \leftarrow a^2$ \\ -\hline \\ -1. Initialize the following temporary mp\_ints: $x0$, $x1$, $t1$, $t2$, $x0x0$ and $x1x1$. \\ -2. If any of the initializations on step 1 failed return(\textit{MP\_MEM}). \\ -\\ -Split the input. e.g. $a = x1\beta^B + x0$ \\ -3. $B \leftarrow \lfloor a.used / 2 \rfloor$ \\ -4. $x0 \leftarrow a \mbox{ (mod }\beta^B\mbox{)}$ (\textit{mp\_mod\_2d}) \\ -5. $x1 \leftarrow \lfloor a / \beta^B \rfloor$ (\textit{mp\_lshd}) \\ -\\ -Calculate the three squares. \\ -6. $x0x0 \leftarrow x0^2$ (\textit{mp\_sqr}) \\ -7. $x1x1 \leftarrow x1^2$ \\ -8. $t1 \leftarrow x1 - x0$ (\textit{mp\_sub}) \\ -9. $t1 \leftarrow t1^2$ \\ -\\ -Compute the middle term. \\ -10. $t2 \leftarrow x0x0 + x1x1$ (\textit{s\_mp\_add}) \\ -11. $t1 \leftarrow t2 - t1$ \\ -\\ -Compute final product. \\ -12. $t1 \leftarrow t1\beta^B$ (\textit{mp\_lshd}) \\ -13. $x1x1 \leftarrow x1x1\beta^{2B}$ \\ -14. $t1 \leftarrow t1 + x0x0$ \\ -15. $b \leftarrow t1 + x1x1$ \\ -16. Return(\textit{MP\_OKAY}). \\ -\hline -\end{tabular} -\end{center} -\end{small} -\caption{Algorithm mp\_karatsuba\_sqr} -\end{figure} - -\textbf{Algorithm mp\_karatsuba\_sqr.} -This algorithm computes the square of an input $a$ using the Karatsuba technique. This algorithm is very similar to the Karatsuba based -multiplication algorithm with the exception that the three half-size multiplications have been replaced with three half-size squarings. - -The radix point for squaring is simply placed exactly in the middle of the digits when the input has an odd number of digits, otherwise it is -placed just below the middle. Step 3, 4 and 5 compute the two halves required using $B$ -as the radix point. The first two squares in steps 6 and 7 are rather straightforward while the last square is of a more compact form. - -By expanding $\left (x1 - x0 \right )^2$, the $x1^2$ and $x0^2$ terms in the middle disappear, that is $x1^2 + x0^2 - (x1 - x0)^2 = 2 \cdot x0 \cdot x1$. -Now if $5n$ single precision additions and a squaring of $n$-digits is faster than multiplying two $n$-digit numbers and doubling then -this method is faster. Assuming no further recursions occur, the difference can be estimated with the following inequality. - -Let $p$ represent the cost of a single precision addition and $q$ the cost of a single precision multiplication both in terms of time\footnote{Or -machine clock cycles.}. - -\begin{equation} -5pn +{{q(n^2 + n)} \over 2} \le pn + qn^2 -\end{equation} - -For example, on an AMD Athlon XP processor $p = {1 \over 3}$ and $q = 6$. This implies that the following inequality should hold. -\begin{center} -\begin{tabular}{rcl} -${5n \over 3} + 3n^2 + 3n$ & $<$ & ${n \over 3} + 6n^2$ \\ -${5 \over 3} + 3n + 3$ & $<$ & ${1 \over 3} + 6n$ \\ -${13 \over 9}$ & $<$ & $n$ \\ -\end{tabular} -\end{center} - -This results in a cutoff point around $n = 2$. As a consequence it is actually faster to compute the middle term the ``long way'' on processors -where multiplication is substantially slower\footnote{On the Athlon there is a 1:17 ratio between clock cycles for addition and multiplication. On -the Intel P4 processor this ratio is 1:29 making this method even more beneficial. The only common exception is the ARMv4 processor which has a -ratio of 1:7. } than simpler operations such as addition. - -\index{bn\_mp\_karatsuba\_sqr.c} -\vspace{+3mm}\begin{small} -\hspace{-5.1mm}{\bf File}: bn\_mp\_karatsuba\_sqr.c -\vspace{-3mm} -\begin{alltt} -016 -017 /* Karatsuba squaring, computes b = a*a using three -018 * half size squarings -019 * -020 * See comments of mp_karatsuba_mul for details. It -021 * is essentially the same algorithm but merely -022 * tuned to perform recursive squarings. -023 */ -024 int -025 mp_karatsuba_sqr (mp_int * a, mp_int * b) -026 \{ -027 mp_int x0, x1, t1, t2, x0x0, x1x1; -028 int B, err; -029 -030 err = MP_MEM; -031 -032 /* min # of digits */ -033 B = a->used; -034 -035 /* now divide in two */ -036 B = B / 2; -037 -038 /* init copy all the temps */ -039 if (mp_init_size (&x0, B) != MP_OKAY) -040 goto ERR; -041 if (mp_init_size (&x1, a->used - B) != MP_OKAY) -042 goto X0; -043 -044 /* init temps */ -045 if (mp_init_size (&t1, a->used * 2) != MP_OKAY) -046 goto X1; -047 if (mp_init_size (&t2, a->used * 2) != MP_OKAY) -048 goto T1; -049 if (mp_init_size (&x0x0, B * 2) != MP_OKAY) -050 goto T2; -051 if (mp_init_size (&x1x1, (a->used - B) * 2) != MP_OKAY) -052 goto X0X0; -053 -054 \{ -055 register int x; -056 register mp_digit *dst, *src; -057 -058 src = a->dp; -059 -060 /* now shift the digits */ -061 dst = x0.dp; -062 for (x = 0; x < B; x++) \{ -063 *dst++ = *src++; -064 \} -065 -066 dst = x1.dp; -067 for (x = B; x < a->used; x++) \{ -068 *dst++ = *src++; -069 \} -070 \} -071 -072 x0.used = B; -073 x1.used = a->used - B; -074 -075 mp_clamp (&x0); -076 -077 /* now calc the products x0*x0 and x1*x1 */ -078 if (mp_sqr (&x0, &x0x0) != MP_OKAY) -079 goto X1X1; /* x0x0 = x0*x0 */ -080 if (mp_sqr (&x1, &x1x1) != MP_OKAY) -081 goto X1X1; /* x1x1 = x1*x1 */ -082 -083 /* now calc (x1-x0)**2 */ -084 if (mp_sub (&x1, &x0, &t1) != MP_OKAY) -085 goto X1X1; /* t1 = x1 - x0 */ -086 if (mp_sqr (&t1, &t1) != MP_OKAY) -087 goto X1X1; /* t1 = (x1 - x0) * (x1 - x0) */ -088 -089 /* add x0y0 */ -090 if (s_mp_add (&x0x0, &x1x1, &t2) != MP_OKAY) -091 goto X1X1; /* t2 = x0x0 + x1x1 */ -092 if (mp_sub (&t2, &t1, &t1) != MP_OKAY) -093 goto X1X1; /* t1 = x0x0 + x1x1 - (x1-x0)*(x1-x0) */ -094 -095 /* shift by B */ -096 if (mp_lshd (&t1, B) != MP_OKAY) -097 goto X1X1; /* t1 = (x0x0 + x1x1 - (x1-x0)*(x1-x0))<used >= TOOM_SQR_CUTOFF) \{ -023 res = mp_toom_sqr(a, b); -024 \} else if (a->used >= KARATSUBA_SQR_CUTOFF) \{ -025 res = mp_karatsuba_sqr (a, b); -026 \} else \{ -027 -028 /* can we use the fast multiplier? */ -029 if ((a->used * 2 + 1) < MP_WARRAY && -030 a->used < -031 (1 << (sizeof(mp_word) * CHAR_BIT - 2*DIGIT_BIT - 1))) \{ -032 res = fast_s_mp_sqr (a, b); -033 \} else \{ -034 res = s_mp_sqr (a, b); -035 \} -036 \} -037 b->sign = MP_ZPOS; -038 return res; -039 \} -\end{alltt} -\end{small} - -\section*{Exercises} -\begin{tabular}{cl} -$\left [ 3 \right ] $ & Devise an efficient algorithm for selection of the radix point to handle inputs \\ - & that have different number of digits in Karatsuba multiplication. \\ - & \\ -$\left [ 3 \right ] $ & In section 6.3 the fact that every column of a squaring is made up \\ - & of double products and at most one square is stated. Prove this statement. \\ - & \\ -$\left [ 2 \right ] $ & In the Comba squaring algorithm half of the $\hat X$ variables are not used. \\ - & Revise algorithm fast\_s\_mp\_sqr to shrink the $\hat X$ array. \\ - & \\ -$\left [ 3 \right ] $ & Prove the equation for Karatsuba squaring. \\ - & \\ -$\left [ 1 \right ] $ & Prove that Karatsuba squaring requires $O \left (n^{lg(3)} \right )$ time. \\ - & \\ -$\left [ 2 \right ] $ & Determine the minimal ratio between addition and multiplication clock cycles \\ - & required for equation $6.7$ to be true. \\ - & \\ -\end{tabular} - -\chapter{Modular Reduction} -\section{Basics of Modular Reduction} -\index{modular residue} -Modular reduction is an operation that arises quite often within public key cryptography algorithms and various number theoretic algorithms, -such as factoring. Modular reduction algorithms are the third class of algorithms of the ``multipliers'' set. A number $a$ is said to be reduced -modulo another number $b$ by finding the remainder of the division $a/b$. - -Modular reduction is equivalent to solving for $r$ in the following equation. $a = bq + r$ where $q = \lfloor a/b \rfloor$. The result -$r$ is said to be ``congruent to $a$ modulo $b$'' which is also written as $r \equiv a \mbox{ (mod }b\mbox{)}$. In other vernacular $r$ is known as the -``modular residue'' which leads to ``quadratic residue''\footnote{That's fancy talk for $b \equiv a^2 \mbox{ (mod }p\mbox{)}$.} and -other forms of residues. - -\index{modulus} -Modular reductions are normally used to form finite groups such as fields and rings. For example, in the RSA public key algorithm \cite{RSAPAPER} -two private primes $p$ and $q$ are chosen which when multiplied $n = pq$ forms a composite modulus. When operations such as multiplication and -squaring are performed on units of the ring $\Z_n$ a finite multiplicative sub-group is formed. - -Modular reductions have a variety of other useful properties. For example, a number $x$ is a square if and only if it is a quadratic -residue modulo a prime. With a finite set of primes $B = \left < p_0, p_1, \ldots, p_n \right >$ a quick test for whether $x$ is square or not can -be performed\footnote{Provided none of the primes from $B$ divide $x$.}. Consider the figure~\ref{fig:QR} with the candiate $x = 955621$ a simple -set of modular reductions modulo $3, 5, \ldots, 11$ may detect whether $x$ is a square or not. In this case $955621 \equiv 7 \mbox{ (mod }11\mbox{)}$ -and since $7$ is not a quadratic residue modulo $11$ the number $955621$ is not a square. - -\begin{figure} -\begin{center} -\begin{tabular}{|c|l|} -\hline \textbf{Prime} & \textbf{Quadratic Residues} \\ -\hline $3$ & $1$ \\ -\hline $5$ & $1, 4$ \\ -\hline $7$ & $1, 2, 4$ \\ -\hline $11$ & $1, 3, 4, 5, 9$ \\ -\hline -\end{tabular} -\end{center} -\caption{Quadratic Residues for primes less than $13$} -\label{fig:QR} -\end{figure} - -The most common usage for performance driven modular reductions is in modular exponentiation algorithms. That is to compute -$d = a^b \mbox{ (mod }c\mbox{)}$ as fast as possible. As will be discussed in the subsequent chapter there exists fast algorithms for computing -modular exponentiations without having to perform (\textit{in this example}) $b - 1$ multiplications. These algorithms will produce partial -results in the range $0 \le x < c^2$ which can be taken advantage of to create several efficient algorithms. - -\section{The Barrett Reduction} -The Barrett reduction algorithm \cite{BARRETT} was inspired by fast division algorithms which multiply by the reciprocal to emulate -division. Barretts observation was that the residue $c$ of $a$ modulo $b$ is equal to - -\begin{equation} -c = a - b \cdot \lfloor a/b \rfloor -\end{equation} - -Since algorithms such as modular exponentiation would be using the same modulus extensively, typical DSP intuition would indicate the next step -would be to replace $a/b$ by a multiplication by the reciprocal. However, DSP intuition on its own will not work as these numbers are considerably -larger than the precision of common DSP floating point data types. It would take another common optimization to optimize the algorithm. - -\subsection{Fixed Point Arithmetic} -The trick used to optimize the above equation is based on a technique of emulating floating point data types with fixed precision integers. Fixed -point arithmetic would vastly popularlize the ``3d-shooter'' genre of games in the mid 1990s when floating point units were fairly slow. The idea behind -fixed point arithmetic is to take a normal $k$-bit integer data type and break it into $p$-bit integer and a $q$-bit fraction part -(\textit{where $p+q = k$}). - -In this system a $k$-bit integer $n$ would actually represent $n/2^q$. For example, with $q = 4$ the integer $n = 37$ would actually represent the -value $2.3125$. To multiply two fixed point numbers the integers are multiplied using traditional arithmetic and subsequently normalized. For example, -with $q = 4$ to multiply the integers $9$ and $5$ they must be converted to fixed point first by multiplying by $2^q$. Let $a = 9(2^q)$ -represent the fixed point representation of $9$ and $b = 5(2^q)$ represent the fixed point representation of $5$. The product $ab$ is equal to -$45(2^{2q})$ which when normalized produces $45(2^q)$. - -Using fixed point arithmetic division can be easily achieved by multiplying by the reciprocal. If $2^q$ is equivalent to one than $2^q/b$ is -equivalent to $1/b$ using real arithmetic. Using this fact dividing an integer $a$ by another integer $b$ can be achieved with the following -expression. - -\begin{equation} -\lfloor (a \cdot (\lfloor 2^q / b \rfloor))/2^q \rfloor -\end{equation} - -The precision of the division is proportional to the value of $q$. If the divisor $b$ is used frequently as is the case with -modular exponentiation pre-computing $2^q/b$ will allow a division to be performed with a multiplication and a right shift. Both operations -are considerably faster than division on most processors. - -Consider dividing $19$ by $5$. The correct result is $\lfloor 19/5 \rfloor = 3$. With $q = 3$ the reciprocal is $\lfloor 2^q/5 \rfloor = 1$ which -leads to a product of $19$ which when divided by $2^q$ produces $2$. However, with $q = 4$ the reciprocal is $\lfloor 2^q/5 \rfloor = 3$ and -the result of the emulated division is $\lfloor 3 \cdot 19 / 2^q \rfloor = 3$ which is correct. - -Plugging this form of divison into the original equation the following modular residue equation arises. - -\begin{equation} -c = a - b \cdot \lfloor (a \cdot (\lfloor 2^q / b \rfloor))/2^q \rfloor -\end{equation} - -Using the notation from \cite{BARRETT} the value of $\lfloor 2^q / b \rfloor$ will be represented by the $\mu$ symbol. Using the $\mu$ -variable also helps re-inforce the idea that it is meant to be computed once and re-used. - -\begin{equation} -c = a - b \cdot \lfloor (a \cdot \mu)/2^q \rfloor -\end{equation} - -Provided that $2^q > b^2$ this algorithm will produce a quotient that is either exactly correct or off by a value of one. Let $n$ represent -the number of digits in $b$. This algorithm requires approximately $2n^2$ single precision multiplications to produce the quotient and -another $n^2$ single precision multiplications to find the residue. In total $3n^2$ single precision multiplications are required to -reduce the number. - -For example, if $b = 1179677$ and $q = 41$ ($2^q > b^2$), then the reciprocal $\mu$ is equal to $\lfloor 2^q / b \rfloor = 1864089$. Consider reducing -$a = 180388626447$ modulo $b$ using the above reduction equation. The quotient using the new formula is $\lfloor (a \cdot \mu) / 2^q \rfloor = 152913$. -By subtracting $152913b$ from $a$ the correct residue $a \equiv 677346 \mbox{ (mod }b\mbox{)}$ is found. - -\subsection{Choosing a Radix Point} -Using the fixed point representation a modular reduction can be performed with $3n^2$ single precision multiplications. If that were the best -that could be achieved a full division might as well be used in its place. The key to optimizing the reduction is to reduce the precision of -the initial multiplication that finds the quotient. - -Let $a$ represent the number of which the residue is sought. Let $b$ represent the modulus used to find the residue. Let $m$ represent -the number of digits in $b$. For the purposes of this discussion we will assume that the number of digits in $a$ is $2m$. Dividing $a$ by -$b$ is the same as dividing a $2m$ digit integer by a $m$ digit integer. Digits below the $m - 1$'th digit of $a$ will contribute at most a value -of $1$ to the quotient because $\beta^k < b$ for any $0 \le k \le m - 1$. - -Since those digits do not contribute much to the quotient the observation is that they might as well be zero. However, if the digits -``might as well be zero'' they might as well not be there in the first place. Let $q_0 = \lfloor a/\beta^{m-1} \rfloor$ represent the input -with the zeroes trimmed. Now the modular reduction is trimmed to the almost equivalent equation - -\begin{equation} -c = a - b \cdot \lfloor (q_0 \cdot \mu) / \beta^{m+1} \rfloor -\end{equation} - -Note that the original divisor $2^q$ has been replaced with $\beta^{m+1}$. Also note that the exponent on the divisor when added to the amount $q_0$ -was shifted by equals $2m$. If the optimization had not been performed the divisor would have the exponent $2m$ so in the end the exponents -do ``add up''. Using the above equation the quotient $\lfloor (q_0 \cdot \mu) / \beta^{m+1} \rfloor$ can be off from the true quotient by at most -two implying that $0 \le a - b \cdot \lfloor (q_0 \cdot \mu) / \beta^{m+1} \rfloor < 3b$. By first subtracting $b$ times the quotient and then -conditionally subtracting $b$ once or twice the residue is found. - -The quotient is now found using $(m + 1)(m) = m^2 + m$ single precision multiplications and the residue with an additional $m^2$ single -precision multiplications. In total $2m^2 + m$ single precision multiplications are required which is considerably faster than the original -attempt. - -For example, let $\beta = 10$ represent the radix of the digits. Let $b = 9999$ represent the modulus which implies $m = 4$. Let $a = 99929878$ -represent the value of which the residue is desired. In this case $q = 8$ since $10^7 < 9999^2$ meaning that $\mu = \lfloor \beta^{q}/b \rfloor = 10001$. -With the new observation the multiplicand for the quotient is equal to $q_0 = \lfloor a / \beta^{m - 1} \rfloor = 99929$. The quotient is then -$\lfloor (q_0 \cdot \mu) / \beta^{m+1} \rfloor = 9993$. Subtracting $9993b$ from $a$ and the correct residue $a \equiv 9871 \mbox{ (mod }b\mbox{)}$ -is found. - -\subsection{Trimming the Quotient} -So far the reduction algorithm has been optimized from $3m^2$ single precision multiplications down to $2m^2 + m$ single precision multiplications. As -it stands now the algorithm is already fairly fast compared to a full integer division algorithm. However, there is still room for -optimization. - -After the first multiplication inside the quotient ($q_0 \cdot \mu$) the value is shifted right by $m + 1$ places effectively nullifying the lower -half of the product. It would be nice to be able to remove those digits from the product to effectively cut down the number of single precision -multiplications. If the number of digits in the modulus $m$ is far less than $\beta$ a full product is not required for the algorithm to work properly. -In fact the lower $m - 2$ digits will not affect the upper half of the product at all and do not need to be computed. - -The value of $\mu$ is a $m$-digit number and $q_0$ is a $m + 1$ digit number. Using a full multiplier $(m + 1)(m) = m^2 + m$ single precision -multiplications would be required. Using a multiplier that will only produce digits at and above the $m - 1$'th digit reduces the number -of single precision multiplications to ${m^2 + m} \over 2$ single precision multiplications. - -\subsection{Trimming the Residue} -After the quotient has been calculated it is used to reduce the input. As previously noted the algorithm is not exact and it can be off by a small -multiple of the modulus, that is $0 \le a - b \cdot \lfloor (q_0 \cdot \mu) / \beta^{m+1} \rfloor < 3b$. If $b$ is $m$ digits than the -result of reduction equation is a value of at most $m + 1$ digits (\textit{provided $3 < \beta$}) implying that the upper $m - 1$ digits are -implicitly zero. - -The next optimization arises from this very fact. Instead of computing $b \cdot \lfloor (q_0 \cdot \mu) / \beta^{m+1} \rfloor$ using a full -$O(m^2)$ multiplication algorithm only the lower $m+1$ digits of the product have to be computed. Similarly the value of $a$ can -be reduced modulo $\beta^{m+1}$ before the multiple of $b$ is subtracted which simplifes the subtraction as well. A multiplication that produces -only the lower $m+1$ digits requires ${m^2 + 3m - 2} \over 2$ single precision multiplications. - -With both optimizations in place the algorithm is the algorithm Barrett proposed. It requires $m^2 + 2m - 1$ single precision multiplications which -is considerably faster than the straightforward $3m^2$ method. - -\subsection{The Barrett Algorithm} -\newpage\begin{figure}[!here] -\begin{small} -\begin{center} -\begin{tabular}{l} -\hline Algorithm \textbf{mp\_reduce}. \\ -\textbf{Input}. mp\_int $a$, mp\_int $b$ and $\mu = \lfloor \beta^{2m}/b \rfloor$ $(0 \le a < b^2, b > 1)$ \\ -\textbf{Output}. $c \leftarrow a \mbox{ (mod }b\mbox{)}$ \\ -\hline \\ -Let $m$ represent the number of digits in $b$. \\ -1. Make a copy of $a$ and store it in $q$. (\textit{mp\_init\_copy}) \\ -2. $q \leftarrow \lfloor q / \beta^{m - 1} \rfloor$ (\textit{mp\_rshd}) \\ -\\ -Produce the quotient. \\ -3. $q \leftarrow q \cdot \mu$ (\textit{note: only produce digits at or above $m-1$}) \\ -4. $q \leftarrow \lfloor q / \beta^{m + 1} \rfloor$ \\ -\\ -Subtract the multiple of modulus from the input. \\ -5. $c \leftarrow a \mbox{ (mod }\beta^{m+1}\mbox{)}$ (\textit{mp\_mod\_2d}) \\ -6. $q \leftarrow q \cdot b \mbox{ (mod }\beta^{m+1}\mbox{)}$ (\textit{s\_mp\_mul\_digs}) \\ -7. $c \leftarrow c - q$ (\textit{mp\_sub}) \\ -\\ -Add $\beta^{m+1}$ if a carry occured. \\ -8. If $c < 0$ then (\textit{mp\_cmp\_d}) \\ -\hspace{3mm}8.1 $q \leftarrow 1$ (\textit{mp\_set}) \\ -\hspace{3mm}8.2 $q \leftarrow q \cdot \beta^{m+1}$ (\textit{mp\_lshd}) \\ -\hspace{3mm}8.3 $c \leftarrow c + q$ \\ -\\ -Now subtract the modulus if the residue is too large (e.g. quotient too small). \\ -9. While $c \ge b$ do (\textit{mp\_cmp}) \\ -\hspace{3mm}9.1 $c \leftarrow c - b$ \\ -10. Clear $q$. \\ -11. Return(\textit{MP\_OKAY}) \\ -\hline -\end{tabular} -\end{center} -\end{small} -\caption{Algorithm mp\_reduce} -\end{figure} - -\textbf{Algorithm mp\_reduce.} -This algorithm will reduce the input $a$ modulo $b$ in place using the Barrett algorithm. It is loosely based on algorithm 14.42 of HAC -\cite[pp. 602]{HAC} which is based on the paper from Paul Barrett \cite{BARRETT}. The algorithm has several restrictions and assumptions which must be adhered to -for the algorithm to work. - -First the modulus $b$ is assumed to be positive and greater than one. If the modulus were less than or equal to one than subtracting -a multiple of it would either accomplish nothing or actually enlarge the input. The input $a$ must be in the range $0 \le a < b^2$ in order -for the quotient to have enough precision. Technically the algorithm will still work if $a \ge b^2$ but it will take much longer to finish. The -value of $\mu$ is passed as an argument to this algorithm and is assumed to be setup before the algorithm is used. - -Recall that the multiplication for the quotient on step 3 must only produce digits at or above the $m-1$'th position. An algorithm called -$s\_mp\_mul\_high\_digs$ which has not been presented is used to accomplish this task. This optimal algorithm can only be used if the number -of digits in $b$ is very much smaller than $\beta$. - -After the multiple of the modulus has been subtracted from $a$ the residue must be fixed up in case its negative. While it is known that -$a \ge b \cdot \lfloor (q_0 \cdot \mu) / \beta^{m+1} \rfloor$ only the lower $m+1$ digits are being used to compute the residue. In this case -the invariant $\beta^{m+1}$ must be added to the residue to make it positive again. - -The while loop at step 9 will subtract $b$ until the residue is less than $b$. If the algorithm is performed correctly this step is only -performed upto two times. However, if $a \ge b^2$ than it will iterate substantially more times than it should. - -\index{bn\_mp\_reduce.c} -\vspace{+3mm}\begin{small} -\hspace{-5.1mm}{\bf File}: bn\_mp\_reduce.c -\vspace{-3mm} -\begin{alltt} -016 -017 /* reduces x mod m, assumes 0 < x < m**2, mu is -018 * precomputed via mp_reduce_setup. -019 * From HAC pp.604 Algorithm 14.42 -020 */ -021 int -022 mp_reduce (mp_int * x, mp_int * m, mp_int * mu) -023 \{ -024 mp_int q; -025 int res, um = m->used; -026 -027 /* q = x */ -028 if ((res = mp_init_copy (&q, x)) != MP_OKAY) \{ -029 return res; -030 \} -031 -032 /* q1 = x / b**(k-1) */ -033 mp_rshd (&q, um - 1); -034 -035 /* according to HAC this is optimization is ok */ -036 if (((unsigned long) m->used) > (((mp_digit)1) << (DIGIT_BIT - 1))) \{ -037 if ((res = mp_mul (&q, mu, &q)) != MP_OKAY) \{ -038 goto CLEANUP; -039 \} -040 \} else \{ -041 if ((res = s_mp_mul_high_digs (&q, mu, &q, um - 1)) != MP_OKAY) \{ -042 goto CLEANUP; -043 \} -044 \} -045 -046 /* q3 = q2 / b**(k+1) */ -047 mp_rshd (&q, um + 1); -048 -049 /* x = x mod b**(k+1), quick (no division) */ -050 if ((res = mp_mod_2d (x, DIGIT_BIT * (um + 1), x)) != MP_OKAY) \{ -051 goto CLEANUP; -052 \} -053 -054 /* q = q * m mod b**(k+1), quick (no division) */ -055 if ((res = s_mp_mul_digs (&q, m, &q, um + 1)) != MP_OKAY) \{ -056 goto CLEANUP; -057 \} -058 -059 /* x = x - q */ -060 if ((res = mp_sub (x, &q, x)) != MP_OKAY) \{ -061 goto CLEANUP; -062 \} -063 -064 /* If x < 0, add b**(k+1) to it */ -065 if (mp_cmp_d (x, 0) == MP_LT) \{ -066 mp_set (&q, 1); -067 if ((res = mp_lshd (&q, um + 1)) != MP_OKAY) -068 goto CLEANUP; -069 if ((res = mp_add (x, &q, x)) != MP_OKAY) -070 goto CLEANUP; -071 \} -072 -073 /* Back off if it's too big */ -074 while (mp_cmp (x, m) != MP_LT) \{ -075 if ((res = s_mp_sub (x, m, x)) != MP_OKAY) \{ -076 break; -077 \} -078 \} -079 -080 CLEANUP: -081 mp_clear (&q); -082 -083 return res; -084 \} -\end{alltt} -\end{small} - -The first multiplication that determines the quotient can be performed by only producing the digits from $m - 1$ and up. This essentially halves -the number of single precision multiplications required. However, the optimization is only safe if $\beta$ is much larger than the number of digits -in the modulus. In the source code this is evaluated on lines 36 to 44 where algorithm s\_mp\_mul\_high\_digs is used when it is -safe to do so. - -\subsection{The Barrett Setup Algorithm} -In order to use algorithm mp\_reduce the value of $\mu$ must be calculated in advance. Ideally this value should be computed once and stored for -future use so that the Barrett algorithm can be used without delay. - -\begin{figure}[!here] -\begin{small} -\begin{center} -\begin{tabular}{l} -\hline Algorithm \textbf{mp\_reduce\_setup}. \\ -\textbf{Input}. mp\_int $a$ ($a > 1$) \\ -\textbf{Output}. $\mu \leftarrow \lfloor \beta^{2m}/a \rfloor$ \\ -\hline \\ -1. $\mu \leftarrow 2^{2 \cdot lg(\beta) \cdot m}$ (\textit{mp\_2expt}) \\ -2. $\mu \leftarrow \lfloor \mu / b \rfloor$ (\textit{mp\_div}) \\ -3. Return(\textit{MP\_OKAY}) \\ -\hline -\end{tabular} -\end{center} -\end{small} -\caption{Algorithm mp\_reduce\_setup} -\end{figure} - -\textbf{Algorithm mp\_reduce\_setup.} -This algorithm computes the reciprocal $\mu$ required for Barrett reduction. First $\beta^{2m}$ is calculated as $2^{2 \cdot lg(\beta) \cdot m}$ which -is equivalent and much faster. The final value is computed by taking the integer quotient of $\lfloor \mu / b \rfloor$. - -\index{bn\_mp\_reduce\_setup.c} -\vspace{+3mm}\begin{small} -\hspace{-5.1mm}{\bf File}: bn\_mp\_reduce\_setup.c -\vspace{-3mm} -\begin{alltt} -016 -017 /* pre-calculate the value required for Barrett reduction -018 * For a given modulus "b" it calulates the value required in "a" -019 */ -020 int -021 mp_reduce_setup (mp_int * a, mp_int * b) -022 \{ -023 int res; -024 -025 if ((res = mp_2expt (a, b->used * 2 * DIGIT_BIT)) != MP_OKAY) \{ -026 return res; -027 \} -028 return mp_div (a, b, a, NULL); -029 \} -\end{alltt} -\end{small} - -This simple routine calculates the reciprocal $\mu$ required by Barrett reduction. Note the extended usage of algorithm mp\_div where the variable -which would received the remainder is passed as NULL. As will be discussed in section 9.1 the division routine allows both the quotient and the -remainder to be passed as NULL meaning to ignore the value. - -\section{The Montgomery Reduction} -Montgomery reduction\footnote{Thanks to Niels Ferguson for his insightful explanation of the algorithm.} \cite{MONT} is by far the most interesting -form of reduction in common use. It computes a modular residue which is not actually equal to the residue of the input yet instead equal to a -residue times a constant. However, as perplexing as this may sound the algorithm is relatively simple and very efficient. - -Throughout this entire section the variable $n$ will represent the modulus used to form the residue. As will be discussed shortly the value of -$n$ must be odd. The variable $x$ will represent the quantity of which the residue is sought. Similar to the Barrett algorithm the input -is restricted to $0 \le x < n^2$. To begin the description some simple number theory facts must be established. - -\textbf{Fact 1.} Adding $n$ to $x$ does not change the residue since in effect it adds one to the quotient $\lfloor x / n \rfloor$. Another way -to explain this is that $n$ (\textit{or multiples of $n$}) is congruent to zero modulo $n$. Adding zero will not change the value of the residue. - -\textbf{Fact 2.} If $x$ is even then performing a division by two in $\Z$ is congruent to $x \cdot 2^{-1} \mbox{ (mod }n\mbox{)}$. Actually -this is an application of the fact that if $x$ is evenly divisible by any $k \in \Z$ then division in $\Z$ will be congruent to -multiplication by $k^{-1}$ modulo $n$. - -From these two simple facts the following simple algorithm can be derived. - -\newpage\begin{figure}[!here] -\begin{small} -\begin{center} -\begin{tabular}{l} -\hline Algorithm \textbf{Montgomery Reduction}. \\ -\textbf{Input}. Integer $x$, $n$ and $k$ \\ -\textbf{Output}. $2^{-k}x \mbox{ (mod }n\mbox{)}$ \\ -\hline \\ -1. for $t$ from $1$ to $k$ do \\ -\hspace{3mm}1.1 If $x$ is odd then \\ -\hspace{6mm}1.1.1 $x \leftarrow x + n$ \\ -\hspace{3mm}1.2 $x \leftarrow x/2$ \\ -2. Return $x$. \\ -\hline -\end{tabular} -\end{center} -\end{small} -\caption{Algorithm Montgomery Reduction} -\end{figure} - -The algorithm reduces the input one bit at a time using the two congruencies stated previously. Inside the loop $n$, which is odd, is -added to $x$ if $x$ is odd. This forces $x$ to be even which allows the division by two in $\Z$ to be congruent to a modular division by two. Since -$x$ is assumed to be initially much larger than $n$ the addition of $n$ will contribute an insignificant magnitude to $x$. Let $r$ represent the -final result of the Montgomery algorithm. If $k > lg(n)$ and $0 \le x < n^2$ then the final result is limited to -$0 \le r < \lfloor x/2^k \rfloor + n$. As a result at most a single subtraction is required to get the residue desired. - -\begin{figure}[here] -\begin{small} -\begin{center} -\begin{tabular}{|c|l|} -\hline \textbf{Step number ($t$)} & \textbf{Result ($x$)} \\ -\hline $1$ & $x + n = 5812$, $x/2 = 2906$ \\ -\hline $2$ & $x/2 = 1453$ \\ -\hline $3$ & $x + n = 1710$, $x/2 = 855$ \\ -\hline $4$ & $x + n = 1112$, $x/2 = 556$ \\ -\hline $5$ & $x/2 = 278$ \\ -\hline $6$ & $x/2 = 139$ \\ -\hline $7$ & $x + n = 396$, $x/2 = 198$ \\ -\hline $8$ & $x/2 = 99$ \\ -\hline -\end{tabular} -\end{center} -\end{small} -\caption{Example of Montgomery Reduction (I)} -\label{fig:MONT1} -\end{figure} - -Consider the example in figure~\ref{fig:MONT1} which reduces $x = 5555$ modulo $n = 257$ when $k = 8$. The final result $r = 99$ which is actually -$2^{-8} \cdot 5555 \mbox{ (mod }257\mbox{)}$ can reveal the residue $x \equiv 158$ by multiplying by $2^8$ modulo $n$. - -Let $k = \lfloor lg(n) \rfloor + 1$ represent the number of bits in $n$. The current algorithm requires $2k^2$ single precision shifts -and $k^2$ single precision additions. At this rate the algorithm is most certainly slower than Barrett reduction and not terribly useful. -Fortunately there exists an alternative representation of the algorithm. - -\begin{figure}[!here] -\begin{small} -\begin{center} -\begin{tabular}{l} -\hline Algorithm \textbf{Montgomery Reduction} (modified I). \\ -\textbf{Input}. Integer $x$, $n$ and $k$ \\ -\textbf{Output}. $2^{-k}x \mbox{ (mod }n\mbox{)}$ \\ -\hline \\ -1. for $t$ from $0$ to $k - 1$ do \\ -\hspace{3mm}1.1 If the $t$'th bit of $x$ is one then \\ -\hspace{6mm}1.1.1 $x \leftarrow x + 2^tn$ \\ -2. Return $x/2^k$. \\ -\hline -\end{tabular} -\end{center} -\end{small} -\caption{Algorithm Montgomery Reduction (modified I)} -\end{figure} - -This algorithm is equivalent since $2^tn$ is a multiple of $n$ and the lower $k$ bits of $x$ are zero by step 2. The number of single -precision shifts has now been reduced from $2k^2$ to $k^2 + k$ which is only a small improvement. - -\begin{figure}[here] -\begin{small} -\begin{center} -\begin{tabular}{|c|l|} -\hline \textbf{Step number ($t$)} & \textbf{Result ($x$)} \\ -\hline $1$ & $x + 2^{0}n = 5812$ \\ -\hline $2$ & $5812$ \\ -\hline $3$ & $x + 2^{2}n = 6840$ \\ -\hline $4$ & $x + 2^{3}n = 8896$ \\ -\hline $5$ & $8896$ \\ -\hline $6$ & $8896$ \\ -\hline $7$ & $x + 2^{6}n = 25344$ \\ -\hline $8$ & $25344$ \\ -\hline -- & $x/2^k = 99$ \\ -\hline -\end{tabular} -\end{center} -\end{small} -\caption{Example of Montgomery Reduction (II)} -\label{fig:MONT2} -\end{figure} - -Figure~\ref{fig:MONT2} demonstrates the modified algorithm reducing $x = 4093$ modulo $n = 257$ with $k = 8$. -With this algorithm a single shift right at the end is the only right shift required to reduce the input instead of $k$ right shifts inside the -loop. Note that for the iterations $t = 2, 5, 6$ and $8$ where the result $x$ is not changed. In those iterations the $t$'th bit of $x$ is -zero and the appropriate multiple of $n$ does not need to be added to force the $t$'th bit of the result to zero. - -\subsection{Digit Based Montgomery Reduction} -Instead of computing the reduction on a bit-by-bit basis it is actually much faster to compute it on digit-by-digit basis. Consider the -previous algorithm re-written to compute the Montgomery reduction in this new fashion. - -\begin{figure}[!here] -\begin{small} -\begin{center} -\begin{tabular}{l} -\hline Algorithm \textbf{Montgomery Reduction} (modified II). \\ -\textbf{Input}. Integer $x$, $n$ and $k$ \\ -\textbf{Output}. $\beta^{-k}x \mbox{ (mod }n\mbox{)}$ \\ -\hline \\ -1. for $t$ from $0$ to $k - 1$ do \\ -\hspace{3mm}1.1 $x \leftarrow x + \mu n \beta^t$ \\ -2. Return $x/\beta^k$. \\ -\hline -\end{tabular} -\end{center} -\end{small} -\caption{Algorithm Montgomery Reduction (modified II)} -\end{figure} - -The value $\mu n \beta^t$ is a multiple of the modulus $n$ meaning that it will not change the residue. If the first digit of -the value $\mu n \beta^t$ equals the negative (modulo $\beta$) of the $t$'th digit of $x$ then the addition will result in a zero digit. This -problem breaks down to solving the following congruency. - -\begin{center} -\begin{tabular}{rcl} -$x_t + \mu n_0$ & $\equiv$ & $0 \mbox{ (mod }\beta\mbox{)}$ \\ -$\mu n_0$ & $\equiv$ & $-x_t \mbox{ (mod }\beta\mbox{)}$ \\ -$\mu$ & $\equiv$ & $-x_t/n_0 \mbox{ (mod }\beta\mbox{)}$ \\ -\end{tabular} -\end{center} - -In each iteration of the loop on step 1 a new value of $\mu$ must be calculated. The value of $-1/n_0 \mbox{ (mod }\beta\mbox{)}$ is used -extensively in this algorithm and should be precomputed. Let $\rho$ represent the negative of the modular inverse of $n_0$ modulo $\beta$. - -For example, let $\beta = 10$ represent the radix. Let $n = 17$ represent the modulus which implies $k = 2$ and $\rho \equiv 7$. Let $x = 33$ -represent the value to reduce. - -\newpage\begin{figure} -\begin{center} -\begin{tabular}{|c|c|c|} -\hline \textbf{Step ($t$)} & \textbf{Value of $x$} & \textbf{Value of $\mu$} \\ -\hline -- & $33$ & --\\ -\hline $0$ & $33 + \mu n = 50$ & $1$ \\ -\hline $1$ & $50 + \mu n \beta = 900$ & $5$ \\ -\hline -\end{tabular} -\end{center} -\caption{Example of Montgomery Reduction} -\end{figure} - -The final result $900$ is then divided by $\beta^k$ to produce the final result $9$. The first observation is that $9 \nequiv x \mbox{ (mod }n\mbox{)}$ -which implies the result is not the modular residue of $x$ modulo $n$. However, recall that the residue is actually multiplied by $\beta^{-k}$ in -the algorithm. To get the true residue the value must be multiplied by $\beta^k$. In this case $\beta^k \equiv 15 \mbox{ (mod }n\mbox{)}$ and -the correct residue is $9 \cdot 15 \equiv 16 \mbox{ (mod }n\mbox{)}$. - -\subsection{Baseline Montgomery Reduction} -The baseline Montgomery reduction algorithm will produce the residue for any size input. It is designed to be a catch-all algororithm for -Montgomery reductions. - -\newpage\begin{figure}[!here] -\begin{small} -\begin{center} -\begin{tabular}{l} -\hline Algorithm \textbf{mp\_montgomery\_reduce}. \\ -\textbf{Input}. mp\_int $x$, mp\_int $n$ and a digit $\rho \equiv -1/n_0 \mbox{ (mod }n\mbox{)}$. \\ -\hspace{11.5mm}($0 \le x < n^2, n > 1, (n, \beta) = 1, \beta^k > n$) \\ -\textbf{Output}. $\beta^{-k}x \mbox{ (mod }n\mbox{)}$ \\ -\hline \\ -1. $digs \leftarrow 2n.used + 1$ \\ -2. If $digs < MP\_ARRAY$ and $m.used < \delta$ then \\ -\hspace{3mm}2.1 Use algorithm fast\_mp\_montgomery\_reduce instead. \\ -\\ -Setup $x$ for the reduction. \\ -3. If $x.alloc < digs$ then grow $x$ to $digs$ digits. \\ -4. $x.used \leftarrow digs$ \\ -\\ -Eliminate the lower $k$ digits. \\ -5. For $ix$ from $0$ to $k - 1$ do \\ -\hspace{3mm}5.1 $\mu \leftarrow x_{ix} \cdot \rho \mbox{ (mod }\beta\mbox{)}$ \\ -\hspace{3mm}5.2 $u \leftarrow 0$ \\ -\hspace{3mm}5.3 For $iy$ from $0$ to $k - 1$ do \\ -\hspace{6mm}5.3.1 $\hat r \leftarrow \mu n_{iy} + x_{ix + iy} + u$ \\ -\hspace{6mm}5.3.2 $x_{ix + iy} \leftarrow \hat r \mbox{ (mod }\beta\mbox{)}$ \\ -\hspace{6mm}5.3.3 $u \leftarrow \lfloor \hat r / \beta \rfloor$ \\ -\hspace{3mm}5.4 While $u > 0$ do \\ -\hspace{6mm}5.4.1 $iy \leftarrow iy + 1$ \\ -\hspace{6mm}5.4.2 $x_{ix + iy} \leftarrow x_{ix + iy} + u$ \\ -\hspace{6mm}5.4.3 $u \leftarrow \lfloor x_{ix+iy} / \beta \rfloor$ \\ -\hspace{6mm}5.4.4 $x_{ix + iy} \leftarrow x_{ix+iy} \mbox{ (mod }\beta\mbox{)}$ \\ -\\ -Divide by $\beta^k$ and fix up as required. \\ -6. $x \leftarrow \lfloor x / \beta^k \rfloor$ \\ -7. If $x \ge n$ then \\ -\hspace{3mm}7.1 $x \leftarrow x - n$ \\ -8. Return(\textit{MP\_OKAY}). \\ -\hline -\end{tabular} -\end{center} -\end{small} -\caption{Algorithm mp\_montgomery\_reduce} -\end{figure} - -\textbf{Algorithm mp\_montgomery\_reduce.} -This algorithm reduces the input $x$ modulo $n$ in place using the Montgomery reduction algorithm. The algorithm is loosely based -on algorithm 14.32 of \cite[pp.601]{HAC} except it merges the multiplication of $\mu n \beta^t$ with the addition in the inner loop. The -restrictions on this algorithm are fairly easy to adapt to. First $0 \le x < n^2$ bounds the input to numbers in the same range as -for the Barrett algorithm. Additionally $n > 1$ will ensure a modular inverse $\rho$ exists. $\rho$ must be calculated in -advance of this algorithm. Finally the variable $k$ is fixed and a pseudonym for $n.used$. - -Step 2 decides whether a faster Montgomery algorithm can be used. It is based on the Comba technique meaning that there are limits on -the size of the input. This algorithm is discussed in sub-section 7.3.3. - -Step 5 is the main reduction loop of the algorithm. The value of $\mu$ is calculated once per iteration in the outer loop. The inner loop -calculates $x + \mu n \beta^{ix}$ by multiplying $\mu n$ and adding the result to $x$ shifted by $ix$ digits. Both the addition and -multiplication are performed in the same loop to save time and memory. Step 5.4 will handle any additional carries that escape the inner loop. - -Using a quick inspection this algorithm requires $n$ single precision multiplications for the outer loop and $n^2$ single precision multiplications -in the inner loop. In total $n^2 + n$ single precision multiplications which compares favourably to Barrett at $n^2 + 2n - 1$ single precision -multiplications. - -\index{bn\_mp\_montgomery\_reduce.c} -\vspace{+3mm}\begin{small} -\hspace{-5.1mm}{\bf File}: bn\_mp\_montgomery\_reduce.c -\vspace{-3mm} -\begin{alltt} -016 -017 /* computes xR**-1 == x (mod N) via Montgomery Reduction */ -018 int -019 mp_montgomery_reduce (mp_int * x, mp_int * n, mp_digit rho) -020 \{ -021 int ix, res, digs; -022 mp_digit mu; -023 -024 /* can the fast reduction [comba] method be used? -025 * -026 * Note that unlike in mp_mul you're safely allowed *less* -027 * than the available columns [255 per default] since carries -028 * are fixed up in the inner loop. -029 */ -030 digs = n->used * 2 + 1; -031 if ((digs < MP_WARRAY) && -032 n->used < -033 (1 << ((CHAR_BIT * sizeof (mp_word)) - (2 * DIGIT_BIT)))) \{ -034 return fast_mp_montgomery_reduce (x, n, rho); -035 \} -036 -037 /* grow the input as required */ -038 if (x->alloc < digs) \{ -039 if ((res = mp_grow (x, digs)) != MP_OKAY) \{ -040 return res; -041 \} -042 \} -043 x->used = digs; -044 -045 for (ix = 0; ix < n->used; ix++) \{ -046 /* mu = ai * m' mod b */ -047 mu = (x->dp[ix] * rho) & MP_MASK; -048 -049 /* a = a + mu * m * b**i */ -050 \{ -051 register int iy; -052 register mp_digit *tmpn, *tmpx, u; -053 register mp_word r; -054 -055 /* aliases */ -056 tmpn = n->dp; -057 tmpx = x->dp + ix; -058 -059 /* set the carry to zero */ -060 u = 0; -061 -062 /* Multiply and add in place */ -063 for (iy = 0; iy < n->used; iy++) \{ -064 r = ((mp_word) mu) * ((mp_word) * tmpn++) + -065 ((mp_word) u) + ((mp_word) * tmpx); -066 u = (r >> ((mp_word) DIGIT_BIT)); -067 *tmpx++ = (r & ((mp_word) MP_MASK)); -068 \} -069 /* propagate carries */ -070 while (u) \{ -071 *tmpx += u; -072 u = *tmpx >> DIGIT_BIT; -073 *tmpx++ &= MP_MASK; -074 \} -075 \} -076 \} -077 -078 /* x = x/b**n.used */ -079 mp_clamp(x); -080 mp_rshd (x, n->used); -081 -082 /* if A >= m then A = A - m */ -083 if (mp_cmp_mag (x, n) != MP_LT) \{ -084 return s_mp_sub (x, n, x); -085 \} -086 -087 return MP_OKAY; -088 \} -\end{alltt} -\end{small} - -This is the baseline implementation of the Montgomery reduction algorithm. Lines 30 to 35 determine if the Comba based -routine can be used instead. Line 47 computes the value of $\mu$ for that particular iteration of the outer loop. - -The multiplication $\mu n \beta^{ix}$ is performed in one step in the inner loop. The alias $tmpx$ refers to the $ix$'th digit of $x$ and -the alias $tmpn$ refers to the modulus $n$. - -\subsection{Faster ``Comba'' Montgomery Reduction} - -The Montgomery reduction requires fewer single precision multiplications than a Barrett reduction, however it is much slower due to the serial -nature of the inner loop. The Barrett reduction algorithm requires two slightly modified multipliers which can be implemented with the Comba -technique. The Montgomery reduction algorithm cannot directly use the Comba technique to any significant advantage since the inner loop calculates -a $k \times 1$ product $k$ times. - -The biggest obstacle is that at the $ix$'th iteration of the outer loop the value of $x_{ix}$ is required to calculate $\mu$. This means the -carries from $0$ to $ix - 1$ must have been propagated upwards to form a valid $ix$'th digit. The solution as it turns out is very simple. -Perform a Comba like multiplier and inside the outer loop just after the inner loop fix up the $ix + 1$'th digit by forwarding the carry. - -With this change in place the Montgomery reduction algorithm can be performed with a Comba style multiplication loop which substantially increases -the speed of the algorithm. - -\newpage\begin{figure}[!here] -\begin{small} -\begin{center} -\begin{tabular}{l} -\hline Algorithm \textbf{fast\_mp\_montgomery\_reduce}. \\ -\textbf{Input}. mp\_int $x$, mp\_int $n$ and a digit $\rho \equiv -1/n_0 \mbox{ (mod }n\mbox{)}$. \\ -\hspace{11.5mm}($0 \le x < n^2, n > 1, (n, \beta) = 1, \beta^k > n$) \\ -\textbf{Output}. $\beta^{-k}x \mbox{ (mod }n\mbox{)}$ \\ -\hline \\ -Place an array of \textbf{MP\_WARRAY} mp\_word variables called $\hat W$ on the stack. \\ -1. if $x.alloc < n.used + 1$ then grow $x$ to $n.used + 1$ digits. \\ -Copy the digits of $x$ into the array $\hat W$ \\ -2. For $ix$ from $0$ to $x.used - 1$ do \\ -\hspace{3mm}2.1 $\hat W_{ix} \leftarrow x_{ix}$ \\ -3. For $ix$ from $x.used$ to $2n.used - 1$ do \\ -\hspace{3mm}3.1 $\hat W_{ix} \leftarrow 0$ \\ -Elimiate the lower $k$ digits. \\ -4. for $ix$ from $0$ to $n.used - 1$ do \\ -\hspace{3mm}4.1 $\mu \leftarrow \hat W_{ix} \cdot \rho \mbox{ (mod }\beta\mbox{)}$ \\ -\hspace{3mm}4.2 For $iy$ from $0$ to $n.used - 1$ do \\ -\hspace{6mm}4.2.1 $\hat W_{iy + ix} \leftarrow \hat W_{iy + ix} + \mu \cdot n_{iy}$ \\ -\hspace{3mm}4.3 $\hat W_{ix + 1} \leftarrow \hat W_{ix + 1} + \lfloor \hat W_{ix} / \beta \rfloor$ \\ -Propagate carries upwards. \\ -5. for $ix$ from $n.used$ to $2n.used + 1$ do \\ -\hspace{3mm}5.1 $\hat W_{ix + 1} \leftarrow \hat W_{ix + 1} + \lfloor \hat W_{ix} / \beta \rfloor$ \\ -Shift right and reduce modulo $\beta$ simultaneously. \\ -6. for $ix$ from $0$ to $n.used + 1$ do \\ -\hspace{3mm}6.1 $x_{ix} \leftarrow \hat W_{ix + n.used} \mbox{ (mod }\beta\mbox{)}$ \\ -Zero excess digits and fixup $x$. \\ -7. if $x.used > n.used + 1$ then do \\ -\hspace{3mm}7.1 for $ix$ from $n.used + 1$ to $x.used - 1$ do \\ -\hspace{6mm}7.1.1 $x_{ix} \leftarrow 0$ \\ -8. $x.used \leftarrow n.used + 1$ \\ -9. Clamp excessive digits of $x$. \\ -10. If $x \ge n$ then \\ -\hspace{3mm}10.1 $x \leftarrow x - n$ \\ -11. Return(\textit{MP\_OKAY}). \\ -\hline -\end{tabular} -\end{center} -\end{small} -\caption{Algorithm fast\_mp\_montgomery\_reduce} -\end{figure} - -\textbf{Algorithm fast\_mp\_montgomery\_reduce.} -This algorithm will compute the Montgomery reduction of $x$ modulo $n$ using the Comba technique. It is on most computer platforms significantly -faster than algorithm mp\_montgomery\_reduce and algorithm mp\_reduce (\textit{Barrett reduction}). The algorithm has the same restrictions -on the input as the baseline reduction algorithm. An additional two restrictions are imposed on this algorithm. The number of digits $k$ in the -the modulus $n$ must not violate $MP\_WARRAY > 2k +1$ and $n < \delta$. When $\beta = 2^{28}$ this algorithm can be used to reduce modulo -a modulus of at most $3,556$ bits in length. - -As in the other Comba reduction algorithms there is a $\hat W$ array which stores the columns of the product. It is initially filled with the -contents of $x$ with the excess digits zeroed. The reduction loop is very similar the to the baseline loop at heart. The multiplication on step -4.1 can be single precision only since $ab \mbox{ (mod }\beta\mbox{)} \equiv (a \mbox{ mod }\beta)(b \mbox{ mod }\beta)$. Some multipliers such -as those on the ARM processors take a variable length time to complete depending on the number of bytes of result it must produce. By performing -a single precision multiplication instead half the amount of time is spent. - -Also note that digit $\hat W_{ix}$ must have the carry from the $ix - 1$'th digit propagated upwards in order for this to work. That is what step -4.3 will do. In effect over the $n.used$ iterations of the outer loop the $n.used$'th lower columns all have the their carries propagated forwards. Note -how the upper bits of those same words are not reduced modulo $\beta$. This is because those values will be discarded shortly and there is no -point. - -Step 5 will propgate the remainder of the carries upwards. On step 6 the columns are reduced modulo $\beta$ and shifted simultaneously as they are -stored in the destination $x$. - -\index{bn\_fast\_mp\_montgomery\_reduce.c} -\vspace{+3mm}\begin{small} -\hspace{-5.1mm}{\bf File}: bn\_fast\_mp\_montgomery\_reduce.c -\vspace{-3mm} -\begin{alltt} -016 -017 /* computes xR**-1 == x (mod N) via Montgomery Reduction -018 * -019 * This is an optimized implementation of mp_montgomery_reduce -020 * which uses the comba method to quickly calculate the columns of the -021 * reduction. -022 * -023 * Based on Algorithm 14.32 on pp.601 of HAC. -024 */ -025 int -026 fast_mp_montgomery_reduce (mp_int * x, mp_int * n, mp_digit rho) -027 \{ -028 int ix, res, olduse; -029 mp_word W[MP_WARRAY]; -030 -031 /* get old used count */ -032 olduse = x->used; -033 -034 /* grow a as required */ -035 if (x->alloc < n->used + 1) \{ -036 if ((res = mp_grow (x, n->used + 1)) != MP_OKAY) \{ -037 return res; -038 \} -039 \} -040 -041 \{ -042 register mp_word *_W; -043 register mp_digit *tmpx; -044 -045 _W = W; -046 tmpx = x->dp; -047 -048 /* copy the digits of a into W[0..a->used-1] */ -049 for (ix = 0; ix < x->used; ix++) \{ -050 *_W++ = *tmpx++; -051 \} -052 -053 /* zero the high words of W[a->used..m->used*2] */ -054 for (; ix < n->used * 2 + 1; ix++) \{ -055 *_W++ = 0; -056 \} -057 \} -058 -059 for (ix = 0; ix < n->used; ix++) \{ -060 /* mu = ai * m' mod b -061 * -062 * We avoid a double precision multiplication (which isn't required) -063 * by casting the value down to a mp_digit. Note this requires -064 * that W[ix-1] have the carry cleared (see after the inner loop) -065 */ -066 register mp_digit mu; -067 mu = (((mp_digit) (W[ix] & MP_MASK)) * rho) & MP_MASK; -068 -069 /* a = a + mu * m * b**i -070 * -071 * This is computed in place and on the fly. The multiplication -072 * by b**i is handled by offseting which columns the results -073 * are added to. -074 * -075 * Note the comba method normally doesn't handle carries in the -076 * inner loop In this case we fix the carry from the previous -077 * column since the Montgomery reduction requires digits of the -078 * result (so far) [see above] to work. This is -079 * handled by fixing up one carry after the inner loop. The -080 * carry fixups are done in order so after these loops the -081 * first m->used words of W[] have the carries fixed -082 */ -083 \{ -084 register int iy; -085 register mp_digit *tmpn; -086 register mp_word *_W; -087 -088 /* alias for the digits of the modulus */ -089 tmpn = n->dp; -090 -091 /* Alias for the columns set by an offset of ix */ -092 _W = W + ix; -093 -094 /* inner loop */ -095 for (iy = 0; iy < n->used; iy++) \{ -096 *_W++ += ((mp_word) mu) * ((mp_word) * tmpn++); -097 \} -098 \} -099 -100 /* now fix carry for next digit, W[ix+1] */ -101 W[ix + 1] += W[ix] >> ((mp_word) DIGIT_BIT); -102 \} -103 -104 -105 \{ -106 register mp_digit *tmpx; -107 register mp_word *_W, *_W1; -108 -109 /* nox fix rest of carries */ -110 _W1 = W + ix; -111 _W = W + ++ix; -112 -113 for (; ix <= n->used * 2 + 1; ix++) \{ -114 *_W++ += *_W1++ >> ((mp_word) DIGIT_BIT); -115 \} -116 -117 /* copy out, A = A/b**n -118 * -119 * The result is A/b**n but instead of converting from an -120 * array of mp_word to mp_digit than calling mp_rshd -121 * we just copy them in the right order -122 */ -123 tmpx = x->dp; -124 _W = W + n->used; -125 -126 for (ix = 0; ix < n->used + 1; ix++) \{ -127 *tmpx++ = *_W++ & ((mp_word) MP_MASK); -128 \} -129 -130 /* zero oldused digits, if the input a was larger than -131 * m->used+1 we'll have to clear the digits */ -132 for (; ix < olduse; ix++) \{ -133 *tmpx++ = 0; -134 \} -135 \} -136 -137 /* set the max used and clamp */ -138 x->used = n->used + 1; -139 mp_clamp (x); -140 -141 /* if A >= m then A = A - m */ -142 if (mp_cmp_mag (x, n) != MP_LT) \{ -143 return s_mp_sub (x, n, x); -144 \} -145 return MP_OKAY; -146 \} -\end{alltt} -\end{small} - -The $\hat W$ array is first filled with digits of $x$ on line 49 then the rest of the digits are zeroed on line 54. Both loops share -the same alias variables to make the code easier to read. - -The value of $\mu$ is calculated in an interesting fashion. First the value $\hat W_{ix}$ is reduced modulo $\beta$ and cast to a mp\_digit. This -forces the compiler to use a single precision multiplication and prevents any concerns about loss of precision. Line 101 fixes the carry -for the next iteration of the loop by propagating the carry from $\hat W_{ix}$ to $\hat W_{ix+1}$. - -The for loop on line 113 propagates the rest of the carries upwards through the columns. The for loop on line 126 reduces the columns -modulo $\beta$ and shifts them $k$ places at the same time. The alias $\_ \hat W$ actually refers to the array $\hat W$ starting at the $n.used$'th -digit, that is $\_ \hat W_{t} = \hat W_{n.used + t}$. - -\subsection{Montgomery Setup} -To calculate the variable $\rho$ a relatively simple algorithm will be required. - -\begin{figure}[!here] -\begin{small} -\begin{center} -\begin{tabular}{l} -\hline Algorithm \textbf{mp\_montgomery\_setup}. \\ -\textbf{Input}. mp\_int $n$ ($n > 1$ and $(n, 2) = 1$) \\ -\textbf{Output}. $\rho \equiv -1/n_0 \mbox{ (mod }\beta\mbox{)}$ \\ -\hline \\ -1. $b \leftarrow n_0$ \\ -2. If $b$ is even return(\textit{MP\_VAL}) \\ -3. $x \leftarrow ((b + 2) \mbox{ AND } 4) << 1) + b$ \\ -4. for $k$ from 0 to $3$ do \\ -\hspace{3mm}4.1 $x \leftarrow x \cdot (2 - bx)$ \\ -5. $\rho \leftarrow \beta - x \mbox{ (mod }\beta\mbox{)}$ \\ -6. Return(\textit{MP\_OKAY}). \\ -\hline -\end{tabular} -\end{center} -\end{small} -\caption{Algorithm mp\_montgomery\_setup} -\end{figure} - -\textbf{Algorithm mp\_montgomery\_setup.} -This algorithm will calculate the value of $\rho$ required within the Montgomery reduction algorithms. It uses a very interesting trick -to calculate $1/n_0$ when $\beta$ is a power of two. - -\index{bn\_mp\_montgomery\_setup.c} -\vspace{+3mm}\begin{small} -\hspace{-5.1mm}{\bf File}: bn\_mp\_montgomery\_setup.c -\vspace{-3mm} -\begin{alltt} -016 -017 /* setups the montgomery reduction stuff */ -018 int -019 mp_montgomery_setup (mp_int * n, mp_digit * rho) -020 \{ -021 mp_digit x, b; -022 -023 /* fast inversion mod 2**k -024 * -025 * Based on the fact that -026 * -027 * XA = 1 (mod 2**n) => (X(2-XA)) A = 1 (mod 2**2n) -028 * => 2*X*A - X*X*A*A = 1 -029 * => 2*(1) - (1) = 1 -030 */ -031 b = n->dp[0]; -032 -033 if ((b & 1) == 0) \{ -034 return MP_VAL; -035 \} -036 -037 x = (((b + 2) & 4) << 1) + b; /* here x*a==1 mod 2**4 */ -038 x *= 2 - b * x; /* here x*a==1 mod 2**8 */ -039 #if !defined(MP_8BIT) -040 x *= 2 - b * x; /* here x*a==1 mod 2**16 */ -041 #endif -042 #if defined(MP_64BIT) || !(defined(MP_8BIT) || defined(MP_16BIT)) -043 x *= 2 - b * x; /* here x*a==1 mod 2**32 */ -044 #endif -045 #ifdef MP_64BIT -046 x *= 2 - b * x; /* here x*a==1 mod 2**64 */ -047 #endif -048 -049 /* rho = -1/m mod b */ -050 *rho = (((mp_digit) 1 << ((mp_digit) DIGIT_BIT)) - x) & MP_MASK; -051 -052 return MP_OKAY; -053 \} -\end{alltt} -\end{small} - -This source code computes the value of $\rho$ required to perform Montgomery reduction. It has been modified to avoid performing excess -multiplications when $\beta$ is not the default 28-bits. - -\section{The Diminished Radix Algorithm} -The Diminished Radix method of modular reduction \cite{DRMET} is a fairly clever technique which can be more efficient than either the Barrett -or Montgomery methods for certain forms of moduli. The technique is based on the following simple congruence. - -\begin{equation} -(x \mbox{ mod } n) + k \lfloor x / n \rfloor \equiv x \mbox{ (mod }(n - k)\mbox{)} -\end{equation} - -This observation was used in the MMB \cite{MMB} block cipher to create a diffusion primitive. It used the fact that if $n = 2^{31}$ and $k=1$ that -then a x86 multiplier could produce the 62-bit product and use the ``shrd'' instruction to perform a double-precision right shift. The proof -of the above equation is very simple. First write $x$ in the product form. - -\begin{equation} -x = qn + r -\end{equation} - -Now reduce both sides modulo $(n - k)$. - -\begin{equation} -x \equiv qk + r \mbox{ (mod }(n-k)\mbox{)} -\end{equation} - -The variable $n$ reduces modulo $n - k$ to $k$. By putting $q = \lfloor x/n \rfloor$ and $r = x \mbox{ mod } n$ -into the equation the original congruence is reproduced, thus concluding the proof. The following algorithm is based on this observation. - -\begin{figure}[!here] -\begin{small} -\begin{center} -\begin{tabular}{l} -\hline Algorithm \textbf{Diminished Radix Reduction}. \\ -\textbf{Input}. Integer $x$, $n$, $k$ \\ -\textbf{Output}. $x \mbox{ mod } (n - k)$ \\ -\hline \\ -1. $q \leftarrow \lfloor x / n \rfloor$ \\ -2. $q \leftarrow k \cdot q$ \\ -3. $x \leftarrow x \mbox{ (mod }n\mbox{)}$ \\ -4. $x \leftarrow x + q$ \\ -5. If $x \ge (n - k)$ then \\ -\hspace{3mm}5.1 $x \leftarrow x - (n - k)$ \\ -\hspace{3mm}5.2 Goto step 1. \\ -6. Return $x$ \\ -\hline -\end{tabular} -\end{center} -\end{small} -\caption{Algorithm Diminished Radix Reduction} -\label{fig:DR} -\end{figure} - -This algorithm will reduce $x$ modulo $n - k$ and return the residue. If $0 \le x < (n - k)^2$ then the algorithm will loop almost always -once or twice and occasionally three times. For simplicity sake the value of $x$ is bounded by the following simple polynomial. - -\begin{equation} -0 \le x < n^2 + k^2 - 2nk -\end{equation} - -The true bound is $0 \le x < (n - k - 1)^2$ but this has quite a few more terms. The value of $q$ after step 1 is bounded by the following. - -\begin{equation} -q < n - 2k - k^2/n -\end{equation} - -Since $k^2$ is going to be considerably smaller than $n$ that term will always be zero. The value of $x$ after step 3 is bounded trivially as -$0 \le x < n$. By step four the sum $x + q$ is bounded by - -\begin{equation} -0 \le q + x < (k + 1)n - 2k^2 - 1 -\end{equation} - -With a second pass $q$ will be loosely bounded by $0 \le q < k^2$ after step 2 while $x$ will still be loosely bounded by $0 \le x < n$ after step 3. After the second pass it is highly unlike that the -sum in step 4 will exceed $n - k$. In practice fewer than three passes of the algorithm are required to reduce virtually every input in the -range $0 \le x < (n - k - 1)^2$. - -\begin{figure} -\begin{small} -\begin{center} -\begin{tabular}{|l|} -\hline -$x = 123456789, n = 256, k = 3$ \\ -\hline $q \leftarrow \lfloor x/n \rfloor = 482253$ \\ -$q \leftarrow q*k = 1446759$ \\ -$x \leftarrow x \mbox{ mod } n = 21$ \\ -$x \leftarrow x + q = 1446780$ \\ -$x \leftarrow x - (n - k) = 1446527$ \\ -\hline -$q \leftarrow \lfloor x/n \rfloor = 5650$ \\ -$q \leftarrow q*k = 16950$ \\ -$x \leftarrow x \mbox{ mod } n = 127$ \\ -$x \leftarrow x + q = 17077$ \\ -$x \leftarrow x - (n - k) = 16824$ \\ -\hline -$q \leftarrow \lfloor x/n \rfloor = 65$ \\ -$q \leftarrow q*k = 195$ \\ -$x \leftarrow x \mbox{ mod } n = 184$ \\ -$x \leftarrow x + q = 379$ \\ -$x \leftarrow x - (n - k) = 126$ \\ -\hline -\end{tabular} -\end{center} -\end{small} -\caption{Example Diminished Radix Reduction} -\label{fig:EXDR} -\end{figure} - -Figure~\ref{fig:EXDR} demonstrates the reduction of $x = 123456789$ modulo $n - k = 253$ when $n = 256$ and $k = 3$. Note that even while $x$ -is considerably larger than $(n - k - 1)^2 = 63504$ the algorithm still converges on the modular residue exceedingly fast. In this case only -three passes were required to find the residue $x \equiv 126$. - - -\subsection{Choice of Moduli} -On the surface this algorithm looks like a very expensive algorithm. It requires a couple of subtractions followed by multiplication and other -modular reductions. The usefulness of this algorithm becomes exceedingly clear when an appropriate moduli is chosen. - -Division in general is a very expensive operation to perform. The one exception is when the division is by a power of the radix of representation used. -Division by ten for example is simple for pencil and paper mathematics since it amounts to shifting the decimal place to the right. Similarly division -by two (\textit{or powers of two}) is very simple for binary computers to perform. It would therefore seem logical to choose $n$ of the form $2^p$ -which would imply that $\lfloor x / n \rfloor$ is a simple shift of $x$ right $p$ bits. - -However, there is one operation related to division of power of twos that is even faster than this. If $n = \beta^p$ then the division may be -performed by moving whole digits to the right $p$ places. In practice division by $\beta^p$ is much faster than division by $2^p$ for any $p$. -Also with the choice of $n = \beta^p$ reducing $x$ modulo $n$ requires zeroing the digits above the $p-1$'th digit of $x$. - -Throughout the next section the term ``restricted modulus'' will refer to a modulus of the form $\beta^p - k$ where as the term ``unrestricted -modulus'' will refer to a modulus of the form $2^p - k$. The word ``restricted'' in this case refers to the fact that it is based on the -$2^p$ logic except $p$ must be a multiple of $lg(\beta)$. - -\subsection{Choice of $k$} -Now that division and reduction (\textit{step 1 and 3 of figure~\ref{fig:DR}}) have been optimized to simple digit operations the multiplication by $k$ -in step 2 is the most expensive operation. Fortunately the choice of $k$ is not terribly limited. For all intents and purposes it might -as well be a single digit. The smaller the value of $k$ is the faster the algorithm will be. - -\subsection{Restricted Diminished Radix Reduction} -The restricted Diminished Radix algorithm can quickly reduce an input modulo a modulus of the form $n = \beta^p - k$. This algorithm can reduce -an input $x$ within the range $0 \le x < n^2$ using only a couple passes of the algorithm demonstrated in figure~\ref{fig:DR}. The implementation -of this algorithm has been optimized to avoid additional overhead associated with a division by $\beta^p$, the multiplication by $k$ or the addition -of $x$ and $q$. The resulting algorithm is very efficient and can lead to substantial improvements over Barrett and Montgomery reduction when modular -exponentiations are performed. - -\newpage\begin{figure}[!here] -\begin{small} -\begin{center} -\begin{tabular}{l} -\hline Algorithm \textbf{mp\_dr\_reduce}. \\ -\textbf{Input}. mp\_int $x$, $n$ and a mp\_digit $k = \beta - n_0$ \\ -\hspace{11.5mm}($0 \le x < n^2$, $n > 1$, $0 < k < \beta$) \\ -\textbf{Output}. $x \mbox{ mod } n$ \\ -\hline \\ -1. $m \leftarrow n.used$ \\ -2. If $x.alloc < 2m$ then grow $x$ to $2m$ digits. \\ -3. $\mu \leftarrow 0$ \\ -4. for $i$ from $0$ to $m - 1$ do \\ -\hspace{3mm}4.1 $\hat r \leftarrow k \cdot x_{m+i} + x_{i} + \mu$ \\ -\hspace{3mm}4.2 $x_{i} \leftarrow \hat r \mbox{ (mod }\beta\mbox{)}$ \\ -\hspace{3mm}4.3 $\mu \leftarrow \lfloor \hat r / \beta \rfloor$ \\ -5. $x_{m} \leftarrow \mu$ \\ -6. for $i$ from $m + 1$ to $x.used - 1$ do \\ -\hspace{3mm}6.1 $x_{i} \leftarrow 0$ \\ -7. Clamp excess digits of $x$. \\ -8. If $x \ge n$ then \\ -\hspace{3mm}8.1 $x \leftarrow x - n$ \\ -\hspace{3mm}8.2 Goto step 3. \\ -9. Return(\textit{MP\_OKAY}). \\ -\hline -\end{tabular} -\end{center} -\end{small} -\caption{Algorithm mp\_dr\_reduce} -\end{figure} - -\textbf{Algorithm mp\_dr\_reduce.} -This algorithm will perform the Dimished Radix reduction of $x$ modulo $n$. It has similar restrictions to that of the Barrett reduction -with the addition that $n$ must be of the form $n = \beta^m - k$ where $0 < k <\beta$. - -This algorithm essentially implements the pseudo-code in figure~\ref{fig:DR} except with a slight optimization. The division by $\beta^m$, multiplication by $k$ -and addition of $x \mbox{ mod }\beta^m$ are all performed simultaneously inside the loop on step 4. The division by $\beta^m$ is emulated by accessing -the term at the $m+i$'th position which is subsequently multiplied by $k$ and added to the term at the $i$'th position. After the loop the $m$'th -digit is set to the carry and the upper digits are zeroed. Steps 5 and 6 emulate the reduction modulo $\beta^m$ that should have happend to -$x$ before the addition of the multiple of the upper half. - -At step 8 if $x$ is still larger than $n$ another pass of the algorithm is required. First $n$ is subtracted from $x$ and then the algorithm resumes -at step 3. - -\index{bn\_mp\_dr\_reduce.c} -\vspace{+3mm}\begin{small} -\hspace{-5.1mm}{\bf File}: bn\_mp\_dr\_reduce.c -\vspace{-3mm} -\begin{alltt} -016 -017 /* reduce "x" in place modulo "n" using the Diminished Radix algorithm. -018 * -019 * Based on algorithm from the paper -020 * -021 * "Generating Efficient Primes for Discrete Log Cryptosystems" -022 * Chae Hoon Lim, Pil Loong Lee, -023 * POSTECH Information Research Laboratories -024 * -025 * The modulus must be of a special format [see manual] -026 * -027 * Has been modified to use algorithm 7.10 from the LTM book instead -028 */ -029 int -030 mp_dr_reduce (mp_int * x, mp_int * n, mp_digit k) -031 \{ -032 int err, i, m; -033 mp_word r; -034 mp_digit mu, *tmpx1, *tmpx2; -035 -036 /* m = digits in modulus */ -037 m = n->used; -038 -039 /* ensure that "x" has at least 2m digits */ -040 if (x->alloc < m + m) \{ -041 if ((err = mp_grow (x, m + m)) != MP_OKAY) \{ -042 return err; -043 \} -044 \} -045 -046 /* top of loop, this is where the code resumes if -047 * another reduction pass is required. -048 */ -049 top: -050 /* aliases for digits */ -051 /* alias for lower half of x */ -052 tmpx1 = x->dp; -053 -054 /* alias for upper half of x, or x/B**m */ -055 tmpx2 = x->dp + m; -056 -057 /* set carry to zero */ -058 mu = 0; -059 -060 /* compute (x mod B**m) + mp * [x/B**m] inline and inplace */ -061 for (i = 0; i < m; i++) \{ -062 r = ((mp_word)*tmpx2++) * ((mp_word)k) + *tmpx1 + mu; -063 *tmpx1++ = r & MP_MASK; -064 mu = r >> ((mp_word)DIGIT_BIT); -065 \} -066 -067 /* set final carry */ -068 *tmpx1++ = mu; -069 -070 /* zero words above m */ -071 for (i = m + 1; i < x->used; i++) \{ -072 *tmpx1++ = 0; -073 \} -074 -075 /* clamp, sub and return */ -076 mp_clamp (x); -077 -078 /* if x >= n then subtract and reduce again -079 * Each successive "recursion" makes the input smaller and smaller. -080 */ -081 if (mp_cmp_mag (x, n) != MP_LT) \{ -082 s_mp_sub(x, n, x); -083 goto top; -084 \} -085 return MP_OKAY; -086 \} -\end{alltt} -\end{small} - -The first step is to grow $x$ as required to $2m$ digits since the reduction is performed in place on $x$. The label on line 49 is where -the algorithm will resume if further reduction passes are required. In theory it could be placed at the top of the function however, the size of -the modulus and question of whether $x$ is large enough are invariant after the first pass meaning that it would be a waste of time. - -The aliases $tmpx1$ and $tmpx2$ refer to the digits of $x$ where the latter is offset by $m$ digits. By reading digits from $x$ offset by $m$ digits -a division by $\beta^m$ can be simulated virtually for free. The loop on line 61 performs the bulk of the work (\textit{corresponds to step 4 of algorithm 7.11}) -in this algorithm. - -By line 68 the pointer $tmpx1$ points to the $m$'th digit of $x$ which is where the final carry will be placed. Similarly by line 71 the -same pointer will point to the $m+1$'th digit where the zeroes will be placed. - -Since the algorithm is only valid if both $x$ and $n$ are greater than zero an unsigned comparison suffices to determine if another pass is required. -With the same logic at line 82 the value of $x$ is known to be greater than or equal to $n$ meaning that an unsigned subtraction can be used -as well. Since the destination of the subtraction is the larger of the inputs the call to algorithm s\_mp\_sub cannot fail and the return code -does not need to be checked. - -\subsubsection{Setup} -To setup the restricted Diminished Radix algorithm the value $k = \beta - n_0$ is required. This algorithm is not really complicated but provided for -completeness. - -\begin{figure}[!here] -\begin{small} -\begin{center} -\begin{tabular}{l} -\hline Algorithm \textbf{mp\_dr\_setup}. \\ -\textbf{Input}. mp\_int $n$ \\ -\textbf{Output}. $k = \beta - n_0$ \\ -\hline \\ -1. $k \leftarrow \beta - n_0$ \\ -\hline -\end{tabular} -\end{center} -\end{small} -\caption{Algorithm mp\_dr\_setup} -\end{figure} - -\index{bn\_mp\_dr\_setup.c} -\vspace{+3mm}\begin{small} -\hspace{-5.1mm}{\bf File}: bn\_mp\_dr\_setup.c -\vspace{-3mm} -\begin{alltt} -016 -017 /* determines the setup value */ -018 void mp_dr_setup(mp_int *a, mp_digit *d) -019 \{ -020 /* the casts are required if DIGIT_BIT is one less than -021 * the number of bits in a mp_digit [e.g. DIGIT_BIT==31] -022 */ -023 *d = (mp_digit)((((mp_word)1) << ((mp_word)DIGIT_BIT)) - -024 ((mp_word)a->dp[0])); -025 \} -026 -\end{alltt} -\end{small} - -\subsubsection{Modulus Detection} -Another algorithm which will be useful is the ability to detect a restricted Diminished Radix modulus. An integer is said to be -of restricted Diminished Radix form if all of the digits are equal to $\beta - 1$ except the trailing digit which may be any value. - -\begin{figure}[!here] -\begin{small} -\begin{center} -\begin{tabular}{l} -\hline Algorithm \textbf{mp\_dr\_is\_modulus}. \\ -\textbf{Input}. mp\_int $n$ \\ -\textbf{Output}. $1$ if $n$ is in D.R form, $0$ otherwise \\ -\hline -1. If $n.used < 2$ then return($0$). \\ -2. for $ix$ from $1$ to $n.used - 1$ do \\ -\hspace{3mm}2.1 If $n_{ix} \ne \beta - 1$ return($0$). \\ -3. Return($1$). \\ -\hline -\end{tabular} -\end{center} -\end{small} -\caption{Algorithm mp\_dr\_is\_modulus} -\end{figure} - -\textbf{Algorithm mp\_dr\_is\_modulus.} -This algorithm determines if a value is in Diminished Radix form. Step 1 rejects obvious cases where fewer than two digits are -in the mp\_int. Step 2 tests all but the first digit to see if they are equal to $\beta - 1$. If the algorithm manages to get to -step 3 then $n$ must of Diminished Radix form. - -\index{bn\_mp\_dr\_is\_modulus.c} -\vspace{+3mm}\begin{small} -\hspace{-5.1mm}{\bf File}: bn\_mp\_dr\_is\_modulus.c -\vspace{-3mm} -\begin{alltt} -016 -017 /* determines if a number is a valid DR modulus */ -018 int mp_dr_is_modulus(mp_int *a) -019 \{ -020 int ix; -021 -022 /* must be at least two digits */ -023 if (a->used < 2) \{ -024 return 0; -025 \} -026 -027 for (ix = 1; ix < a->used; ix++) \{ -028 if (a->dp[ix] != MP_MASK) \{ -029 return 0; -030 \} -031 \} -032 return 1; -033 \} -034 -\end{alltt} -\end{small} - -\subsection{Unrestricted Diminished Radix Reduction} -The unrestricted Diminished Radix algorithm allows modular reductions to be performed when the modulus is of the form $2^p - k$. This algorithm -is a straightforward adaptation of algorithm~\ref{fig:DR}. - -In general the restricted Diminished Radix reduction algorithm is much faster since it has considerably lower overhead. However, this new -algorithm is much faster than either Montgomery or Barrett reduction when the moduli are of the appropriate form. - -\begin{figure}[!here] -\begin{small} -\begin{center} -\begin{tabular}{l} -\hline Algorithm \textbf{mp\_reduce\_2k}. \\ -\textbf{Input}. mp\_int $a$ and $n$. mp\_digit $k$ \\ -\hspace{11.5mm}($a \ge 0$, $n > 1$, $0 < k < \beta$, $n + k$ is a power of two) \\ -\textbf{Output}. $a \mbox{ (mod }n\mbox{)}$ \\ -\hline -1. $p \leftarrow \lceil lg(n) \rceil$ (\textit{mp\_count\_bits}) \\ -2. While $a \ge n$ do \\ -\hspace{3mm}2.1 $q \leftarrow \lfloor a / 2^p \rfloor$ (\textit{mp\_div\_2d}) \\ -\hspace{3mm}2.2 $a \leftarrow a \mbox{ (mod }2^p\mbox{)}$ (\textit{mp\_mod\_2d}) \\ -\hspace{3mm}2.3 $q \leftarrow q \cdot k$ (\textit{mp\_mul\_d}) \\ -\hspace{3mm}2.4 $a \leftarrow a - q$ (\textit{s\_mp\_sub}) \\ -\hspace{3mm}2.5 If $a \ge n$ then do \\ -\hspace{6mm}2.5.1 $a \leftarrow a - n$ \\ -3. Return(\textit{MP\_OKAY}). \\ -\hline -\end{tabular} -\end{center} -\end{small} -\caption{Algorithm mp\_reduce\_2k} -\end{figure} - -\textbf{Algorithm mp\_reduce\_2k.} -This algorithm quickly reduces an input $a$ modulo an unrestricted Diminished Radix modulus $n$. Division by $2^p$ is emulated with a right -shift which makes the algorithm fairly inexpensive to use. - -\index{bn\_mp\_reduce\_2k.c} -\vspace{+3mm}\begin{small} -\hspace{-5.1mm}{\bf File}: bn\_mp\_reduce\_2k.c -\vspace{-3mm} -\begin{alltt} -016 -017 /* reduces a modulo n where n is of the form 2**p - k */ -018 int -019 mp_reduce_2k(mp_int *a, mp_int *n, mp_digit k) -020 \{ -021 mp_int q; -022 int p, res; -023 -024 if ((res = mp_init(&q)) != MP_OKAY) \{ -025 return res; -026 \} -027 -028 p = mp_count_bits(n); -029 top: -030 /* q = a/2**p, a = a mod 2**p */ -031 if ((res = mp_div_2d(a, p, &q, a)) != MP_OKAY) \{ -032 goto ERR; -033 \} -034 -035 if (k != 1) \{ -036 /* q = q * k */ -037 if ((res = mp_mul_d(&q, k, &q)) != MP_OKAY) \{ -038 goto ERR; -039 \} -040 \} -041 -042 /* a = a + q */ -043 if ((res = s_mp_add(a, &q, a)) != MP_OKAY) \{ -044 goto ERR; -045 \} -046 -047 if (mp_cmp_mag(a, n) != MP_LT) \{ -048 s_mp_sub(a, n, a); -049 goto top; -050 \} -051 -052 ERR: -053 mp_clear(&q); -054 return res; -055 \} -056 -\end{alltt} -\end{small} - -The algorithm mp\_count\_bits calculates the number of bits in an mp\_int which is used to find the initial value of $p$. The call to mp\_div\_2d -on line 31 calculates both the quotient $q$ and the remainder $a$ required. By doing both in a single function call the code size -is kept fairly small. The multiplication by $k$ is only performed if $k > 1$. This allows reductions modulo $2^p - 1$ to be performed without -any multiplications. - -The unsigned s\_mp\_add, mp\_cmp\_mag and s\_mp\_sub are used in place of their full sign counterparts since the inputs are only valid if they are -positive. By using the unsigned versions the overhead is kept to a minimum. - -\subsubsection{Unrestricted Setup} -To setup this reduction algorithm the value of $k = 2^p - n$ is required. - -\begin{figure}[!here] -\begin{small} -\begin{center} -\begin{tabular}{l} -\hline Algorithm \textbf{mp\_reduce\_2k\_setup}. \\ -\textbf{Input}. mp\_int $n$ \\ -\textbf{Output}. $k = 2^p - n$ \\ -\hline -1. $p \leftarrow \lceil lg(n) \rceil$ (\textit{mp\_count\_bits}) \\ -2. $x \leftarrow 2^p$ (\textit{mp\_2expt}) \\ -3. $x \leftarrow x - n$ (\textit{mp\_sub}) \\ -4. $k \leftarrow x_0$ \\ -5. Return(\textit{MP\_OKAY}). \\ -\hline -\end{tabular} -\end{center} -\end{small} -\caption{Algorithm mp\_reduce\_2k\_setup} -\end{figure} - -\textbf{Algorithm mp\_reduce\_2k\_setup.} -This algorithm computes the value of $k$ required for the algorithm mp\_reduce\_2k. By making a temporary variable $x$ equal to $2^p$ a subtraction -is sufficient to solve for $k$. Alternatively if $n$ has more than one digit the value of $k$ is simply $\beta - n_0$. - -\index{bn\_mp\_reduce\_2k\_setup.c} -\vspace{+3mm}\begin{small} -\hspace{-5.1mm}{\bf File}: bn\_mp\_reduce\_2k\_setup.c -\vspace{-3mm} -\begin{alltt} -016 -017 /* determines the setup value */ -018 int -019 mp_reduce_2k_setup(mp_int *a, mp_digit *d) -020 \{ -021 int res, p; -022 mp_int tmp; -023 -024 if ((res = mp_init(&tmp)) != MP_OKAY) \{ -025 return res; -026 \} -027 -028 p = mp_count_bits(a); -029 if ((res = mp_2expt(&tmp, p)) != MP_OKAY) \{ -030 mp_clear(&tmp); -031 return res; -032 \} -033 -034 if ((res = s_mp_sub(&tmp, a, &tmp)) != MP_OKAY) \{ -035 mp_clear(&tmp); -036 return res; -037 \} -038 -039 *d = tmp.dp[0]; -040 mp_clear(&tmp); -041 return MP_OKAY; -042 \} -\end{alltt} -\end{small} - -\subsubsection{Unrestricted Detection} -An integer $n$ is a valid unrestricted Diminished Radix modulus if either of the following are true. - -\begin{enumerate} -\item The number has only one digit. -\item The number has more than one digit and every bit from the $\beta$'th to the most significant is one. -\end{enumerate} - -If either condition is true than there is a power of two namely $2^p$ such that $0 < 2^p - n < \beta$. If the input is only -one digit than it will always be of the correct form. Otherwise all of the bits above the first digit must be one. This arises from the fact -that there will be value of $k$ that when added to the modulus causes a carry in the first digit which propagates all the way to the most -significant bit. The resulting sum will be a power of two. - -\begin{figure}[!here] -\begin{small} -\begin{center} -\begin{tabular}{l} -\hline Algorithm \textbf{mp\_reduce\_is\_2k}. \\ -\textbf{Input}. mp\_int $n$ \\ -\textbf{Output}. $1$ if of proper form, $0$ otherwise \\ -\hline -1. If $n.used = 0$ then return($0$). \\ -2. If $n.used = 1$ then return($1$). \\ -3. $p \leftarrow \rceil lg(n) \lceil$ (\textit{mp\_count\_bits}) \\ -4. for $x$ from $lg(\beta)$ to $p$ do \\ -\hspace{3mm}4.1 If the ($x \mbox{ mod }lg(\beta)$)'th bit of the $\lfloor x / lg(\beta) \rfloor$ of $n$ is zero then return($0$). \\ -5. Return($1$). \\ -\hline -\end{tabular} -\end{center} -\end{small} -\caption{Algorithm mp\_reduce\_is\_2k} -\end{figure} - -\textbf{Algorithm mp\_reduce\_is\_2k.} -This algorithm quickly determines if a modulus is of the form required for algorithm mp\_reduce\_2k to function properly. - -\index{bn\_mp\_reduce\_is\_2k.c} -\vspace{+3mm}\begin{small} -\hspace{-5.1mm}{\bf File}: bn\_mp\_reduce\_is\_2k.c -\vspace{-3mm} -\begin{alltt} -016 -017 /* determines if mp_reduce_2k can be used */ -018 int -019 mp_reduce_is_2k(mp_int *a) -020 \{ -021 int ix, iy; -022 -023 if (a->used == 0) \{ -024 return 0; -025 \} else if (a->used == 1) \{ -026 return 1; -027 \} else if (a->used > 1) \{ -028 iy = mp_count_bits(a); -029 for (ix = DIGIT_BIT; ix < iy; ix++) \{ -030 if ((a->dp[ix/DIGIT_BIT] & -031 ((mp_digit)1 << (mp_digit)(ix % DIGIT_BIT))) == 0) \{ -032 return 0; -033 \} -034 \} -035 \} -036 return 1; -037 \} -038 -\end{alltt} -\end{small} - - - -\section{Algorithm Comparison} -So far three very different algorithms for modular reduction have been discussed. Each of the algorithms have their own strengths and weaknesses -that makes having such a selection very useful. The following table sumarizes the three algorithms along with comparisons of work factors. Since -all three algorithms have the restriction that $0 \le x < n^2$ and $n > 1$ those limitations are not included in the table. - -\begin{center} -\begin{small} -\begin{tabular}{|c|c|c|c|c|c|} -\hline \textbf{Method} & \textbf{Work Required} & \textbf{Limitations} & \textbf{$m = 8$} & \textbf{$m = 32$} & \textbf{$m = 64$} \\ -\hline Barrett & $m^2 + 2m - 1$ & None & $79$ & $1087$ & $4223$ \\ -\hline Montgomery & $m^2 + m$ & $n$ must be odd & $72$ & $1056$ & $4160$ \\ -\hline D.R. & $2m$ & $n = \beta^m - k$ & $16$ & $64$ & $128$ \\ -\hline -\end{tabular} -\end{small} -\end{center} - -In theory Montgomery and Barrett reductions would require roughly the same amount of time to complete. However, in practice since Montgomery -reduction can be written as a single function with the Comba technique it is much faster. Barrett reduction suffers from the overhead of -calling the half precision multipliers, addition and division by $\beta$ algorithms. - -For almost every cryptographic algorithm Montgomery reduction is the algorithm of choice. The one set of algorithms where Diminished Radix reduction truly -shines are based on the discrete logarithm problem such as Diffie-Hellman \cite{DH} and ElGamal \cite{ELGAMAL}. In these algorithms -primes of the form $\beta^m - k$ can be found and shared amongst users. These primes will allow the Diminished Radix algorithm to be used in -modular exponentiation to greatly speed up the operation. - - - -\section*{Exercises} -\begin{tabular}{cl} -$\left [ 3 \right ]$ & Prove that the ``trick'' in algorithm mp\_montgomery\_setup actually \\ - & calculates the correct value of $\rho$. \\ - & \\ -$\left [ 2 \right ]$ & Devise an algorithm to reduce modulo $n + k$ for small $k$ quickly. \\ - & \\ -$\left [ 4 \right ]$ & Prove that the pseudo-code algorithm ``Diminished Radix Reduction'' \\ - & (\textit{figure~\ref{fig:DR}}) terminates. Also prove the probability that it will \\ - & terminate within $1 \le k \le 10$ iterations. \\ - & \\ -\end{tabular} - - -\chapter{Exponentiation} -Exponentiation is the operation of raising one variable to the power of another, for example, $a^b$. A variant of exponentiation, computed -in a finite field or ring, is called modular exponentiation. This latter style of operation is typically used in public key -cryptosystems such as RSA and Diffie-Hellman. The ability to quickly compute modular exponentiations is of great benefit to any -such cryptosystem and many methods have been sought to speed it up. - -\section{Exponentiation Basics} -A trivial algorithm would simply multiply $a$ against itself $b - 1$ times to compute the exponentiation desired. However, as $b$ grows in size -the number of multiplications becomes prohibitive. Imagine what would happen if $b$ $\approx$ $2^{1024}$ as is the case when computing an RSA signature -with a $1024$-bit key. Such a calculation could never be completed as it would take simply far too long. - -Fortunately there is a very simple algorithm based on the laws of exponents. Recall that $lg_a(a^b) = b$ and that $lg_a(a^ba^c) = b + c$ which -are two trivial relationships between the base and the exponent. Let $b_i$ represent the $i$'th bit of $b$ starting from the least -significant bit. If $b$ is a $k$-bit integer than the following equation is true. - -\begin{equation} -a^b = \prod_{i=0}^{k-1} a^{2^i \cdot b_i} -\end{equation} - -By taking the base $a$ logarithm of both sides of the equation the following equation is the result. - -\begin{equation} -b = \sum_{i=0}^{k-1}2^i \cdot b_i -\end{equation} - -The term $a^{2^i}$ can be found from the $i - 1$'th term by squaring the term since $\left ( a^{2^i} \right )^2$ is equal to -$a^{2^{i+1}}$. This observation forms the basis of essentially all fast exponentiation algorithms. It requires $k$ squarings and on average -$k \over 2$ multiplications to compute the result. This is indeed quite an improvement over simply multiplying by $a$ a total of $b-1$ times. - -While this current method is a considerable speed up there are further improvements to be made. For example, the $a^{2^i}$ term does not need to -be computed in an auxilary variable. Consider the following equivalent algorithm. - -\begin{figure}[!here] -\begin{small} -\begin{center} -\begin{tabular}{l} -\hline Algorithm \textbf{Left to Right Exponentiation}. \\ -\textbf{Input}. Integer $a$, $b$ and $k$ \\ -\textbf{Output}. $c = a^b$ \\ -\hline \\ -1. $c \leftarrow 1$ \\ -2. for $i$ from $k - 1$ to $0$ do \\ -\hspace{3mm}2.1 $c \leftarrow c^2$ \\ -\hspace{3mm}2.2 $c \leftarrow c \cdot a^{b_i}$ \\ -3. Return $c$. \\ -\hline -\end{tabular} -\end{center} -\end{small} -\caption{Left to Right Exponentiation} -\label{fig:LTOR} -\end{figure} - -This algorithm starts from the most significant bit and works towards the least significant bit. When the $i$'th bit of $b$ is set $a$ is -multiplied against the current product. In each iteration the product is squared which doubles the exponent of the individual terms of the -product. - -For example, let $b = 101100_2 \equiv 44_{10}$. The following chart demonstrates the actions of the algorithm. - -\newpage\begin{figure} -\begin{center} -\begin{tabular}{|c|c|} -\hline \textbf{Value of $i$} & \textbf{Value of $c$} \\ -\hline - & $1$ \\ -\hline $5$ & $a$ \\ -\hline $4$ & $a^2$ \\ -\hline $3$ & $a^4 \cdot a$ \\ -\hline $2$ & $a^8 \cdot a^2 \cdot a$ \\ -\hline $1$ & $a^{16} \cdot a^4 \cdot a^2$ \\ -\hline $0$ & $a^{32} \cdot a^8 \cdot a^4$ \\ -\hline -\end{tabular} -\end{center} -\caption{Example of Left to Right Exponentiation} -\end{figure} - -When the product $a^{32} \cdot a^8 \cdot a^4$ is simplified it is equal $a^{44}$ which is the desired exponentiation. This particular algorithm is -called ``Left to Right'' because it reads the exponent in that order. All of the exponentiation algorithms that will be presented are of this nature. - -\subsection{Single Digit Exponentiation} -The first algorithm in the series of exponentiation algorithms will be an unbounded algorithm where the exponent is a single digit. It is intended -to be used when a small power of an input is required (\textit{e.g. $a^5$}). It is faster than simply multiplying $b - 1$ times for all values of -$b$ that are greater than three. - -\newpage\begin{figure}[!here] -\begin{small} -\begin{center} -\begin{tabular}{l} -\hline Algorithm \textbf{mp\_expt\_d}. \\ -\textbf{Input}. mp\_int $a$ and mp\_digit $b$ \\ -\textbf{Output}. $c = a^b$ \\ -\hline \\ -1. $g \leftarrow a$ (\textit{mp\_init\_copy}) \\ -2. $c \leftarrow 1$ (\textit{mp\_set}) \\ -3. for $x$ from 1 to $lg(\beta)$ do \\ -\hspace{3mm}3.1 $c \leftarrow c^2$ (\textit{mp\_sqr}) \\ -\hspace{3mm}3.2 If $b$ AND $2^{lg(\beta) - 1} \ne 0$ then \\ -\hspace{6mm}3.2.1 $c \leftarrow c \cdot g$ (\textit{mp\_mul}) \\ -\hspace{3mm}3.3 $b \leftarrow b << 1$ \\ -4. Clear $g$. \\ -5. Return(\textit{MP\_OKAY}). \\ -\hline -\end{tabular} -\end{center} -\end{small} -\caption{Algorithm mp\_expt\_d} -\end{figure} - -\textbf{Algorithm mp\_expt\_d.} -This algorithm computes the value of $a$ raised to the power of a single digit $b$. It uses the left to right exponentiation algorithm to -quickly compute the exponentiation. It is loosely based on algorithm 14.79 of HAC \cite[pp. 615]{HAC} with the difference that the -exponent is a fixed width. - -A copy of $a$ is made first to allow destination variable $c$ be the same as the source variable $a$. The result is set to the initial value of -$1$ in the subsequent step. - -Inside the loop the exponent is read from the most significant bit first down to the least significant bit. First $c$ is invariably squared -on step 3.1. In the following step if the most significant bit of $b$ is one the copy of $a$ is multiplied against $c$. The value -of $b$ is shifted left one bit to make the next bit down from the most signficant bit the new most significant bit. In effect each -iteration of the loop moves the bits of the exponent $b$ upwards to the most significant location. - -\index{bn\_mp\_expt\_d.c} -\vspace{+3mm}\begin{small} -\hspace{-5.1mm}{\bf File}: bn\_mp\_expt\_d.c -\vspace{-3mm} -\begin{alltt} -016 -017 /* calculate c = a**b using a square-multiply algorithm */ -018 int -019 mp_expt_d (mp_int * a, mp_digit b, mp_int * c) -020 \{ -021 int res, x; -022 mp_int g; -023 -024 if ((res = mp_init_copy (&g, a)) != MP_OKAY) \{ -025 return res; -026 \} -027 -028 /* set initial result */ -029 mp_set (c, 1); -030 -031 for (x = 0; x < (int) DIGIT_BIT; x++) \{ -032 /* square */ -033 if ((res = mp_sqr (c, c)) != MP_OKAY) \{ -034 mp_clear (&g); -035 return res; -036 \} -037 -038 /* if the bit is set multiply */ -039 if ((b & (mp_digit) (((mp_digit)1) << (DIGIT_BIT - 1))) != 0) \{ -040 if ((res = mp_mul (c, &g, c)) != MP_OKAY) \{ -041 mp_clear (&g); -042 return res; -043 \} -044 \} -045 -046 /* shift to next bit */ -047 b <<= 1; -048 \} -049 -050 mp_clear (&g); -051 return MP_OKAY; -052 \} -\end{alltt} -\end{small} - --- Some note later. - -\section{$k$-ary Exponentiation} -When calculating an exponentiation the most time consuming bottleneck is the multiplications which are in general a small factor -slower than squaring. Recall from the previous algorithm that $b_{i}$ refers to the $i$'th bit of the exponent $b$. Suppose instead it referred to -the $i$'th $k$-bit digit of the exponent of $b$. For $k = 1$ the definitions are synonymous and for $k > 1$ algorithm~\ref{fig:KARY} -computes the same exponentiation. A group of $k$ bits from the exponent is called a \textit{window}. That is it is a small window on only a -portion of the entire exponent. Consider the following modification to the basic left to right exponentiation algorithm. - -\begin{figure}[!here] -\begin{small} -\begin{center} -\begin{tabular}{l} -\hline Algorithm \textbf{$k$-ary Exponentiation}. \\ -\textbf{Input}. Integer $a$, $b$, $k$ and $t$ \\ -\textbf{Output}. $c = a^b$ \\ -\hline \\ -1. $c \leftarrow 1$ \\ -2. for $i$ from $t - 1$ to $0$ do \\ -\hspace{3mm}2.1 $c \leftarrow c^{2^k} $ \\ -\hspace{3mm}2.2 Extract the $i$'th $k$-bit word from $b$ and store it in $g$. \\ -\hspace{3mm}2.3 $c \leftarrow c \cdot a^g$ \\ -3. Return $c$. \\ -\hline -\end{tabular} -\end{center} -\end{small} -\caption{$k$-ary Exponentiation} -\label{fig:KARY} -\end{figure} - -The squaring on step 2.1 can be calculated by squaring the value $c$ successively $k$ times. If the values of $a^g$ for $0 < g < 2^k$ have been -precomputed this algorithm requires only $t$ multiplications and $tk$ squarings. The table can be generated with $2^{k - 1} - 1$ squarings and -$2^{k - 1} + 1$ multiplications. This algorithm assumes that the number of bits in the exponent is evenly divisible by $k$. -However, when it is not the remaining $0 < x \le k - 1$ bits can be handled with algorithm~\ref{fig:LTOR}. - -Suppose $k = 4$ and $t = 100$. This modified algorithm will require $109$ multiplications and $408$ squarings to compute the exponentiation. The -original algorithm would on average have required $200$ multiplications and $400$ squrings to compute the same value. The total number of squarings -has increased slightly but the number of multiplications has nearly halved. - -\subsection{Optimal Values of $k$} -An optimal value of $k$ will minimize $2^{k} + \lceil n / k \rceil + n - 1$ for a fixed number of bits in the exponent $n$. The simplest -approach is to brute force search amongst the values $k = 2, 3, \ldots, 8$ for the lowest result. Table~\ref{fig:OPTK} lists optimal values of $k$ -for various exponent sizes and compares the number of multiplication and squarings required against algorithm~\ref{fig:LTOR}. - -\begin{figure}[here] -\begin{center} -\begin{small} -\begin{tabular}{|c|c|c|c|c|c|} -\hline \textbf{Exponent (bits)} & \textbf{Optimal $k$} & \textbf{Work at $k$} & \textbf{Work with ~\ref{fig:LTOR}} \\ -\hline $16$ & $2$ & $27$ & $24$ \\ -\hline $32$ & $3$ & $49$ & $48$ \\ -\hline $64$ & $3$ & $92$ & $96$ \\ -\hline $128$ & $4$ & $175$ & $192$ \\ -\hline $256$ & $4$ & $335$ & $384$ \\ -\hline $512$ & $5$ & $645$ & $768$ \\ -\hline $1024$ & $6$ & $1257$ & $1536$ \\ -\hline $2048$ & $6$ & $2452$ & $3072$ \\ -\hline $4096$ & $7$ & $4808$ & $6144$ \\ -\hline -\end{tabular} -\end{small} -\end{center} -\caption{Optimal Values of $k$ for $k$-ary Exponentiation} -\label{fig:OPTK} -\end{figure} - -\subsection{Sliding-Window Exponentiation} -A simple modification to the previous algorithm is only generate the upper half of the table in the range $2^{k-1} \le g < 2^k$. Essentially -this is a table for all values of $g$ where the most significant bit of $g$ is a one. However, in order for this to be allowed in the -algorithm values of $g$ in the range $0 \le g < 2^{k-1}$ must be avoided. - -Table~\ref{fig:OPTK2} lists optimal values of $k$ for various exponent sizes and compares the work required against algorithm~\ref{fig:KARY}. - -\begin{figure}[here] -\begin{center} -\begin{small} -\begin{tabular}{|c|c|c|c|c|c|} -\hline \textbf{Exponent (bits)} & \textbf{Optimal $k$} & \textbf{Work at $k$} & \textbf{Work with ~\ref{fig:KARY}} \\ -\hline $16$ & $3$ & $24$ & $27$ \\ -\hline $32$ & $3$ & $45$ & $49$ \\ -\hline $64$ & $4$ & $87$ & $92$ \\ -\hline $128$ & $4$ & $167$ & $175$ \\ -\hline $256$ & $5$ & $322$ & $335$ \\ -\hline $512$ & $6$ & $628$ & $645$ \\ -\hline $1024$ & $6$ & $1225$ & $1257$ \\ -\hline $2048$ & $7$ & $2403$ & $2452$ \\ -\hline $4096$ & $8$ & $4735$ & $4808$ \\ -\hline -\end{tabular} -\end{small} -\end{center} -\caption{Optimal Values of $k$ for Sliding Window Exponentiation} -\label{fig:OPTK2} -\end{figure} - -\newpage\begin{figure}[!here] -\begin{small} -\begin{center} -\begin{tabular}{l} -\hline Algorithm \textbf{Sliding Window $k$-ary Exponentiation}. \\ -\textbf{Input}. Integer $a$, $b$, $k$ and $t$ \\ -\textbf{Output}. $c = a^b$ \\ -\hline \\ -1. $c \leftarrow 1$ \\ -2. for $i$ from $t - 1$ to $0$ do \\ -\hspace{3mm}2.1 If the $i$'th bit of $b$ is a zero then \\ -\hspace{6mm}2.1.1 $c \leftarrow c^2$ \\ -\hspace{3mm}2.2 else do \\ -\hspace{6mm}2.2.1 $c \leftarrow c^{2^k}$ \\ -\hspace{6mm}2.2.2 Extract the $k$ bits from $(b_{i}b_{i-1}\ldots b_{i-(k-1)})$ and store it in $g$. \\ -\hspace{6mm}2.2.3 $c \leftarrow c \cdot a^g$ \\ -\hspace{6mm}2.2.4 $i \leftarrow i - k$ \\ -3. Return $c$. \\ -\hline -\end{tabular} -\end{center} -\end{small} -\caption{Sliding Window $k$-ary Exponentiation} -\end{figure} - -Similar to the previous algorithm this algorithm must have a special handler when fewer than $k$ bits are left in the exponent. While this -algorithm requires the same number of squarings it can potentially have fewer multiplications. The pre-computed table $a^g$ is also half -the size as the previous table. - -Consider the exponent $b = 111101011001000_2 \equiv 31432_{10}$ with $k = 3$ using both algorithms. The first algorithm will divide the exponent up as -the following five $3$-bit words $b \equiv \left ( 111, 101, 011, 001, 000 \right )_{2}$. The second algorithm will break the -exponent as $b \equiv \left ( 111, 101, 0, 110, 0, 100, 0 \right )_{2}$. The single digit $0$ in the second representation are where -a single squaring took place instead of a squaring and multiplication. In total the first method requires $10$ multiplications and $18$ -squarings. The second method requires $8$ multiplications and $18$ squarings. - -In general the sliding window method is never slower than the generic $k$-ary method and often it is slightly faster. - -\section{Modular Exponentiation} - -Modular exponentiation is essentially computing the power of a base within a finite field or ring. For example, computing -$d \equiv a^b \mbox{ (mod }c\mbox{)}$ is a modular exponentiation. Instead of first computing $a^b$ and then reducing it -modulo $c$ the intermediate result is reduced modulo $c$ after every squaring or multiplication operation. - -This guarantees that any intermediate result is bounded by $0 \le d \le c^2 - 2c + 1$ and can be reduced modulo $c$ quickly using -one of the algorithms presented in chapter seven. - -Before the actual modular exponentiation algorithm can be written a wrapper algorithm must be written first. This algorithm -will allow the exponent $b$ to be negative which is computed as $c \equiv \left (1 / a \right )^{\vert b \vert} \mbox{(mod }d\mbox{)}$. The -value of $(1/a) \mbox{ mod }c$ is computed using the modular inverse (\textit{see section 10.4}). If no inverse exists the algorithm -terminates with an error. - -\begin{figure}[!here] -\begin{small} -\begin{center} -\begin{tabular}{l} -\hline Algorithm \textbf{mp\_exptmod}. \\ -\textbf{Input}. mp\_int $a$, $b$ and $c$ \\ -\textbf{Output}. $y \equiv g^x \mbox{ (mod }p\mbox{)}$ \\ -\hline \\ -1. If $c.sign = MP\_NEG$ return(\textit{MP\_VAL}). \\ -2. If $b.sign = MP\_NEG$ then \\ -\hspace{3mm}2.1 $g' \leftarrow g^{-1} \mbox{ (mod }c\mbox{)}$ \\ -\hspace{3mm}2.2 $x' \leftarrow \vert x \vert$ \\ -\hspace{3mm}2.3 Compute $d \equiv g'^{x'} \mbox{ (mod }c\mbox{)}$ via recursion. \\ -3. if $p$ is odd \textbf{OR} $p$ is a D.R. modulus then \\ -\hspace{3mm}3.1 Compute $y \equiv g^{x} \mbox{ (mod }p\mbox{)}$ via algorithm mp\_exptmod\_fast. \\ -4. else \\ -\hspace{3mm}4.1 Compute $y \equiv g^{x} \mbox{ (mod }p\mbox{)}$ via algorithm s\_mp\_exptmod. \\ -\hline -\end{tabular} -\end{center} -\end{small} -\caption{Algorithm mp\_exptmod} -\end{figure} - -\textbf{Algorithm mp\_exptmod.} -The first algorithm which actually performs modular exponentiation is algorithm s\_mp\_exptmod. It is a sliding window $k$-ary algorithm -which uses Barrett reduction to reduce the product modulo $p$. The second algorithm mp\_exptmod\_fast performs the same operation -except it uses either Montgomery or Diminished Radix reduction. The two latter reduction algorithms are clumped in the same exponentiation -algorithm since their arguments are essentially the same (\textit{two mp\_ints and one mp\_digit}). - -\index{bn\_mp\_exptmod.c} -\vspace{+3mm}\begin{small} -\hspace{-5.1mm}{\bf File}: bn\_mp\_exptmod.c -\vspace{-3mm} -\begin{alltt} -016 -017 -018 /* this is a shell function that calls either the normal or Montgomery -019 * exptmod functions. Originally the call to the montgomery code was -020 * embedded in the normal function but that wasted alot of stack space -021 * for nothing (since 99% of the time the Montgomery code would be called) -022 */ -023 int -024 mp_exptmod (mp_int * G, mp_int * X, mp_int * P, mp_int * Y) -025 \{ -026 int dr; -027 -028 /* modulus P must be positive */ -029 if (P->sign == MP_NEG) \{ -030 return MP_VAL; -031 \} -032 -033 /* if exponent X is negative we have to recurse */ -034 if (X->sign == MP_NEG) \{ -035 mp_int tmpG, tmpX; -036 int err; -037 -038 /* first compute 1/G mod P */ -039 if ((err = mp_init(&tmpG)) != MP_OKAY) \{ -040 return err; -041 \} -042 if ((err = mp_invmod(G, P, &tmpG)) != MP_OKAY) \{ -043 mp_clear(&tmpG); -044 return err; -045 \} -046 -047 /* now get |X| */ -048 if ((err = mp_init(&tmpX)) != MP_OKAY) \{ -049 mp_clear(&tmpG); -050 return err; -051 \} -052 if ((err = mp_abs(X, &tmpX)) != MP_OKAY) \{ -053 mp_clear_multi(&tmpG, &tmpX, NULL); -054 return err; -055 \} -056 -057 /* and now compute (1/G)**|X| instead of G**X [X < 0] */ -058 err = mp_exptmod(&tmpG, &tmpX, P, Y); -059 mp_clear_multi(&tmpG, &tmpX, NULL); -060 return err; -061 \} -062 -063 dr = mp_dr_is_modulus(P); -064 if (dr == 0) \{ -065 dr = mp_reduce_is_2k(P) << 1; -066 \} -067 -068 /* if the modulus is odd or dr != 0 use the fast method */ -069 if (mp_isodd (P) == 1 || dr != 0) \{ -070 return mp_exptmod_fast (G, X, P, Y, dr); -071 \} else \{ -072 return s_mp_exptmod (G, X, P, Y); -073 \} -074 \} -075 -\end{alltt} -\end{small} - -In order to keep the algorithms in a known state the first step on line 29 is to reject any negative modulus as input. If the exponent is -negative the algorithm tries to perform a modular exponentiation with the modular inverse of the base $G$. The temporary variable $tmpG$ is assigned -the modular inverse of $G$ and $tmpX$ is assigned the absolute value of $X$. The algorithm will recuse with these new values with a positive -exponent. - -If the exponent is positive the algorithm resumes the exponentiation. Line 63 determines if the modulus is of the restricted Diminished Radix -form. If it is not line 65 attempts to determine if it is of a unrestricted Diminished Radix form. The integer $dr$ will take on one -of three values. - -\begin{enumerate} -\item $dr = 0$ means that the modulus is not of either restricted or unrestricted Diminished Radix form. -\item $dr = 1$ means that the modulus is of restricted Diminished Radix form. -\item $dr = 2$ means that the modulus is of unrestricted Diminished Radix form. -\end{enumerate} - -Line 69 determines if the fast modular exponentiation algorithm can be used. It is allowed if $dr \ne 0$ or if the modulus is odd. Otherwise, -the slower s\_mp\_exptmod algorithm is used which uses Barrett reduction. - -\subsection{Barrett Modular Exponentiation} - -\newpage\begin{figure}[!here] -\begin{small} -\begin{center} -\begin{tabular}{l} -\hline Algorithm \textbf{s\_mp\_exptmod}. \\ -\textbf{Input}. mp\_int $a$, $b$ and $c$ \\ -\textbf{Output}. $y \equiv g^x \mbox{ (mod }p\mbox{)}$ \\ -\hline \\ -1. $k \leftarrow lg(x)$ \\ -2. $winsize \leftarrow \left \lbrace \begin{array}{ll} - 2 & \mbox{if }k \le 7 \\ - 3 & \mbox{if }7 < k \le 36 \\ - 4 & \mbox{if }36 < k \le 140 \\ - 5 & \mbox{if }140 < k \le 450 \\ - 6 & \mbox{if }450 < k \le 1303 \\ - 7 & \mbox{if }1303 < k \le 3529 \\ - 8 & \mbox{if }3529 < k \\ - \end{array} \right .$ \\ -3. Initialize $2^{winsize}$ mp\_ints in an array named $M$ and one mp\_int named $\mu$ \\ -4. Calculate the $\mu$ required for Barrett Reduction (\textit{mp\_reduce\_setup}). \\ -5. $M_1 \leftarrow g \mbox{ (mod }p\mbox{)}$ \\ -\\ -Setup the table of small powers of $g$. First find $g^{2^{winsize}}$ and then all multiples of it. \\ -6. $k \leftarrow 2^{winsize - 1}$ \\ -7. $M_{k} \leftarrow M_1$ \\ -8. for $ix$ from 0 to $winsize - 2$ do \\ -\hspace{3mm}8.1 $M_k \leftarrow \left ( M_k \right )^2$ (\textit{mp\_sqr}) \\ -\hspace{3mm}8.2 $M_k \leftarrow M_k \mbox{ (mod }p\mbox{)}$ (\textit{mp\_reduce}) \\ -9. for $ix$ from $2^{winsize - 1} + 1$ to $2^{winsize} - 1$ do \\ -\hspace{3mm}9.1 $M_{ix} \leftarrow M_{ix - 1} \cdot M_{1}$ (\textit{mp\_mul}) \\ -\hspace{3mm}9.2 $M_{ix} \leftarrow M_{ix} \mbox{ (mod }p\mbox{)}$ (\textit{mp\_reduce}) \\ -10. $res \leftarrow 1$ \\ -\\ -Start Sliding Window. \\ -11. $mode \leftarrow 0, bitcnt \leftarrow 1, buf \leftarrow 0, digidx \leftarrow x.used - 1, bitcpy \leftarrow 0, bitbuf \leftarrow 0$ \\ -12. Loop \\ -\hspace{3mm}12.1 $bitcnt \leftarrow bitcnt - 1$ \\ -\hspace{3mm}12.2 If $bitcnt = 0$ then do \\ -\hspace{6mm}12.2.1 If $digidx = -1$ goto step 13. \\ -\hspace{6mm}12.2.2 $buf \leftarrow x_{digidx}$ \\ -\hspace{6mm}12.2.3 $digidx \leftarrow digidx - 1$ \\ -\hspace{6mm}12.2.4 $bitcnt \leftarrow lg(\beta)$ \\ -Continued on next page. \\ -\hline -\end{tabular} -\end{center} -\end{small} -\caption{Algorithm s\_mp\_exptmod} -\end{figure} - -\newpage\begin{figure}[!here] -\begin{small} -\begin{center} -\begin{tabular}{l} -\hline Algorithm \textbf{s\_mp\_exptmod} (\textit{continued}). \\ -\textbf{Input}. mp\_int $a$, $b$ and $c$ \\ -\textbf{Output}. $y \equiv g^x \mbox{ (mod }p\mbox{)}$ \\ -\hline \\ -\hspace{3mm}12.3 $y \leftarrow (buf >> (lg(\beta) - 1))$ AND $1$ \\ -\hspace{3mm}12.4 $buf \leftarrow buf << 1$ \\ -\hspace{3mm}12.5 if $mode = 0$ and $y = 0$ then goto step 12. \\ -\hspace{3mm}12.6 if $mode = 1$ and $y = 0$ then do \\ -\hspace{6mm}12.6.1 $res \leftarrow res^2$ \\ -\hspace{6mm}12.6.2 $res \leftarrow res \mbox{ (mod }p\mbox{)}$ \\ -\hspace{6mm}12.6.3 Goto step 12. \\ -\hspace{3mm}12.7 $bitcpy \leftarrow bitcpy + 1$ \\ -\hspace{3mm}12.8 $bitbuf \leftarrow bitbuf + (y << (winsize - bitcpy))$ \\ -\hspace{3mm}12.9 $mode \leftarrow 2$ \\ -\hspace{3mm}12.10 If $bitcpy = winsize$ then do \\ -\hspace{6mm}Window is full so perform the squarings and single multiplication. \\ -\hspace{6mm}12.10.1 for $ix$ from $0$ to $winsize -1$ do \\ -\hspace{9mm}12.10.1.1 $res \leftarrow res^2$ \\ -\hspace{9mm}12.10.1.2 $res \leftarrow res \mbox{ (mod }p\mbox{)}$ \\ -\hspace{6mm}12.10.2 $res \leftarrow res \cdot M_{bitbuf}$ \\ -\hspace{6mm}12.10.3 $res \leftarrow res \mbox{ (mod }p\mbox{)}$ \\ -\hspace{6mm}Reset the window. \\ -\hspace{6mm}12.10.4 $bitcpy \leftarrow 0, bitbuf \leftarrow 0, mode \leftarrow 1$ \\ -\\ -No more windows left. Check for residual bits of exponent. \\ -13. If $mode = 2$ and $bitcpy > 0$ then do \\ -\hspace{3mm}13.1 for $ix$ form $0$ to $bitcpy - 1$ do \\ -\hspace{6mm}13.1.1 $res \leftarrow res^2$ \\ -\hspace{6mm}13.1.2 $res \leftarrow res \mbox{ (mod }p\mbox{)}$ \\ -\hspace{6mm}13.1.3 $bitbuf \leftarrow bitbuf << 1$ \\ -\hspace{6mm}13.1.4 If $bitbuf$ AND $2^{winsize} \ne 0$ then do \\ -\hspace{9mm}13.1.4.1 $res \leftarrow res \cdot M_{1}$ \\ -\hspace{9mm}13.1.4.2 $res \leftarrow res \mbox{ (mod }p\mbox{)}$ \\ -14. $y \leftarrow res$ \\ -15. Clear $res$, $mu$ and the $M$ array. \\ -16. Return(\textit{MP\_OKAY}). \\ -\hline -\end{tabular} -\end{center} -\end{small} -\caption{Algorithm s\_mp\_exptmod (continued)} -\end{figure} - -\textbf{Algorithm s\_mp\_exptmod.} -This algorithm computes the $x$'th power of $g$ modulo $p$ and stores the result in $y$. It takes advantage of the Barrett reduction -algorithm to keep the product small throughout the algorithm. - -The first two steps determine the optimal window size based on the number of bits in the exponent. The larger the exponent the -larger the window size becomes. After a window size $winsize$ has been chosen an array of $2^{winsize}$ mp\_int variables is allocated. This -table will hold the values of $g^x \mbox{ (mod }p\mbox{)}$ for $2^{winsize - 1} \le x < 2^{winsize}$. - -After the table is allocated the first power of $g$ is found. Since $g \ge p$ is allowed it must be first reduced modulo $p$ to make -the rest of the algorithm more efficient. The first element of the table at $2^{winsize - 1}$ is found by squaring $M_1$ successively $winsize - 2$ -times. The rest of the table elements are found by multiplying the previous element by $M_1$ modulo $p$. - -Now that the table is available the sliding window may begin. The following list describes the functions of all the variables in the window. -\begin{enumerate} -\item The variable $mode$ dictates how the bits of the exponent are interpreted. -\begin{enumerate} - \item When $mode = 0$ the bits are ignored since no non-zero bit of the exponent has been seen yet. For example, if the exponent were simply - $1$ then there would be $lg(\beta) - 1$ zero bits before the first non-zero bit. In this case bits are ignored until a non-zero bit is found. - \item When $mode = 1$ a non-zero bit has been seen before and a new $winsize$-bit window has not been formed yet. In this mode leading $0$ bits - are read and a single squaring is performed. If a non-zero bit is read a new window is created. - \item When $mode = 2$ the algorithm is in the middle of forming a window and new bits are appended to the window from the most significant bit - downwards. -\end{enumerate} -\item The variable $bitcnt$ indicates how many bits are left in the current digit of the exponent left to be read. When it reaches zero a new digit - is fetched from the exponent. -\item The variable $buf$ holds the currently read digit of the exponent. -\item The variable $digidx$ is an index into the exponents digits. It starts at the leading digit $x.used - 1$ and moves towards the trailing digit. -\item The variable $bitcpy$ indicates how many bits are in the currently formed window. When it reaches $winsize$ the window is flushed and - the appropriate operations performed. -\item The variable $bitbuf$ holds the current bits of the window being formed. -\end{enumerate} - -All of step 12 is the window processing loop. It will iterate while there are digits available form the exponent to read. The first step -inside this loop is to extract a new digit if no more bits are available in the current digit. If there are no bits left a new digit is -read and if there are no digits left than the loop terminates. - -After a digit is made available step 12.3 will extract the most significant bit of the current digit and move all other bits in the digit -upwards. In effect the digit is read from most significant bit to least significant bit and since the digits are read from leading to -trailing edges the entire exponent is read from most significant bit to least significant bit. - -At step 12.5 if the $mode$ and currently extracted bit $y$ are both zero the bit is ignored and the next bit is read. This prevents the -algorithm from having to perform trivial squaring and reduction operations before the first non-zero bit is read. Step 12.6 and 12.7-10 handle -the two cases of $mode = 1$ and $mode = 2$ respectively. - -\begin{center} -\begin{figure}[here] -\includegraphics{pics/expt_state} -\caption{Sliding Window State Diagram} -\end{figure} -\end{center} - -By step 13 there are no more digits left in the exponent. However, there may be partial bits in the window left. If $mode = 2$ then -a Left-to-Right algorithm is used to process the remaining few bits. - -\index{bn\_s\_mp\_exptmod.c} -\vspace{+3mm}\begin{small} -\hspace{-5.1mm}{\bf File}: bn\_s\_mp\_exptmod.c -\vspace{-3mm} -\begin{alltt} -016 -017 int -018 s_mp_exptmod (mp_int * G, mp_int * X, mp_int * P, mp_int * Y) -019 \{ -020 mp_int M[256], res, mu; -021 mp_digit buf; -022 int err, bitbuf, bitcpy, bitcnt, mode, digidx, x, y, winsize; -023 -024 /* find window size */ -025 x = mp_count_bits (X); -026 if (x <= 7) \{ -027 winsize = 2; -028 \} else if (x <= 36) \{ -029 winsize = 3; -030 \} else if (x <= 140) \{ -031 winsize = 4; -032 \} else if (x <= 450) \{ -033 winsize = 5; -034 \} else if (x <= 1303) \{ -035 winsize = 6; -036 \} else if (x <= 3529) \{ -037 winsize = 7; -038 \} else \{ -039 winsize = 8; -040 \} -041 -042 #ifdef MP_LOW_MEM -043 if (winsize > 5) \{ -044 winsize = 5; -045 \} -046 #endif -047 -048 /* init M array */ -049 for (x = 0; x < (1 << winsize); x++) \{ -050 if ((err = mp_init_size (&M[x], 1)) != MP_OKAY) \{ -051 for (y = 0; y < x; y++) \{ -052 mp_clear (&M[y]); -053 \} -054 return err; -055 \} -056 \} -057 -058 /* create mu, used for Barrett reduction */ -059 if ((err = mp_init (&mu)) != MP_OKAY) \{ -060 goto __M; -061 \} -062 if ((err = mp_reduce_setup (&mu, P)) != MP_OKAY) \{ -063 goto __MU; -064 \} -065 -066 /* create M table -067 * -068 * The M table contains powers of the base, -069 * e.g. M[x] = G**x mod P -070 * -071 * The first half of the table is not -072 * computed though accept for M[0] and M[1] -073 */ -074 if ((err = mp_mod (G, P, &M[1])) != MP_OKAY) \{ -075 goto __MU; -076 \} -077 -078 /* compute the value at M[1<<(winsize-1)] by squaring -079 * M[1] (winsize-1) times -080 */ -081 if ((err = mp_copy (&M[1], &M[1 << (winsize - 1)])) != MP_OKAY) \{ -082 goto __MU; -083 \} -084 -085 for (x = 0; x < (winsize - 1); x++) \{ -086 if ((err = mp_sqr (&M[1 << (winsize - 1)], -087 &M[1 << (winsize - 1)])) != MP_OKAY) \{ -088 goto __MU; -089 \} -090 if ((err = mp_reduce (&M[1 << (winsize - 1)], P, &mu)) != MP_OKAY) \{ -091 goto __MU; -092 \} -093 \} -094 -095 /* create upper table */ -096 for (x = (1 << (winsize - 1)) + 1; x < (1 << winsize); x++) \{ -097 if ((err = mp_mul (&M[x - 1], &M[1], &M[x])) != MP_OKAY) \{ -098 goto __MU; -099 \} -100 if ((err = mp_reduce (&M[x], P, &mu)) != MP_OKAY) \{ -101 goto __MU; -102 \} -103 \} -104 -105 /* setup result */ -106 if ((err = mp_init (&res)) != MP_OKAY) \{ -107 goto __MU; -108 \} -109 mp_set (&res, 1); -110 -111 /* set initial mode and bit cnt */ -112 mode = 0; -113 bitcnt = 1; -114 buf = 0; -115 digidx = X->used - 1; -116 bitcpy = 0; -117 bitbuf = 0; -118 -119 for (;;) \{ -120 /* grab next digit as required */ -121 if (--bitcnt == 0) \{ -122 if (digidx == -1) \{ -123 break; -124 \} -125 buf = X->dp[digidx--]; -126 bitcnt = (int) DIGIT_BIT; -127 \} -128 -129 /* grab the next msb from the exponent */ -130 y = (buf >> (mp_digit)(DIGIT_BIT - 1)) & 1; -131 buf <<= (mp_digit)1; -132 -133 /* if the bit is zero and mode == 0 then we ignore it -134 * These represent the leading zero bits before the first 1 bit -135 * in the exponent. Technically this opt is not required but it -136 * does lower the # of trivial squaring/reductions used -137 */ -138 if (mode == 0 && y == 0) -139 continue; -140 -141 /* if the bit is zero and mode == 1 then we square */ -142 if (mode == 1 && y == 0) \{ -143 if ((err = mp_sqr (&res, &res)) != MP_OKAY) \{ -144 goto __RES; -145 \} -146 if ((err = mp_reduce (&res, P, &mu)) != MP_OKAY) \{ -147 goto __RES; -148 \} -149 continue; -150 \} -151 -152 /* else we add it to the window */ -153 bitbuf |= (y << (winsize - ++bitcpy)); -154 mode = 2; -155 -156 if (bitcpy == winsize) \{ -157 /* ok window is filled so square as required and multiply */ -158 /* square first */ -159 for (x = 0; x < winsize; x++) \{ -160 if ((err = mp_sqr (&res, &res)) != MP_OKAY) \{ -161 goto __RES; -162 \} -163 if ((err = mp_reduce (&res, P, &mu)) != MP_OKAY) \{ -164 goto __RES; -165 \} -166 \} -167 -168 /* then multiply */ -169 if ((err = mp_mul (&res, &M[bitbuf], &res)) != MP_OKAY) \{ -170 goto __MU; -171 \} -172 if ((err = mp_reduce (&res, P, &mu)) != MP_OKAY) \{ -173 goto __MU; -174 \} -175 -176 /* empty window and reset */ -177 bitcpy = 0; -178 bitbuf = 0; -179 mode = 1; -180 \} -181 \} -182 -183 /* if bits remain then square/multiply */ -184 if (mode == 2 && bitcpy > 0) \{ -185 /* square then multiply if the bit is set */ -186 for (x = 0; x < bitcpy; x++) \{ -187 if ((err = mp_sqr (&res, &res)) != MP_OKAY) \{ -188 goto __RES; -189 \} -190 if ((err = mp_reduce (&res, P, &mu)) != MP_OKAY) \{ -191 goto __RES; -192 \} -193 -194 bitbuf <<= 1; -195 if ((bitbuf & (1 << winsize)) != 0) \{ -196 /* then multiply */ -197 if ((err = mp_mul (&res, &M[1], &res)) != MP_OKAY) \{ -198 goto __RES; -199 \} -200 if ((err = mp_reduce (&res, P, &mu)) != MP_OKAY) \{ -201 goto __RES; -202 \} -203 \} -204 \} -205 \} -206 -207 mp_exch (&res, Y); -208 err = MP_OKAY; -209 __RES:mp_clear (&res); -210 __MU:mp_clear (&mu); -211 __M: -212 for (x = 0; x < (1 << winsize); x++) \{ -213 mp_clear (&M[x]); -214 \} -215 return err; -216 \} -\end{alltt} -\end{small} - -Lines 26 through 40 determine the optimal window size based on the length of the exponent in bits. The window divisions are sorted -from smallest to greatest so that in each \textbf{if} statement only one condition must be tested. For example, by the \textbf{if} statement -on line 32 the value of $x$ is already known to be greater than $140$. - -The conditional piece of code beginning on line @42,define@ allows the window size to be restricted to five bits. This logic is used to ensure -the table of precomputed powers of $G$ remains relatively small. - -The for loop on line 49 initializes the $M$ array while lines 59 and 62 compute the value of $\mu$ required for -Barrett reduction. - --- More later. - -\section{Quick Power of Two} -Calculating $b = 2^a$ can be performed much quicker than with any of the previous algorithms. Recall that a logical shift left $m << k$ is -equivalent to $m \cdot 2^k$. By this logic when $m = 1$ a quick power of two can be achieved. - -\begin{figure}[!here] -\begin{small} -\begin{center} -\begin{tabular}{l} -\hline Algorithm \textbf{mp\_2expt}. \\ -\textbf{Input}. integer $b$ \\ -\textbf{Output}. $a \leftarrow 2^b$ \\ -\hline \\ -1. $a \leftarrow 0$ \\ -2. If $a.alloc < \lfloor b / lg(\beta) \rfloor + 1$ then grow $a$ appropriately. \\ -3. $a.used \leftarrow \lfloor b / lg(\beta) \rfloor + 1$ \\ -4. $a_{\lfloor b / lg(\beta) \rfloor} \leftarrow 1 << (b \mbox{ mod } lg(\beta))$ \\ -5. Return(\textit{MP\_OKAY}). \\ -\hline -\end{tabular} -\end{center} -\end{small} -\caption{Algorithm mp\_2expt} -\end{figure} - -\textbf{Algorithm mp\_2expt.} - -\index{bn\_mp\_2expt.c} -\vspace{+3mm}\begin{small} -\hspace{-5.1mm}{\bf File}: bn\_mp\_2expt.c -\vspace{-3mm} -\begin{alltt} -016 -017 /* computes a = 2**b -018 * -019 * Simple algorithm which zeroes the int, grows it then just sets one bit -020 * as required. -021 */ -022 int -023 mp_2expt (mp_int * a, int b) -024 \{ -025 int res; -026 -027 mp_zero (a); -028 if ((res = mp_grow (a, b / DIGIT_BIT + 1)) != MP_OKAY) \{ -029 return res; -030 \} -031 a->used = b / DIGIT_BIT + 1; -032 a->dp[b / DIGIT_BIT] = 1 << (b % DIGIT_BIT); -033 -034 return MP_OKAY; -035 \} -\end{alltt} -\end{small} - -\chapter{Higher Level Algorithms} - -This chapter discusses the various higher level algorithms that are required to complete a well rounded multiple precision integer package. These -routines are less performance oriented than the algorithms of chapters five, six and seven but are no less important. - -The first section describes a method of integer division with remainder that is universally well known. It provides the signed division logic -for the package. The subsequent section discusses a set of algorithms which allow a single digit to be the 2nd operand for a variety of operations. -These algorithms serve mostly to simplify other algorithms where small constants are required. The last two sections discuss how to manipulate -various representations of integers. For example, converting from an mp\_int to a string of character. - -\section{Integer Division with Remainder} - -Integer division aside from modular exponentiation is most intensive algorithm to compute. - - -\section{Single Digit Helpers} -\subsection{Single Digit Addition} -\subsection{Single Digit Subtraction} -\subsection{Single Digit Multiplication} -\subsection{Single Digit Division} -\subsection{Single Digit Modulo} -\subsection{Single Digit Root Extraction} -\section{Random Number Generation} -\section{Formatted Output} -\subsection{Getting The Output Size} -\subsection{Generating Radix-n Output} -\subsection{Reading Radix-n Input} -\section{Unformatted Output} -\subsection{Getting The Output Size} -\subsection{Generating Output} -\subsection{Reading Input} - -\chapter{Number Theoretic Algorithms} -\section{Greatest Common Divisor} -\section{Least Common Multiple} -\section{Jacobi Symbol Computation} -\section{Modular Inverse} -\subsection{General Case} -\subsection{Odd Moduli} -\section{Primality Tests} -\subsection{Trial Division} -\subsection{The Fermat Test} -\subsection{The Miller-Rabin Test} -\subsection{Primality Test in a Bottle} -\subsection{The Next Prime} -\section{Root Extraction} - -\backmatter -\appendix -\begin{thebibliography}{ABCDEF} -\bibitem[1]{TAOCPV2} -Donald Knuth, \textit{The Art of Computer Programming}, Third Edition, Volume Two, Seminumerical Algorithms, Addison-Wesley, 1998 - -\bibitem[2]{HAC} -A. Menezes, P. van Oorschot, S. Vanstone, \textit{Handbook of Applied Cryptography}, CRC Press, 1996 - -\bibitem[3]{ROSE} -Michael Rosing, \textit{Implementing Elliptic Curve Cryptography}, Manning Publications, 1999 - -\bibitem[4]{COMBA} -Paul G. Comba, \textit{Exponentiation Cryptosystems on the IBM PC}. IBM Systems Journal 29(4): 526-538 (1990) - -\bibitem[5]{KARA} -A. Karatsuba, Doklay Akad. Nauk SSSR 145 (1962), pp.293-294 - -\bibitem[6]{KARAP} -Andre Weimerskirch and Christof Paar, \textit{Generalizations of the Karatsuba Algorithm for Polynomial Multiplication}, Submitted to Design, Codes and Cryptography, March 2002 - -\bibitem[7]{BARRETT} -Paul Barrett, \textit{Implementing the Rivest Shamir and Adleman Public Key Encryption Algorithm on a Standard Digital Signal Processor}, Advances in Cryptology, Crypto '86, Springer-Verlag. - -\bibitem[8]{MONT} -P.L.Montgomery. \textit{Modular multiplication without trial division}. Mathematics of Computation, 44(170):519-521, April 1985. - -\bibitem[9]{DRMET} -Chae Hoon Lim and Pil Joong Lee, \textit{Generating Efficient Primes for Discrete Log Cryptosystems}, POSTECH Information Research Laboratories - -\bibitem[10]{MMB} -J. Daemen and R. Govaerts and J. Vandewalle, \textit{Block ciphers based on Modular Arithmetic}, State and {P}rogress in the {R}esearch of {C}ryptography, 1993, pp. 80-89 - -\end{thebibliography} - -\input{tommath.ind} - -\chapter{Appendix} -\subsection*{Appendix A -- Source Listing of tommath.h} - -The following is the source listing of the header file ``tommath.h'' for the LibTomMath project. It contains many of -the definitions used throughout the code such as \textbf{mp\_int}, \textbf{MP\_PREC} and so on. The header is -presented here for completeness. - -\index{tommath.h} -\vspace{+3mm}\begin{small} -\hspace{-5.1mm}{\bf File}: tommath.h -\vspace{-3mm} -\begin{alltt} -001 /* LibTomMath, multiple-precision integer library -- Tom St Denis -002 * -003 * LibTomMath is library that provides for multiple-precision -004 * integer arithmetic as well as number theoretic functionality. -005 * -006 * The library is designed directly after the MPI library by -007 * Michael Fromberger but has been written from scratch with -008 * additional optimizations in place. -009 * -010 * The library is free for all purposes without any express -011 * guarantee it works. -012 * -013 * Tom St Denis, tomstdenis@iahu.ca, http://math.libtomcrypt.org -014 */ -015 #ifndef BN_H_ -016 #define BN_H_ -017 -018 #include -019 #include -020 #include -021 #include -022 #include -023 -024 #undef MIN -025 #define MIN(x,y) ((x)<(y)?(x):(y)) -026 #undef MAX -027 #define MAX(x,y) ((x)>(y)?(x):(y)) -028 -029 #ifdef __cplusplus -030 extern "C" \{ -031 -032 /* C++ compilers don't like assigning void * to mp_digit * */ -033 #define OPT_CAST (mp_digit *) -034 -035 #else -036 -037 /* C on the other hand doesn't care */ -038 #define OPT_CAST -039 -040 #endif -041 -042 /* some default configurations. -043 * -044 * A "mp_digit" must be able to hold DIGIT_BIT + 1 bits -045 * A "mp_word" must be able to hold 2*DIGIT_BIT + 1 bits -046 * -047 * At the very least a mp_digit must be able to hold 7 bits -048 * [any size beyond that is ok provided it doesn't overflow the data type] -049 */ -050 #ifdef MP_8BIT -051 typedef unsigned char mp_digit; -052 typedef unsigned short mp_word; -053 #elif defined(MP_16BIT) -054 typedef unsigned short mp_digit; -055 typedef unsigned long mp_word; -056 #elif defined(MP_64BIT) -057 /* for GCC only on supported platforms */ -058 #ifndef CRYPT -059 typedef unsigned long long ulong64; -060 typedef signed long long long64; -061 #endif -062 -063 typedef ulong64 mp_digit; -064 typedef unsigned long mp_word __attribute__ ((mode(TI))); -065 -066 #define DIGIT_BIT 60 -067 #else -068 /* this is the default case, 28-bit digits */ -069 -070 /* this is to make porting into LibTomCrypt easier :-) */ -071 #ifndef CRYPT -072 #if defined(_MSC_VER) || defined(__BORLANDC__) -073 typedef unsigned __int64 ulong64; -074 typedef signed __int64 long64; -075 #else -076 typedef unsigned long long ulong64; -077 typedef signed long long long64; -078 #endif -079 #endif -080 -081 typedef unsigned long mp_digit; -082 typedef ulong64 mp_word; -083 -084 #ifdef MP_31BIT -085 #define DIGIT_BIT 31 -086 #else -087 #define DIGIT_BIT 28 -088 #endif -089 #endif -090 -091 /* otherwise the bits per digit is calculated automatically from the size of - a mp_digit */ -092 #ifndef DIGIT_BIT -093 #define DIGIT_BIT ((CHAR_BIT * sizeof(mp_digit) - 1)) /* bits per di - git */ -094 #endif -095 -096 -097 #define MP_DIGIT_BIT DIGIT_BIT -098 #define MP_MASK ((((mp_digit)1)<<((mp_digit)DIGIT_BIT))-((mp_digit) - 1)) -099 #define MP_DIGIT_MAX MP_MASK -100 -101 /* equalities */ -102 #define MP_LT -1 /* less than */ -103 #define MP_EQ 0 /* equal to */ -104 #define MP_GT 1 /* greater than */ -105 -106 #define MP_ZPOS 0 /* positive integer */ -107 #define MP_NEG 1 /* negative */ -108 -109 #define MP_OKAY 0 /* ok result */ -110 #define MP_MEM -2 /* out of mem */ -111 #define MP_VAL -3 /* invalid input */ -112 #define MP_RANGE MP_VAL -113 -114 typedef int mp_err; -115 -116 /* you'll have to tune these... */ -117 extern int KARATSUBA_MUL_CUTOFF, -118 KARATSUBA_SQR_CUTOFF, -119 TOOM_MUL_CUTOFF, -120 TOOM_SQR_CUTOFF; -121 -122 /* various build options */ -123 #define MP_PREC 64 /* default digits of precision (must - be power of two) */ -124 -125 /* define this to use lower memory usage routines (exptmods mostly) */ -126 /* #define MP_LOW_MEM */ -127 -128 /* size of comba arrays, should be at least 2 * 2**(BITS_PER_WORD - BITS_PER - _DIGIT*2) */ -129 #define MP_WARRAY (1 << (sizeof(mp_word) * CHAR_BIT - 2 * DIGI - T_BIT + 1)) -130 -131 typedef struct \{ -132 int used, alloc, sign; -133 mp_digit *dp; -134 \} mp_int; -135 -136 #define USED(m) ((m)->used) -137 #define DIGIT(m,k) ((m)->dp[k]) -138 #define SIGN(m) ((m)->sign) -139 -140 /* ---> init and deinit bignum functions <--- */ -141 -142 /* init a bignum */ -143 int mp_init(mp_int *a); -144 -145 /* free a bignum */ -146 void mp_clear(mp_int *a); -147 -148 /* init a null terminated series of arguments */ -149 int mp_init_multi(mp_int *mp, ...); -150 -151 /* clear a null terminated series of arguments */ -152 void mp_clear_multi(mp_int *mp, ...); -153 -154 /* exchange two ints */ -155 void mp_exch(mp_int *a, mp_int *b); -156 -157 /* shrink ram required for a bignum */ -158 int mp_shrink(mp_int *a); -159 -160 /* grow an int to a given size */ -161 int mp_grow(mp_int *a, int size); -162 -163 /* init to a given number of digits */ -164 int mp_init_size(mp_int *a, int size); -165 -166 /* ---> Basic Manipulations <--- */ -167 -168 #define mp_iszero(a) (((a)->used == 0) ? 1 : 0) -169 #define mp_iseven(a) (((a)->used == 0 || (((a)->dp[0] & 1) == 0)) ? 1 : 0) -170 #define mp_isodd(a) (((a)->used > 0 && (((a)->dp[0] & 1) == 1)) ? 1 : 0) -171 -172 /* set to zero */ -173 void mp_zero(mp_int *a); -174 -175 /* set to a digit */ -176 void mp_set(mp_int *a, mp_digit b); -177 -178 /* set a 32-bit const */ -179 int mp_set_int(mp_int *a, unsigned int b); -180 -181 /* copy, b = a */ -182 int mp_copy(mp_int *a, mp_int *b); -183 -184 /* inits and copies, a = b */ -185 int mp_init_copy(mp_int *a, mp_int *b); -186 -187 /* trim unused digits */ -188 void mp_clamp(mp_int *a); -189 -190 /* ---> digit manipulation <--- */ -191 -192 /* right shift by "b" digits */ -193 void mp_rshd(mp_int *a, int b); -194 -195 /* left shift by "b" digits */ -196 int mp_lshd(mp_int *a, int b); -197 -198 /* c = a / 2**b */ -199 int mp_div_2d(mp_int *a, int b, mp_int *c, mp_int *d); -200 -201 /* b = a/2 */ -202 int mp_div_2(mp_int *a, mp_int *b); -203 -204 /* c = a * 2**b */ -205 int mp_mul_2d(mp_int *a, int b, mp_int *c); -206 -207 /* b = a*2 */ -208 int mp_mul_2(mp_int *a, mp_int *b); -209 -210 /* c = a mod 2**d */ -211 int mp_mod_2d(mp_int *a, int b, mp_int *c); -212 -213 /* computes a = 2**b */ -214 int mp_2expt(mp_int *a, int b); -215 -216 /* makes a pseudo-random int of a given size */ -217 int mp_rand(mp_int *a, int digits); -218 -219 /* ---> binary operations <--- */ -220 /* c = a XOR b */ -221 int mp_xor(mp_int *a, mp_int *b, mp_int *c); -222 -223 /* c = a OR b */ -224 int mp_or(mp_int *a, mp_int *b, mp_int *c); -225 -226 /* c = a AND b */ -227 int mp_and(mp_int *a, mp_int *b, mp_int *c); -228 -229 /* ---> Basic arithmetic <--- */ -230 -231 /* b = -a */ -232 int mp_neg(mp_int *a, mp_int *b); -233 -234 /* b = |a| */ -235 int mp_abs(mp_int *a, mp_int *b); -236 -237 /* compare a to b */ -238 int mp_cmp(mp_int *a, mp_int *b); -239 -240 /* compare |a| to |b| */ -241 int mp_cmp_mag(mp_int *a, mp_int *b); -242 -243 /* c = a + b */ -244 int mp_add(mp_int *a, mp_int *b, mp_int *c); -245 -246 /* c = a - b */ -247 int mp_sub(mp_int *a, mp_int *b, mp_int *c); -248 -249 /* c = a * b */ -250 int mp_mul(mp_int *a, mp_int *b, mp_int *c); -251 -252 /* b = a*a */ -253 int mp_sqr(mp_int *a, mp_int *b); -254 -255 /* a/b => cb + d == a */ -256 int mp_div(mp_int *a, mp_int *b, mp_int *c, mp_int *d); -257 -258 /* c = a mod b, 0 <= c < b */ -259 int mp_mod(mp_int *a, mp_int *b, mp_int *c); -260 -261 /* ---> single digit functions <--- */ -262 -263 /* compare against a single digit */ -264 int mp_cmp_d(mp_int *a, mp_digit b); -265 -266 /* c = a + b */ -267 int mp_add_d(mp_int *a, mp_digit b, mp_int *c); -268 -269 /* c = a - b */ -270 int mp_sub_d(mp_int *a, mp_digit b, mp_int *c); -271 -272 /* c = a * b */ -273 int mp_mul_d(mp_int *a, mp_digit b, mp_int *c); -274 -275 /* a/b => cb + d == a */ -276 int mp_div_d(mp_int *a, mp_digit b, mp_int *c, mp_digit *d); -277 -278 /* a/3 => 3c + d == a */ -279 int mp_div_3(mp_int *a, mp_int *c, mp_digit *d); -280 -281 /* c = a**b */ -282 int mp_expt_d(mp_int *a, mp_digit b, mp_int *c); -283 -284 /* c = a mod b, 0 <= c < b */ -285 int mp_mod_d(mp_int *a, mp_digit b, mp_digit *c); -286 -287 /* ---> number theory <--- */ -288 -289 /* d = a + b (mod c) */ -290 int mp_addmod(mp_int *a, mp_int *b, mp_int *c, mp_int *d); -291 -292 /* d = a - b (mod c) */ -293 int mp_submod(mp_int *a, mp_int *b, mp_int *c, mp_int *d); -294 -295 /* d = a * b (mod c) */ -296 int mp_mulmod(mp_int *a, mp_int *b, mp_int *c, mp_int *d); -297 -298 /* c = a * a (mod b) */ -299 int mp_sqrmod(mp_int *a, mp_int *b, mp_int *c); -300 -301 /* c = 1/a (mod b) */ -302 int mp_invmod(mp_int *a, mp_int *b, mp_int *c); -303 -304 /* c = (a, b) */ -305 int mp_gcd(mp_int *a, mp_int *b, mp_int *c); -306 -307 /* c = [a, b] or (a*b)/(a, b) */ -308 int mp_lcm(mp_int *a, mp_int *b, mp_int *c); -309 -310 /* finds one of the b'th root of a, such that |c|**b <= |a| -311 * -312 * returns error if a < 0 and b is even -313 */ -314 int mp_n_root(mp_int *a, mp_digit b, mp_int *c); -315 -316 /* shortcut for square root */ -317 #define mp_sqrt(a, b) mp_n_root(a, 2, b) -318 -319 /* computes the jacobi c = (a | n) (or Legendre if b is prime) */ -320 int mp_jacobi(mp_int *a, mp_int *n, int *c); -321 -322 /* used to setup the Barrett reduction for a given modulus b */ -323 int mp_reduce_setup(mp_int *a, mp_int *b); -324 -325 /* Barrett Reduction, computes a (mod b) with a precomputed value c -326 * -327 * Assumes that 0 < a <= b*b, note if 0 > a > -(b*b) then you can merely -328 * compute the reduction as -1 * mp_reduce(mp_abs(a)) [pseudo code]. -329 */ -330 int mp_reduce(mp_int *a, mp_int *b, mp_int *c); -331 -332 /* setups the montgomery reduction */ -333 int mp_montgomery_setup(mp_int *a, mp_digit *mp); -334 -335 /* computes a = B**n mod b without division or multiplication useful for -336 * normalizing numbers in a Montgomery system. -337 */ -338 int mp_montgomery_calc_normalization(mp_int *a, mp_int *b); -339 -340 /* computes x/R == x (mod N) via Montgomery Reduction */ -341 int mp_montgomery_reduce(mp_int *a, mp_int *m, mp_digit mp); -342 -343 /* returns 1 if a is a valid DR modulus */ -344 int mp_dr_is_modulus(mp_int *a); -345 -346 /* sets the value of "d" required for mp_dr_reduce */ -347 void mp_dr_setup(mp_int *a, mp_digit *d); -348 -349 /* reduces a modulo b using the Diminished Radix method */ -350 int mp_dr_reduce(mp_int *a, mp_int *b, mp_digit mp); -351 -352 /* returns true if a can be reduced with mp_reduce_2k */ -353 int mp_reduce_is_2k(mp_int *a); -354 -355 /* determines k value for 2k reduction */ -356 int mp_reduce_2k_setup(mp_int *a, mp_digit *d); -357 -358 /* reduces a modulo b where b is of the form 2**p - k [0 <= a] */ -359 int mp_reduce_2k(mp_int *a, mp_int *n, mp_digit k); -360 -361 /* d = a**b (mod c) */ -362 int mp_exptmod(mp_int *a, mp_int *b, mp_int *c, mp_int *d); -363 -364 /* ---> Primes <--- */ -365 -366 /* number of primes */ -367 #ifdef MP_8BIT -368 #define PRIME_SIZE 31 -369 #else -370 #define PRIME_SIZE 256 -371 #endif -372 -373 /* table of first PRIME_SIZE primes */ -374 extern const mp_digit __prime_tab[]; -375 -376 /* result=1 if a is divisible by one of the first PRIME_SIZE primes */ -377 int mp_prime_is_divisible(mp_int *a, int *result); -378 -379 /* performs one Fermat test of "a" using base "b". -380 * Sets result to 0 if composite or 1 if probable prime -381 */ -382 int mp_prime_fermat(mp_int *a, mp_int *b, int *result); -383 -384 /* performs one Miller-Rabin test of "a" using base "b". -385 * Sets result to 0 if composite or 1 if probable prime -386 */ -387 int mp_prime_miller_rabin(mp_int *a, mp_int *b, int *result); -388 -389 /* performs t rounds of Miller-Rabin on "a" using the first -390 * t prime bases. Also performs an initial sieve of trial -391 * division. Determines if "a" is prime with probability -392 * of error no more than (1/4)**t. -393 * -394 * Sets result to 1 if probably prime, 0 otherwise -395 */ -396 int mp_prime_is_prime(mp_int *a, int t, int *result); -397 -398 /* finds the next prime after the number "a" using "t" trials -399 * of Miller-Rabin. -400 */ -401 int mp_prime_next_prime(mp_int *a, int t); -402 -403 -404 /* ---> radix conversion <--- */ -405 int mp_count_bits(mp_int *a); -406 -407 int mp_unsigned_bin_size(mp_int *a); -408 int mp_read_unsigned_bin(mp_int *a, unsigned char *b, int c); -409 int mp_to_unsigned_bin(mp_int *a, unsigned char *b); -410 -411 int mp_signed_bin_size(mp_int *a); -412 int mp_read_signed_bin(mp_int *a, unsigned char *b, int c); -413 int mp_to_signed_bin(mp_int *a, unsigned char *b); -414 -415 int mp_read_radix(mp_int *a, char *str, int radix); -416 int mp_toradix(mp_int *a, char *str, int radix); -417 int mp_radix_size(mp_int *a, int radix); -418 -419 int mp_fread(mp_int *a, int radix, FILE *stream); -420 int mp_fwrite(mp_int *a, int radix, FILE *stream); -421 -422 #define mp_read_raw(mp, str, len) mp_read_signed_bin((mp), (str), (len)) -423 #define mp_raw_size(mp) mp_signed_bin_size(mp) -424 #define mp_toraw(mp, str) mp_to_signed_bin((mp), (str)) -425 #define mp_read_mag(mp, str, len) mp_read_unsigned_bin((mp), (str), (len)) -426 #define mp_mag_size(mp) mp_unsigned_bin_size(mp) -427 #define mp_tomag(mp, str) mp_to_unsigned_bin((mp), (str)) -428 -429 #define mp_tobinary(M, S) mp_toradix((M), (S), 2) -430 #define mp_tooctal(M, S) mp_toradix((M), (S), 8) -431 #define mp_todecimal(M, S) mp_toradix((M), (S), 10) -432 #define mp_tohex(M, S) mp_toradix((M), (S), 16) -433 -434 /* lowlevel functions, do not call! */ -435 int s_mp_add(mp_int *a, mp_int *b, mp_int *c); -436 int s_mp_sub(mp_int *a, mp_int *b, mp_int *c); -437 #define s_mp_mul(a, b, c) s_mp_mul_digs(a, b, c, (a)->used + (b)->used + 1) -438 int fast_s_mp_mul_digs(mp_int *a, mp_int *b, mp_int *c, int digs); -439 int s_mp_mul_digs(mp_int *a, mp_int *b, mp_int *c, int digs); -440 int fast_s_mp_mul_high_digs(mp_int *a, mp_int *b, mp_int *c, int digs); -441 int s_mp_mul_high_digs(mp_int *a, mp_int *b, mp_int *c, int digs); -442 int fast_s_mp_sqr(mp_int *a, mp_int *b); -443 int s_mp_sqr(mp_int *a, mp_int *b); -444 int mp_karatsuba_mul(mp_int *a, mp_int *b, mp_int *c); -445 int mp_toom_mul(mp_int *a, mp_int *b, mp_int *c); -446 int mp_karatsuba_sqr(mp_int *a, mp_int *b); -447 int mp_toom_sqr(mp_int *a, mp_int *b); -448 int fast_mp_invmod(mp_int *a, mp_int *b, mp_int *c); -449 int fast_mp_montgomery_reduce(mp_int *a, mp_int *m, mp_digit mp); -450 int mp_exptmod_fast(mp_int *G, mp_int *X, mp_int *P, mp_int *Y, int mode); -451 int s_mp_exptmod (mp_int * G, mp_int * X, mp_int * P, mp_int * Y); -452 void bn_reverse(unsigned char *s, int len); -453 -454 #ifdef __cplusplus -455 \} -456 #endif -457 -458 #endif -459 -\end{alltt} -\end{small} - -\end{document} \ No newline at end of file