diff --git a/b.bat b/b.bat index 32dee86..606db8a 100644 --- a/b.bat +++ b/b.bat @@ -1,3 +1,2 @@ -nasm -f coff timer.asm -gcc -Wall -W -O3 -fomit-frame-pointer -funroll-loops -DTIMER_X86 demo.c bn.c timer.o -o ltmdemo -rem gcc -I./mtest/ -DU_MPI -Wall -W -O3 -fomit-frame-pointer -funroll-loops -DTIMER_X86 demo.c mtest/mpi.c timer.o -o mpidemo +nasm -f elf timer.asm +gcc -Wall -W -O3 -fomit-frame-pointer -funroll-loops -DTIMER_X86 demo.c bn.c timer.o -o ltmdemo \ No newline at end of file diff --git a/bn.c b/bn.c index 040ff1c..8175c33 100644 --- a/bn.c +++ b/bn.c @@ -99,7 +99,8 @@ void dump_timings(void) memset(&functime, 0, sizeof(functime)); total = 0; for (x = 0; x < _itims; x++) { - total += timings[x].tot; + if (strcmp(timings[x].func, "_verify")) + total += timings[x].tot; /* try to find this entry */ for (y = 0; functime[y].func != NULL; y++) { @@ -1053,7 +1054,7 @@ static int fast_s_mp_mul_digs(mp_int *a, mp_int *b, mp_int *c, int digs) c->dp[digs-1] = (mp_digit)(W[digs-1] & ((mp_word)MP_MASK)); /* clear unused */ - for (ix = c->used; ix < olduse; ix++) { + for (; ix < olduse; ix++) { c->dp[ix] = 0; } @@ -1194,13 +1195,13 @@ static int fast_s_mp_mul_high_digs(mp_int *a, mp_int *b, mp_int *c, int digs) c->used = newused; /* now convert the array W downto what we need */ - for (ix = digs+1; ix < (pa+pb+1); ix++) { + for (ix = digs+1; ix < newused; ix++) { W[ix] += (W[ix-1] >> ((mp_word)DIGIT_BIT)); c->dp[ix-1] = (mp_digit)(W[ix-1] & ((mp_word)MP_MASK)); } c->dp[(pa+pb+1)-1] = (mp_digit)(W[(pa+pb+1)-1] & ((mp_word)MP_MASK)); - for (ix = c->used; ix < oldused; ix++) { + for (; ix < oldused; ix++) { c->dp[ix] = 0; } mp_clamp(c); @@ -1339,17 +1340,17 @@ static int fast_s_mp_sqr(mp_int *a, mp_int *b) b->used = newused; /* now compute digits */ - for (ix = 1; ix < (pa+pa+1); ix++) { + for (ix = 1; ix < newused; ix++) { /* double/add next digit */ W[ix] += W[ix] + W2[ix]; W[ix] = W[ix] + (W[ix-1] >> ((mp_word)DIGIT_BIT)); b->dp[ix-1] = (mp_digit)(W[ix-1] & 
((mp_word)MP_MASK)); } - b->dp[(pa+pa+1)-1] = (mp_digit)(W[(pa+pa+1)-1] & ((mp_word)MP_MASK)); + b->dp[(newused)-1] = (mp_digit)(W[(newused)-1] & ((mp_word)MP_MASK)); /* clear high */ - for (ix = b->used; ix < olduse; ix++) { + for (; ix < olduse; ix++) { b->dp[ix] = 0; } @@ -1580,9 +1581,7 @@ static int mp_karatsuba_mul(mp_int *a, mp_int *b, mp_int *c) } mp_clamp(&x0); - mp_clamp(&x1); mp_clamp(&y0); - mp_clamp(&y1); /* now calc the products x0y0 and x1y1 */ if (mp_mul(&x0, &y0, &x0y0) != MP_OKAY) goto X1Y1; /* x0y0 = x0*y0 */ @@ -1679,15 +1678,14 @@ static int mp_karatsuba_sqr(mp_int *a, mp_int *b) x1.used = a->used - B; mp_clamp(&x0); - mp_clamp(&x1); /* now calc the products x0*x0 and x1*x1 */ - if (mp_sqr(&x0, &x0x0) != MP_OKAY) goto X1X1; /* x0x0 = x0*x0 */ - if (mp_sqr(&x1, &x1x1) != MP_OKAY) goto X1X1; /* x1x1 = x1*x1 */ + if (mp_sqr(&x0, &x0x0) != MP_OKAY) goto X1X1; /* x0x0 = x0*x0 */ + if (mp_sqr(&x1, &x1x1) != MP_OKAY) goto X1X1; /* x1x1 = x1*x1 */ /* now calc x1-x0 and y1-y0 */ if (mp_sub(&x1, &x0, &t1) != MP_OKAY) goto X1X1; /* t1 = x1 - x0 */ - if (mp_sqr(&t1, &t1) != MP_OKAY) goto X1X1; /* t1 = (x1 - x0) * (y1 - y0) */ + if (mp_sqr(&t1, &t1) != MP_OKAY) goto X1X1; /* t1 = (x1 - x0) * (y1 - y0) */ /* add x0y0 */ if (mp_add(&x0x0, &x1x1, &t2) != MP_OKAY) goto X1X1; /* t2 = x0y0 + x1y1 */ @@ -2760,8 +2758,7 @@ int mp_reduce_setup(mp_int *a, mp_int *b) VERIFY(a); VERIFY(b); - mp_set(a, 1); - if ((res = mp_lshd(a, b->used * 2)) != MP_OKAY) { + if ((res = mp_2expt(a, b->used * 2 * DIGIT_BIT)) != MP_OKAY) { DECFUNC(); return res; } @@ -2876,7 +2873,6 @@ __T: mp_clear(&t); return res; } - /* computes xR^-1 == x (mod N) via Montgomery Reduction (comba) */ static int fast_mp_montgomery_reduce(mp_int *a, mp_int *m, mp_digit mp) { @@ -2884,29 +2880,53 @@ static int fast_mp_montgomery_reduce(mp_int *a, mp_int *m, mp_digit mp) mp_digit ui; mp_word W[512]; + REGFUNC("fast_mp_montgomery_reduce"); + VERIFY(a); + VERIFY(m); + /* get old used count */ olduse = 
a->used; /* grow a as required */ - if (a->alloc < m->used*2+1) { - if ((res = mp_grow(a, m->used*2+1)) != MP_OKAY) { + if (a->alloc < m->used+1) { + if ((res = mp_grow(a, m->used+1)) != MP_OKAY) { + DECFUNC(); return res; } } - /* copy and clear */ + /* copy the digits of a */ for (ix = 0; ix < a->used; ix++) { W[ix] = a->dp[ix]; } + + /* zero the high words */ for (; ix < m->used * 2 + 1; ix++) { W[ix] = 0; } - + for (ix = 0; ix < m->used; ix++) { - /* ui = ai * m' mod b */ + /* ui = ai * m' mod b + * + * We avoid a double precision multiplication (which isn't required) + * by casting the value down to a mp_digit. Note this requires that W[ix-1] have + * the carry cleared (see after the inner loop) + */ ui = (((mp_digit)(W[ix] & MP_MASK)) * mp) & MP_MASK; - /* a = a + ui * m * b^i */ + /* a = a + ui * m * b^i + * + * This is computed in place and on the fly. The multiplication + * by b^i is handled by offsetting which columns the results + * are added to. + * + * Note the comba method normally doesn't handle carries in the inner loop. + * In this case we fix the carry from the previous column since the Montgomery + * reduction requires digits of the result (so far) [see above] to work. This is + * handled by fixing up one carry after the inner loop. 
The carry fixups are done + * in order so after these loops the first m->used words of W[] have the carries + * fixed + */ { register int iy; register mp_digit *tmpx; @@ -2916,32 +2936,36 @@ static int fast_mp_montgomery_reduce(mp_int *a, mp_int *m, mp_digit mp) tmpx = m->dp; _W = W + ix; + /* inner loop */ for (iy = 0; iy < m->used; iy++) { *_W++ += ((mp_word)ui) * ((mp_word)*tmpx++); } - - /* now fix carry for W[ix+1] */ - W[ix+1] += W[ix] >> ((mp_word)DIGIT_BIT); - W[ix] &= ((mp_word)MP_MASK); } + + /* now fix carry for next digit, W[ix+1] */ + W[ix+1] += W[ix] >> ((mp_word)DIGIT_BIT); } /* nox fix rest of carries */ - for (; ix <= m->used * 2 + 1; ix++) { + for (++ix; ix <= m->used * 2 + 1; ix++) { W[ix] += (W[ix-1] >> ((mp_word)DIGIT_BIT)); - W[ix-1] &= ((mp_word)MP_MASK); } - /* copy out */ - - /* A = A/b^n */ + /* copy out, A = A/b^n + * + * The result is A/b^n but instead of converting from an array of mp_word + * to mp_digit then calling mp_rshd we just copy them in the right + * order + */ for (ix = 0; ix < m->used + 1; ix++) { - a->dp[ix] = W[ix+m->used]; + a->dp[ix] = W[ix+m->used] & ((mp_word)MP_MASK); } + /* set the max used */ a->used = m->used + 1; - /* zero oldused digits */ + /* zero oldused digits, if the input a was larger than + * m->used+1 we'll have to clear the digits */ for (; ix < olduse; ix++) { a->dp[ix] = 0; } @@ -2951,10 +2975,12 @@ static int fast_mp_montgomery_reduce(mp_int *a, mp_int *m, mp_digit mp) /* if A >= m then A = A - m */ if (mp_cmp_mag(a, m) != MP_LT) { if ((res = s_mp_sub(a, m, a)) != MP_OKAY) { + DECFUNC(); return res; } - } + } + DECFUNC(); return MP_OKAY; } @@ -3036,7 +3062,7 @@ int mp_montgomery_reduce(mp_int *a, mp_int *m, mp_digit mp) */ static int mp_exptmod_fast(mp_int *G, mp_int *X, mp_int *P, mp_int *Y) { - mp_int M[64], res; + mp_int M[256], res; mp_digit buf, mp; int err, bitbuf, bitcpy, bitcnt, mode, digidx, x, y, winsize; @@ -3048,12 +3074,14 @@ static int mp_exptmod_fast(mp_int *G, mp_int *X, mp_int *P, 
mp_int *Y) /* find window size */ x = mp_count_bits(X); - if (x <= 18) { winsize = 2; } - else if (x <= 84) { winsize = 3; } - else if (x <= 300) { winsize = 4; } - else if (x <= 930) { winsize = 5; } - else { winsize = 6; } - + if (x <= 7) { winsize = 2; } + else if (x <= 36) { winsize = 3; } + else if (x <= 140) { winsize = 4; } + else if (x <= 450) { winsize = 5; } + else if (x <= 1303) { winsize = 6; } + else if (x <= 3529) { winsize = 7; } + else { winsize = 8; } + /* init G array */ for (x = 0; x < (1<used)) != MP_OKAY) { + if ((err = mp_2expt(&res, P->used * DIGIT_BIT)) != MP_OKAY) { goto __RES; } - + /* res = R mod m */ if ((err = mp_mod(&res, P, &res)) != MP_OKAY) { goto __RES; @@ -3092,7 +3119,6 @@ static int mp_exptmod_fast(mp_int *G, mp_int *X, mp_int *P, mp_int *Y) * * The first half of the table is not computed though accept for M[0] and M[1] */ - mp_set(&M[0], 1); if ((err = mp_mod(G, P, &M[1])) != MP_OKAY) { goto __RES; } @@ -3101,7 +3127,7 @@ static int mp_exptmod_fast(mp_int *G, mp_int *X, mp_int *P, mp_int *Y) if ((err = mp_mulmod(&M[1], &res, P, &M[1])) != MP_OKAY) { goto __RES; } - + /* compute the value at M[1<<(winsize-1)] by squaring M[1] (winsize-1) times */ if ((err = mp_copy(&M[1], &M[1<<(winsize-1)])) != MP_OKAY) { goto __RES; @@ -3236,10 +3262,9 @@ __M : return err; } - int mp_exptmod(mp_int *G, mp_int *X, mp_int *P, mp_int *Y) { - mp_int M[64], res, mu; + mp_int M[256], res, mu; mp_digit buf; int err, bitbuf, bitcpy, bitcnt, mode, digidx, x, y, winsize; @@ -3258,11 +3283,13 @@ int mp_exptmod(mp_int *G, mp_int *X, mp_int *P, mp_int *Y) /* find window size */ x = mp_count_bits(X); - if (x <= 18) { winsize = 2; } - else if (x <= 84) { winsize = 3; } - else if (x <= 300) { winsize = 4; } - else if (x <= 930) { winsize = 5; } - else { winsize = 6; } + if (x <= 7) { winsize = 2; } + else if (x <= 36) { winsize = 3; } + else if (x <= 140) { winsize = 4; } + else if (x <= 450) { winsize = 5; } + else if (x <= 1303) { winsize = 6; } + else if 
(x <= 3529) { winsize = 7; } + else { winsize = 8; } /* init G array */ for (x = 0; x < (1<used = b/DIGIT_BIT + 1; + a->dp[b/DIGIT_BIT] = 1 << (b % DIGIT_BIT); + + return MP_OKAY; +} + + /* find the n'th root of an integer * * Result found such that (c)^b <= a and (c+1)^b > a diff --git a/bn.h b/bn.h index 6e7bc85..5f39cbd 100644 --- a/bn.h +++ b/bn.h @@ -158,6 +158,9 @@ int mp_mul_2(mp_int *a, mp_int *b); /* c = a mod 2^d */ int mp_mod_2d(mp_int *a, int b, mp_int *c); +/* computes a = 2^b */ +int mp_2expt(mp_int *a, int b); + /* ---> Basic arithmetic <--- */ /* b = -a */ diff --git a/bn.pdf b/bn.pdf index f9c86f2..b8152e1 100644 Binary files a/bn.pdf and b/bn.pdf differ diff --git a/bn.tex b/bn.tex index d2aab27..5c8b73e 100644 --- a/bn.tex +++ b/bn.tex @@ -1,620 +1,635 @@ -\documentclass{article} -\begin{document} - -\title{LibTomMath v0.10 \\ A Free Multiple Precision Integer Library} -\author{Tom St Denis \\ tomstdenis@iahu.ca} -\maketitle -\newpage - -\section{Introduction} -``LibTomMath'' is a free and open source library that provides multiple-precision integer functions required to form a basis -of a public key cryptosystem. LibTomMath is written entire in portable ISO C source code and designed to have an application -interface much like that of MPI from Michael Fromberger. - -LibTomMath was written from scratch by Tom St Denis but designed to be drop in replacement for the MPI package. The -algorithms within the library are derived from descriptions as provided in the Handbook of Applied Cryptography and Knuth's -``The Art of Computer Programming''. The library has been extensively optimized and should provide quite comparable -timings as compared to many free and commercial libraries. - -LibTomMath was designed with the following goals in mind: -\begin{enumerate} -\item Be a drop in replacement for MPI. -\item Be much faster than MPI. -\item Be written entirely in portable C. -\end{enumerate} - -All three goals have been achieved. 
Particularly the speed increase goal. For example, a 512-bit modular exponentiation -is eight times faster\footnote{On an Athlon XP with GCC 3.2} with LibTomMath compared to MPI. - -Being compatible with MPI means that applications that already use it can be ported fairly quickly. Currently there are -a few differences but there are many similarities. In fact the average MPI based application can be ported in under 15 -minutes. - -Thanks goes to Michael Fromberger for answering a couple questions and Colin Percival for having the patience and courtesy to -help debug and suggest optimizations. They were both of great help! - -\section{Building Against LibTomMath} - -Building against LibTomMath is very simple because there is only one source file. Simply add ``bn.c'' to your project and -copy both ``bn.c'' and ``bn.h'' into your project directory. There is no configuration nor building required before hand. - -If you are porting an MPI application to LibTomMath the first step will be to remove all references to MPI and replace them -with references to LibTomMath. For example, substitute - -\begin{verbatim} -#include "mpi.h" -\end{verbatim} - -with - -\begin{verbatim} -#include "bn.h" -\end{verbatim} - -Remove ``mpi.c'' from your project and replace it with ``bn.c''. - -\section{Programming with LibTomMath} - -\subsection{The mp\_int Structure} -All multiple precision integers are stored in a structure called \textbf{mp\_int}. A multiple precision integer is -essentially an array of \textbf{mp\_digit}. mp\_digit is defined at the top of bn.h. Its type can be changed to suit -a particular platform. - -For example, when \textbf{MP\_8BIT} is defined\footnote{When building bn.c.} a mp\_digit is a unsigned char and holds -seven bits. Similarly when \textbf{MP\_16BIT} is defined a mp\_digit is a unsigned short and holds 15 bits. -By default a mp\_digit is a unsigned long and holds 28 bits. 
- -The choice of digit is particular to the platform at hand and what available multipliers are provided. For -MP\_8BIT either a $8 \times 8 \Rightarrow 16$ or $16 \times 16 \Rightarrow 16$ multiplier is optimal. When -MP\_16BIT is defined either a $16 \times 16 \Rightarrow 32$ or $32 \times 32 \Rightarrow 32$ multiplier is optimal. By -default a $32 \times 32 \Rightarrow 64$ or $64 \times 64 \Rightarrow 64$ multiplier is optimal. - -This gives the library some flexibility. For example, a i8051 has a $8 \times 8 \Rightarrow 16$ multiplier. The -16-bit x86 instruction set has a $16 \times 16 \Rightarrow 32$ multiplier. In practice this library is not particularly -designed for small devices like an i8051 due to the size. It is possible to strip out functions which are not required -to drop the code size. More realistically the library is well suited to 32 and 64-bit processors that have decent -integer multipliers. The AMD Athlon XP and Intel Pentium 4 processors are examples of well suited processors. - -Throughout the discussions there will be references to a \textbf{used} and \textbf{alloc} members of an integer. The -used member refers to how many digits are actually used in the representation of the integer. The alloc member refers -to how many digits have been allocated off the heap. There is also the $\beta$ quantity which is equal to $2^W$ where -$W$ is the number of bits in a digit (default is 28). - -\subsection{Calling Functions} -Most functions expect pointers to mp\_int's as parameters. To save on memory usage it is possible to have source -variables as destinations. For example: -\begin{verbatim} - mp_add(&x, &y, &x); /* x = x + y */ - mp_mul(&x, &z, &x); /* x = x * z */ - mp_div_2(&x, &x); /* x = x / 2 */ -\end{verbatim} - -\section{Quick Overview} - -\subsection{Basic Functionality} -Essentially all LibTomMath functions return one of three values to indicate if the function worked as desired. 
A -function will return \textbf{MP\_OKAY} if the function was successful. A function will return \textbf{MP\_MEM} if -it ran out of memory and \textbf{MP\_VAL} if the input was invalid. - -Before an mp\_int can be used it must be initialized with - -\begin{verbatim} -int mp_init(mp_int *a); -\end{verbatim} - -For example, consider the following. - -\begin{verbatim} -#include "bn.h" -int main(void) -{ - mp_int num; - if (mp_init(&num) != MP_OKAY) { - printf("Error initializing a mp_int.\n"); - } - return 0; -} -\end{verbatim} - -A mp\_int can be freed from memory with - -\begin{verbatim} -void mp_clear(mp_int *a); -\end{verbatim} - -This will zero the memory and free the allocated data. There are a set of trivial functions to manipulate the -value of an mp\_int. - -\begin{verbatim} -/* set to zero */ -void mp_zero(mp_int *a); - -/* set to a digit */ -void mp_set(mp_int *a, mp_digit b); - -/* set a 32-bit const */ -int mp_set_int(mp_int *a, unsigned long b); - -/* init to a given number of digits */ -int mp_init_size(mp_int *a, int size); - -/* copy, b = a */ -int mp_copy(mp_int *a, mp_int *b); - -/* inits and copies, a = b */ -int mp_init_copy(mp_int *a, mp_int *b); -\end{verbatim} - -The \textbf{mp\_zero} function will clear the contents of a mp\_int and set it to positive. The \textbf{mp\_set} function -will zero the integer and set the first digit to a value specified. The \textbf{mp\_set\_int} function will zero the -integer and set the first 32-bits to a given value. It is important to note that using mp\_set can have unintended -side effects when either the MP\_8BIT or MP\_16BIT defines are enabled. By default the library will accept the -ranges of values MPI will (and more). - -The \textbf{mp\_init\_size} function will initialize the integer and set the allocated size to a given value. The -allocated digits are zero'ed by default but not marked as used. 
The \textbf{mp\_copy} function will copy the digits -(and sign) of the first parameter into the integer specified by the second parameter. The \textbf{mp\_init\_copy} will -initialize the first integer specified and copy the second one into it. Note that the order is reversed from that of -mp\_copy. This odd ``bug'' was kept to maintain compatibility with MPI. - -\subsection{Digit Manipulations} - -There are a class of functions that provide simple digit manipulations such as shifting and modulo reduction of powers -of two. - -\begin{verbatim} -/* right shift by "b" digits */ -void mp_rshd(mp_int *a, int b); - -/* left shift by "b" digits */ -int mp_lshd(mp_int *a, int b); - -/* c = a / 2^b */ -int mp_div_2d(mp_int *a, int b, mp_int *c); - -/* b = a/2 */ -int mp_div_2(mp_int *a, mp_int *b); - -/* c = a * 2^b */ -int mp_mul_2d(mp_int *a, int b, mp_int *c); - -/* b = a*2 */ -int mp_mul_2(mp_int *a, mp_int *b); - -/* c = a mod 2^d */ -int mp_mod_2d(mp_int *a, int b, mp_int *c); -\end{verbatim} - -\subsection{Basic Arithmetic} - -Next are the class of functions which provide basic arithmetic. 
- -\begin{verbatim} -/* b = -a */ -int mp_neg(mp_int *a, mp_int *b); - -/* b = |a| */ -int mp_abs(mp_int *a, mp_int *b); - -/* compare a to b */ -int mp_cmp(mp_int *a, mp_int *b); - -/* compare |a| to |b| */ -int mp_cmp_mag(mp_int *a, mp_int *b); - -/* c = a + b */ -int mp_add(mp_int *a, mp_int *b, mp_int *c); - -/* c = a - b */ -int mp_sub(mp_int *a, mp_int *b, mp_int *c); - -/* c = a * b */ -int mp_mul(mp_int *a, mp_int *b, mp_int *c); - -/* b = a^2 */ -int mp_sqr(mp_int *a, mp_int *b); - -/* a/b => cb + d == a */ -int mp_div(mp_int *a, mp_int *b, mp_int *c, mp_int *d); - -/* c = a mod b, 0 <= c < b */ -int mp_mod(mp_int *a, mp_int *b, mp_int *c); -\end{verbatim} - -\subsection{Single Digit Functions} - -\begin{verbatim} -/* compare against a single digit */ -int mp_cmp_d(mp_int *a, mp_digit b); - -/* c = a + b */ -int mp_add_d(mp_int *a, mp_digit b, mp_int *c); - -/* c = a - b */ -int mp_sub_d(mp_int *a, mp_digit b, mp_int *c); - -/* c = a * b */ -int mp_mul_d(mp_int *a, mp_digit b, mp_int *c); - -/* a/b => cb + d == a */ -int mp_div_d(mp_int *a, mp_digit b, mp_int *c, mp_digit *d); - -/* c = a mod b, 0 <= c < b */ -int mp_mod_d(mp_int *a, mp_digit b, mp_digit *c); -\end{verbatim} - -Note that care should be taken for the value of the digit passed. By default, any 28-bit integer is a valid digit that can -be passed into the function. However, if MP\_8BIT or MP\_16BIT is defined only 7 or 15-bit (respectively) integers -can be passed into it. - -\subsection{Modular Arithmetic} - -There are some trivial modular arithmetic functions. 
- -\begin{verbatim} -/* d = a + b (mod c) */ -int mp_addmod(mp_int *a, mp_int *b, mp_int *c, mp_int *d); - -/* d = a - b (mod c) */ -int mp_submod(mp_int *a, mp_int *b, mp_int *c, mp_int *d); - -/* d = a * b (mod c) */ -int mp_mulmod(mp_int *a, mp_int *b, mp_int *c, mp_int *d); - -/* c = a * a (mod b) */ -int mp_sqrmod(mp_int *a, mp_int *b, mp_int *c); - -/* c = 1/a (mod b) */ -int mp_invmod(mp_int *a, mp_int *b, mp_int *c); - -/* c = (a, b) */ -int mp_gcd(mp_int *a, mp_int *b, mp_int *c); - -/* c = [a, b] or (a*b)/(a, b) */ -int mp_lcm(mp_int *a, mp_int *b, mp_int *c); - -/* find the b'th root of a */ -int mp_n_root(mp_int *a, mp_digit b, mp_int *c); - -/* computes the jacobi c = (a | n) (or Legendre if b is prime) */ -int mp_jacobi(mp_int *a, mp_int *n, int *c); - -/* used to setup the Barrett reduction for a given modulus b */ -int mp_reduce_setup(mp_int *a, mp_int *b); - -/* Barrett Reduction, computes a (mod b) with a precomputed value c - * - * Assumes that 0 < a <= b^2, note if 0 > a > -(b^2) then you can merely - * compute the reduction as -1 * mp_reduce(mp_abs(a)) [pseudo code]. - */ -int mp_reduce(mp_int *a, mp_int *b, mp_int *c); - -/* setups the montgomery reduction */ -int mp_montgomery_setup(mp_int *a, mp_digit *mp); - -/* computes xR^-1 == x (mod N) via Montgomery Reduction */ -int mp_montgomery_reduce(mp_int *a, mp_int *m, mp_digit mp); - -/* d = a^b (mod c) */ -int mp_exptmod(mp_int *a, mp_int *b, mp_int *c, mp_int *d); -\end{verbatim} - -\subsection{Radix Conversions} -To read or store integers in other formats there are the following functions. 
- -\begin{verbatim} -int mp_unsigned_bin_size(mp_int *a); -int mp_read_unsigned_bin(mp_int *a, unsigned char *b, int c); -int mp_to_unsigned_bin(mp_int *a, unsigned char *b); - -int mp_signed_bin_size(mp_int *a); -int mp_read_signed_bin(mp_int *a, unsigned char *b, int c); -int mp_to_signed_bin(mp_int *a, unsigned char *b); - -int mp_read_radix(mp_int *a, unsigned char *str, int radix); -int mp_toradix(mp_int *a, unsigned char *str, int radix); -int mp_radix_size(mp_int *a, int radix); -\end{verbatim} - -The integers are stored in big endian format as most libraries (and MPI) expect. The \textbf{mp\_read\_radix} and -\textbf{mp\_toradix} functions read and write (respectively) null terminated ASCII strings in a given radix. Valid values -for the radix are between 2 and 64 (inclusively). - -\section{Function Analysis} - -Throughout the function analysis the variable $N$ will denote the average size of an input to a function as measured -by the number of digits it has. The variable $W$ will denote the number of bits per word and $c$ will denote a small -constant amount of work. The big-oh notation will be abused slightly to consider numbers that do not grow to infinity. -That is we shall consider $O(N/2) \ne O(N)$ which is an abuse of the notation. - -\subsection{Digit Manipulation Functions} -The class of digit manipulation functions such as \textbf{mp\_rshd}, \textbf{mp\_lshd} and \textbf{mp\_mul\_2} are all -very simple functions to analyze. - -\subsubsection{mp\_rshd(mp\_int *a, int b)} -Shifts $a$ by given number of digits to the right and is equivalent to dividing by $\beta^b$. The work is performed -in-place which means the input and output are the same. If the shift count $b$ is less than or equal to zero -the function returns without doing any work. If the the shift count is larger than the number of digits in $a$ -then $a$ is simply zeroed without shifting digits. - -This function requires no additional memory and $O(N)$ time. 
- -\subsubsection{mp\_lshd(mp\_int *a, int b)} -Shifts $a$ by a given number of digits to the left and is equivalent to multiplying by $\beta^b$. The work -is performed in-place which means the input and output are the same. If the shift count $b$ is less than or equal -to zero the function returns success without doing any work. - -This function requires $O(b)$ additional digits of memory and $O(N)$ time. - -\subsubsection{mp\_div\_2d(mp\_int *a, int b, mp\_int *c, mp\_int *d)} -Shifts $a$ by a given number of \textbf{bits} to the right and is equivalent to dividing by $2^b$. The shifted number is stored -in the $c$ parameter. The remainder of $a/2^b$ is optionally stored in $d$ (if it is not passed as NULL). -If the shift count $b$ is less than or equal to zero the function places $a$ in $c$ and returns success. - -This function requires $O(2 \cdot N)$ additional digits of memory and $O(2 \cdot N)$ time. - -\subsubsection{mp\_mul\_2d(mp\_int *a, int b, mp\_int *c)} -Shifts $a$ by a given number of bits to the left and is equivalent to multiplying by $2^b$. The shifted number -is placed in the $c$ parameter. If the shift count $b$ is less than or equal to zero the function places $a$ -in $c$ and returns success. - -This function requires $O(N)$ additional digits of memory and $O(2 \cdot N)$ time. - -\subsubsection{mp\_mod\_2d(mp\_int *a, int b, mp\_int *c)} -Performs the action of reducing $a$ modulo $2^b$ and stores the result in $c$. If the shift count $b$ is less than -or equal to zero the function places $a$ in $c$ and returns success. - -This function requires $O(N)$ additional digits of memory and $O(2 \cdot N)$ time. - -\subsection{Basic Arithmetic} - -\subsubsection{mp\_cmp(mp\_int *a, mp\_int *b)} -Performs a \textbf{signed} comparison between $a$ and $b$ returning \textbf{MP\_GT} is $a$ is larger than $b$. - -This function requires no additional memory and $O(N)$ time. 
- -\subsubsection{mp\_cmp\_mag(mp\_int *a, mp\_int *b)} -Performs a \textbf{unsigned} comparison between $a$ and $b$ returning \textbf{MP\_GT} is $a$ is larger than $b$. Note -that this comparison is unsigned which means it will report, for example, $-5 > 3$. By comparison mp\_cmp will -report $-5 < 3$. - -This function requires no additional memory and $O(N)$ time. - -\subsubsection{mp\_add(mp\_int *a, mp\_int *b, mp\_int *c)} -Computes $c = a + b$ using signed arithmetic. Handles the sign of the numbers which means it will subtract as -required, e.g. $a + -b$ turns into $a - b$. - -This function requires no additional memory and $O(N)$ time. - -\subsubsection{mp\_sub(mp\_int *a, mp\_int *b, mp\_int *c)} -Computes $c = a - b$ using signed arithmetic. Handles the sign of the numbers which means it will add as -required, e.g. $a - -b$ turns into $a + b$. - -This function requires no additional memory and $O(N)$ time. - -\subsubsection{mp\_mul(mp\_int *a, mp\_int *b, mp\_int *c)} -Computes $c = a \cdot b$ using signed arithmetic. Handles the sign of the numbers correctly which means it will -correct the sign of the product as required, e.g. $a \cdot -b$ turns into $-ab$. - -For relatively small inputs, that is less than 80 digits a standard baseline or comba-baseline multiplier is used. It -requires no additional memory and $O(N^2)$ time. The comba-baseline multiplier is only used if it can safely be used -without losing carry digits. The comba method is faster than the baseline method but cannot always be used which is why -both are provided. The code will automatically determine when it can be used. If the digit count is higher -than 80 for the inputs than a Karatsuba multiplier is used which requires approximately $O(6 \cdot N)$ memory and -$O(N^{lg(3)})$ time. - -\subsubsection{mp\_sqr(mp\_int *a, mp\_int *b)} -Computes $b = a^2$. -For relatively small inputs, that is less than 80 digits a modified squaring or comba-squaring algorithm is used. 
It -requires no additional memory and $O((N^2 + N)/2)$ time. The comba-squaring method is used only if it can be safely used -without losing carry digits. After 80 digits a Karatsuba squaring algorithm is used whcih requires approximately -$O(4 \cdot N)$ memory and $O(N^{lg(3)})$ time. - -\subsubsection{mp\_div(mp\_int *a, mp\_int *b, mp\_int *c, mp\_int *d)} -Computes $c = \lfloor a/b \rfloor$ and $d \equiv a \mbox{ (mod }b\mbox{)}$. The division is signed which means the sign -of the output is not always positive. The sign of the remainder equals the sign of $a$ while the sign of the -quotient equals the product of the ratios $(a/\vert a \vert) \cdot (b/\vert b \vert)$. Both $c$ and $d$ can be -optionally passed as NULL if the value is not desired. For example, if you want only the quotient of $x/y$ then -mp\_div(\&x, \&y, \&z, NULL) is acceptable. - -This function requires $O(4 \cdot N)$ memory and $O(3 \cdot N^2)$ time. - -\subsubsection{mp\_mod(mp\_int *a, mp\_int *b, mp\_int *c)} -Computes $c \equiv a \mbox{ (mod }b\mbox{)}$ but with the added condition that $0 \le c < b$. That is a normal -division is performed and if the remainder is negative $b$ is added to it. Since adding $b$ modulo $b$ is equivalent -to adding zero ($0 \equiv b \mbox{ (mod }b\mbox{)}$) the result is accurate. The results are undefined -when $b \le 0$, in theory the routine will still give a properly congruent answer but it will not always be positive. - -This function requires $O(4 \cdot N)$ memory and $O(3 \cdot N^2)$ time. - -\subsection{Number Theoretic Functions} - -\subsubsection{mp\_addmod, mp\_submod, mp\_mulmod, mp\_sqrmod} -These functions take the time of their host function plus the time it takes to perform a division. For example, -mp\_addmod takes $O(N + 3 \cdot N^2)$ time. Note that if you are performing many modular operations in a row with -the same modulus you should consider Barrett reductions. 
- -Also note that these functions use mp\_mod which means the result are guaranteed to be positive. - -\subsubsection{mp\_invmod(mp\_int *a, mp\_int *b, mp\_int *c)} -This function will find $c = 1/a \mbox{ (mod }b\mbox{)}$ for any value of $a$ such that $(a, b) = 1$ and $b > 0$. When -$b$ is odd a ``fast'' variant is used which finds the inverse twice as fast. - -\subsubsection{mp\_gcd(mp\_int *a, mp\_int *b, mp\_int *c)} -Finds the greatest common divisor of both $a$ and $b$ and places the result in $c$. Will work with either positive -or negative inputs. - -Functions requires no additional memory and approximately $O(N \cdot log(N))$ time. - -\subsubsection{mp\_lcm(mp\_int *a, mp\_int *b, mp\_int *c)} -Finds the least common multiple of both $a$ and $b$ and places the result in $c$. Will work with either positive -or negative inputs. This is calculated by dividing the product of $a$ and $b$ by the greatest common divisor of -both. - -Functions requires no additional memory and approximately $O(4 \cdot N^2)$ time. - -\subsubsection{mp\_n\_root(mp\_int *a, mp\_digit b, mp\_int c)} -Finds the $b$'th root of $a$ and stores it in $b$. The roots are found such that $\vert c \vert^b \le \vert a \vert$. -Uses the Newton approximation approach which means it converges in $O(log \beta^N)$ time to a final result. Each iteration -requires $b$ multiplications and one division for a total work of $O(6N^2 \cdot log \beta^N) = O(6N^3 \cdot log \beta)$. - -If the input $a$ is negative and $b$ is even the function returns an error. Otherwise the function will return a root -that has a sign that agrees with the sign of $a$. - -\subsubsection{mp\_jacobi(mp\_int *a, mp\_int *n, int *c)} -Computes $c = \left ( {a \over n} \right )$ or the Jacobi function of $(a, n)$ and stores the result in an integer addressed -by $c$. 
Since the result of the Jacobi function $\left ( {a \over n} \right ) \in \lbrace -1, 0, 1 \rbrace$ it seemed -natural to store the result in a simple C style \textbf{int}. If $n$ is prime then the Jacobi function produces -the same results as the Legendre function\footnote{Source: Handbook of Applied Cryptography, pp. 73}. This means if -$n$ is prime then $\left ( {a \over n} \right )$ is equal to $1$ if $a$ is a quadratic residue modulo $n$ or $-1$ if -it is not. - -\subsubsection{mp\_exptmod(mp\_int *a, mp\_int *b, mp\_int *c, mp\_int *d)} -Computes $d = a^b \mbox{ (mod }c\mbox{)}$ using a sliding window $k$-ary exponentiation algorithm. For an $\alpha$-bit -exponent it performs $\alpha$ squarings and at most $\lfloor \alpha/k \rfloor + k$ multiplications. The value of $k$ is -chosen to minimize the number of multiplications required for a given value of $\alpha$. Barrett or Montgomery -reductions are used to reduce the squared or multiplied temporary results modulo $c$. - -\subsection{Fast Modular Reductions} - -\subsubsection{mp\_reduce(mp\_int *a, mp\_int *b, mp\_int *c)} -Computes a Barrett reduction in-place of $a$ modulo $b$ with respect to $c$. In essence it computes -$a \equiv a \mbox{ (mod }b\mbox{)}$ provided $0 \le a \le b^2$. The value of $c$ is precomputed with the -function mp\_reduce\_setup(). - -The Barrett reduction function has been optimized to use partial multipliers which means compared to MPI it performs -have the number of single precision multipliers (\textit{provided they have the same size digits}). The partial -multipliers (\textit{one of which is shared with mp\_mul}) both have baseline and comba variants. Barrett reduction -can reduce a number modulo a $n-$digit modulus with approximately $2n^2$ single precision multiplications. - -\subsubsection{mp\_montgomery\_reduce(mp\_int *a, mp\_int *m, mp\_digit mp)} -Computes a Montgomery reduction in-place of $a$ modulo $b$ with respect to $mp$. 
If $b$ is some $n-$digit modulus then -$R = \beta^{n+1}$. The result of this function is $aR^{-1} \mbox{ (mod }b\mbox{)}$ provided that $0 \le a \le b^2$. -The value of $mp$ is precomputed with the function mp\_montgomery\_setup(). - -The Montgomery reduction comes in two variants. A standard baseline and a fast comba method. The baseline routine -is in fact slower than the Barrett reductions, however, the comba routine is much faster. Montomgery reduction can -reduce a number modulo a $n-$digit modulus with approximately $n^2 + n$ single precision multiplications. - -Note that the final result of a Montgomery reduction is not just the value reduced modulo $b$. You have to multiply -by $R$ modulo $b$ to get the real result. At first that may not seem like such a worthwhile routine, however, the -exptmod function can be made to take advantage of this such that only one normalization at the end is required. - -\section{Timing Analysis} -\subsection{Observed Timings} -A simple test program ``demo.c'' was developed which builds with either MPI or LibTomMath (without modification). The -test was conducted on an AMD Athlon XP processor with 266Mhz DDR memory and the GCC 3.2 compiler\footnote{With build -options ``-O3 -fomit-frame-pointer -funroll-loops''}. The multiplications and squarings were repeated 100,000 times -each while the modular exponentiation (exptmod) were performed 50 times each. The ``inversions'' refers to multiplicative -inversions modulo an odd number of a given size. The RDTSC (Read Time Stamp Counter) instruction was used to measure the -time the entire iterations took and was divided by the number of iterations to get an average. The following results -were observed. 
- -\begin{small} -\begin{center} -\begin{tabular}{c|c|c|c} -\hline \textbf{Operation} & \textbf{Size (bits)} & \textbf{Time with MPI (cycles)} & \textbf{Time with LibTomMath (cycles)} \\ -\hline -Inversion & 128 & 264,083 & 59,782 \\ -Inversion & 256 & 549,370 & 146,915 \\ -Inversion & 512 & 1,675,975 & 367,172 \\ -Inversion & 1024 & 5,237,957 & 1,054,158 \\ -Inversion & 2048 & 17,871,944 & 3,459,683 \\ -Inversion & 4096 & 66,610,468 & 11,834,556 \\ -\hline -Multiply & 128 & 1,426 & 451 \\ -Multiply & 256 & 2,551 & 958 \\ -Multiply & 512 & 7,913 & 2,476 \\ -Multiply & 1024 & 28,496 & 7,927 \\ -Multiply & 2048 & 109,897 & 28,224 \\ -Multiply & 4096 & 469,970 & 101,171 \\ -\hline -Square & 128 & 1,319 & 511 \\ -Square & 256 & 1,776 & 947 \\ -Square & 512 & 5,399 & 2,153 \\ -Square & 1024 & 18,991 & 5,733 \\ -Square & 2048 & 72,126 & 17,621 \\ -Square & 4096 & 306,269 & 67,576 \\ -\hline -Exptmod & 512 & 32,021,586 & 3,118,435 \\ -Exptmod & 768 & 97,595,492 & 8,493,633 \\ -Exptmod & 1024 & 223,302,532 & 17,715,899 \\ -Exptmod & 2048 & 1,682,223,369 & 114,936,361 \\ -Exptmod & 2560 & 3,268,615,571 & 229,402,426 \\ -Exptmod & 3072 & 5,597,240,141 & 367,403,840 \\ -Exptmod & 4096 & 13,347,270,891 & 779,058,433 - -\end{tabular} -\end{center} -\end{small} - -Note that the figures do fluctuate but their magnitudes are relatively intact. The purpose of the chart is not to -get an exact timing but to compare the two libraries. For example, in all of the tests the exact time for a 512-bit -squaring operation was not the same. The observed times were all approximately 2,500 cycles, more importantly they -were always faster than the timings observed with MPI by about the same magnitude. - -\subsection{Digit Size} -The first major constribution to the time savings is the fact that 28 bits are stored per digit instead of the MPI -defualt of 16. This means in many of the algorithms the savings can be considerable. Consider a baseline multiplier -with a 1024-bit input. 
With MPI the input would be 64 16-bit digits whereas in LibTomMath it would be 37 28-bit digits. -A savings of $64^2 - 37^2 = 2727$ single precision multiplications. - -\subsection{Multiplication Algorithms} -For most inputs a typical baseline $O(n^2)$ multiplier is used which is similar to that of MPI. There are two variants -of the baseline multiplier. The normal and the fast variants. The normal baseline multiplier is the exact same as the -algorithm from MPI. The fast baseline multiplier is optimized for cases where the number of input digits $N$ is less -than or equal to $2^{w}/\beta^2$. Where $w$ is the number of bits in a \textbf{mp\_word}. By default a mp\_word is -64-bits which means $N \le 256$ is allowed which represents numbers upto $7168$ bits. - -The fast baseline multiplier is optimized by removing the carry operations from the inner loop. This is often referred -to as the ``comba'' method since it computes the products a columns first then figures out the carries. This has the -effect of making a very simple and paralizable inner loop. - -For large inputs, typically 80 digits\footnote{By default that is 2240-bits or more.} or more the Karatsuba method is -used. This method has significant overhead but an asymptotic running time of $O(n^{1.584})$ which means for fairly large -inputs this method is faster. The Karatsuba implementation is recursive which means for extremely large inputs they -will benefit from the algorithm. - -MPI only implements the slower baseline multiplier where carries are dealt with in the inner loop. As a result even at -smaller numbers (below the Karatsuba cutoff) the LibTomMath multipliers are faster. - -\subsection{Squaring Algorithms} - -Similar to the multiplication algorithms there are two baseline squaring algorithms. Both have an asymptotic running -time of $O((t^2 + t)/2)$. The normal baseline squaring is the same from MPI and the fast is a ``comba'' squaring -algorithm. 
The comba method is used if the number of digits $N$ is less than $2^{w-1}/\beta^2$ which by default -covers numbers upto $3584$ bits. - -There is also a Karatsuba squaring method which achieves a running time of $O(n^{1.584})$ after considerably large -inputs. - -MPI only implements the slower baseline squaring algorithm. As a result LibTomMath is considerably faster at squaring -than MPI is. - -\subsection{Exponentiation Algorithms} - -LibTomMath implements a sliding window $k$-ary left to right exponentiation algorithm. For a given exponent size $L$ an -appropriate window size $k$ is chosen. There are always at most $L$ modular squarings and $\lfloor L/k \rfloor$ modular -multiplications. The $k$-ary method works by precomputing values $g(x) = b^x$ for $0 \le x < 2^k$ and a given base -$b$. Then the multiplications are grouped in windows of $k$ bits. The sliding window technique has the benefit -that it can skip multiplications if there are zero bits following or preceding a window. Consider the exponent -$e = 11110001_2$ if $k = 2$ then there will be a two squarings, a multiplication of $g(3)$, two squarings, a multiplication -of $g(3)$, four squarings and and a multiplication by $g(1)$. In total there are 8 squarings and 3 multiplications. - -MPI uses a binary square-multiply method. For the same exponent $e$ it would have had 8 squarings and 5 multiplications. -There is a precomputation phase for the method LibTomMath uses but it generally cuts down considerably on the number -of multiplications. Consider a 512-bit exponent. The worst case for the LibTomMath method results in 512 squarings and -124 multiplications. The MPI method would have 512 squarings and 512 multiplications. Randomly every $2k$ bits another -multiplication is saved via the sliding-window technique on top of the savings the $k$-ary method provides. - -Both LibTomMath and MPI use Barrett reduction instead of division to reduce the numbers modulo the modulus given. 
-However, LibTomMath can take advantage of the fact that the multiplications required within the Barrett reduction -do not have to give full precision. As a result the reduction step is much faster and just as accurate. The LibTomMath code -will automatically determine at run-time (e.g. when its called) whether the faster multiplier can be used. The -faster multipliers have also been optimized into the two variants (baseline and comba baseline). - -As a result of all these changes exponentiation in LibTomMath is much faster than compared to MPI. - - - -\end{document} \ No newline at end of file +\documentclass{article} +\begin{document} + +\title{LibTomMath v0.11 \\ A Free Multiple Precision Integer Library} +\author{Tom St Denis \\ tomstdenis@iahu.ca} +\maketitle +\newpage + +\section{Introduction} +``LibTomMath'' is a free and open source library that provides multiple-precision integer functions required to form a basis +of a public key cryptosystem. LibTomMath is written entire in portable ISO C source code and designed to have an application +interface much like that of MPI from Michael Fromberger. + +LibTomMath was written from scratch by Tom St Denis but designed to be drop in replacement for the MPI package. The +algorithms within the library are derived from descriptions as provided in the Handbook of Applied Cryptography and Knuth's +``The Art of Computer Programming''. The library has been extensively optimized and should provide quite comparable +timings as compared to many free and commercial libraries. + +LibTomMath was designed with the following goals in mind: +\begin{enumerate} +\item Be a drop in replacement for MPI. +\item Be much faster than MPI. +\item Be written entirely in portable C. +\end{enumerate} + +All three goals have been achieved. Particularly the speed increase goal. For example, a 512-bit modular exponentiation +is eight times faster\footnote{On an Athlon XP with GCC 3.2} with LibTomMath compared to MPI. 
+ +Being compatible with MPI means that applications that already use it can be ported fairly quickly. Currently there are +a few differences but there are many similarities. In fact the average MPI based application can be ported in under 15 +minutes. + +Thanks goes to Michael Fromberger for answering a couple questions and Colin Percival for having the patience and courtesy to +help debug and suggest optimizations. They were both of great help! + +\section{Building Against LibTomMath} + +Building against LibTomMath is very simple because there is only one source file. Simply add ``bn.c'' to your project and +copy both ``bn.c'' and ``bn.h'' into your project directory. There is no configuration nor building required before hand. + +If you are porting an MPI application to LibTomMath the first step will be to remove all references to MPI and replace them +with references to LibTomMath. For example, substitute + +\begin{verbatim} +#include "mpi.h" +\end{verbatim} + +with + +\begin{verbatim} +#include "bn.h" +\end{verbatim} + +Remove ``mpi.c'' from your project and replace it with ``bn.c''. + +\section{Programming with LibTomMath} + +\subsection{The mp\_int Structure} +All multiple precision integers are stored in a structure called \textbf{mp\_int}. A multiple precision integer is +essentially an array of \textbf{mp\_digit}. mp\_digit is defined at the top of bn.h. Its type can be changed to suit +a particular platform. + +For example, when \textbf{MP\_8BIT} is defined\footnote{When building bn.c.} a mp\_digit is a unsigned char and holds +seven bits. Similarly when \textbf{MP\_16BIT} is defined a mp\_digit is a unsigned short and holds 15 bits. +By default a mp\_digit is a unsigned long and holds 28 bits. + +The choice of digit is particular to the platform at hand and what available multipliers are provided. For +MP\_8BIT either a $8 \times 8 \Rightarrow 16$ or $16 \times 16 \Rightarrow 16$ multiplier is optimal. 
When +MP\_16BIT is defined either a $16 \times 16 \Rightarrow 32$ or $32 \times 32 \Rightarrow 32$ multiplier is optimal. By +default a $32 \times 32 \Rightarrow 64$ or $64 \times 64 \Rightarrow 64$ multiplier is optimal. + +This gives the library some flexibility. For example, a i8051 has a $8 \times 8 \Rightarrow 16$ multiplier. The +16-bit x86 instruction set has a $16 \times 16 \Rightarrow 32$ multiplier. In practice this library is not particularly +designed for small devices like an i8051 due to the size. It is possible to strip out functions which are not required +to drop the code size. More realistically the library is well suited to 32 and 64-bit processors that have decent +integer multipliers. The AMD Athlon XP and Intel Pentium 4 processors are examples of well suited processors. + +Throughout the discussions there will be references to a \textbf{used} and \textbf{alloc} members of an integer. The +used member refers to how many digits are actually used in the representation of the integer. The alloc member refers +to how many digits have been allocated off the heap. There is also the $\beta$ quantity which is equal to $2^W$ where +$W$ is the number of bits in a digit (default is 28). + +\subsection{Calling Functions} +Most functions expect pointers to mp\_int's as parameters. To save on memory usage it is possible to have source +variables as destinations. For example: +\begin{verbatim} + mp_add(&x, &y, &x); /* x = x + y */ + mp_mul(&x, &z, &x); /* x = x * z */ + mp_div_2(&x, &x); /* x = x / 2 */ +\end{verbatim} + +\section{Quick Overview} + +\subsection{Basic Functionality} +Essentially all LibTomMath functions return one of three values to indicate if the function worked as desired. A +function will return \textbf{MP\_OKAY} if the function was successful. A function will return \textbf{MP\_MEM} if +it ran out of memory and \textbf{MP\_VAL} if the input was invalid. 
+ +Before an mp\_int can be used it must be initialized with + +\begin{verbatim} +int mp_init(mp_int *a); +\end{verbatim} + +For example, consider the following. + +\begin{verbatim} +#include "bn.h" +int main(void) +{ + mp_int num; + if (mp_init(&num) != MP_OKAY) { + printf("Error initializing a mp_int.\n"); + } + return 0; +} +\end{verbatim} + +A mp\_int can be freed from memory with + +\begin{verbatim} +void mp_clear(mp_int *a); +\end{verbatim} + +This will zero the memory and free the allocated data. There are a set of trivial functions to manipulate the +value of an mp\_int. + +\begin{verbatim} +/* set to zero */ +void mp_zero(mp_int *a); + +/* set to a digit */ +void mp_set(mp_int *a, mp_digit b); + +/* set a 32-bit const */ +int mp_set_int(mp_int *a, unsigned long b); + +/* init to a given number of digits */ +int mp_init_size(mp_int *a, int size); + +/* copy, b = a */ +int mp_copy(mp_int *a, mp_int *b); + +/* inits and copies, a = b */ +int mp_init_copy(mp_int *a, mp_int *b); +\end{verbatim} + +The \textbf{mp\_zero} function will clear the contents of a mp\_int and set it to positive. The \textbf{mp\_set} function +will zero the integer and set the first digit to a value specified. The \textbf{mp\_set\_int} function will zero the +integer and set the first 32-bits to a given value. It is important to note that using mp\_set can have unintended +side effects when either the MP\_8BIT or MP\_16BIT defines are enabled. By default the library will accept the +ranges of values MPI will (and more). + +The \textbf{mp\_init\_size} function will initialize the integer and set the allocated size to a given value. The +allocated digits are zero'ed by default but not marked as used. The \textbf{mp\_copy} function will copy the digits +(and sign) of the first parameter into the integer specified by the second parameter. The \textbf{mp\_init\_copy} will +initialize the first integer specified and copy the second one into it. 
Note that the order is reversed from that of +mp\_copy. This odd ``bug'' was kept to maintain compatibility with MPI. + +\subsection{Digit Manipulations} + +There is a class of functions that provide simple digit manipulations such as shifting and modulo reduction of powers +of two. + +\begin{verbatim} +/* right shift by "b" digits */ +void mp_rshd(mp_int *a, int b); + +/* left shift by "b" digits */ +int mp_lshd(mp_int *a, int b); + +/* c = a / 2^b */ +int mp_div_2d(mp_int *a, int b, mp_int *c); + +/* b = a/2 */ +int mp_div_2(mp_int *a, mp_int *b); + +/* c = a * 2^b */ +int mp_mul_2d(mp_int *a, int b, mp_int *c); + +/* b = a*2 */ +int mp_mul_2(mp_int *a, mp_int *b); + +/* c = a mod 2^b */ +int mp_mod_2d(mp_int *a, int b, mp_int *c); +\end{verbatim} + +\subsection{Basic Arithmetic} + +Next is the class of functions which provide basic arithmetic. + +\begin{verbatim} +/* b = -a */ +int mp_neg(mp_int *a, mp_int *b); + +/* b = |a| */ +int mp_abs(mp_int *a, mp_int *b); + +/* compare a to b */ +int mp_cmp(mp_int *a, mp_int *b); + +/* compare |a| to |b| */ +int mp_cmp_mag(mp_int *a, mp_int *b); + +/* c = a + b */ +int mp_add(mp_int *a, mp_int *b, mp_int *c); + +/* c = a - b */ +int mp_sub(mp_int *a, mp_int *b, mp_int *c); + +/* c = a * b */ +int mp_mul(mp_int *a, mp_int *b, mp_int *c); + +/* b = a^2 */ +int mp_sqr(mp_int *a, mp_int *b); + +/* a/b => cb + d == a */ +int mp_div(mp_int *a, mp_int *b, mp_int *c, mp_int *d); + +/* c = a mod b, 0 <= c < b */ +int mp_mod(mp_int *a, mp_int *b, mp_int *c); +\end{verbatim} + +\subsection{Single Digit Functions} + +\begin{verbatim} +/* compare against a single digit */ +int mp_cmp_d(mp_int *a, mp_digit b); + +/* c = a + b */ +int mp_add_d(mp_int *a, mp_digit b, mp_int *c); + +/* c = a - b */ +int mp_sub_d(mp_int *a, mp_digit b, mp_int *c); + +/* c = a * b */ +int mp_mul_d(mp_int *a, mp_digit b, mp_int *c); + +/* a/b => cb + d == a */ +int mp_div_d(mp_int *a, mp_digit b, mp_int *c, mp_digit *d); + +/* c = a mod b, 0 <= c < b */ 
+int mp_mod_d(mp_int *a, mp_digit b, mp_digit *c); +\end{verbatim} + +Note that care should be taken for the value of the digit passed. By default, any 28-bit integer is a valid digit that can +be passed into the function. However, if MP\_8BIT or MP\_16BIT is defined only 7 or 15-bit (respectively) integers +can be passed into it. + +\subsection{Modular Arithmetic} + +There are some trivial modular arithmetic functions. + +\begin{verbatim} +/* d = a + b (mod c) */ +int mp_addmod(mp_int *a, mp_int *b, mp_int *c, mp_int *d); + +/* d = a - b (mod c) */ +int mp_submod(mp_int *a, mp_int *b, mp_int *c, mp_int *d); + +/* d = a * b (mod c) */ +int mp_mulmod(mp_int *a, mp_int *b, mp_int *c, mp_int *d); + +/* c = a * a (mod b) */ +int mp_sqrmod(mp_int *a, mp_int *b, mp_int *c); + +/* c = 1/a (mod b) */ +int mp_invmod(mp_int *a, mp_int *b, mp_int *c); + +/* c = (a, b) */ +int mp_gcd(mp_int *a, mp_int *b, mp_int *c); + +/* c = [a, b] or (a*b)/(a, b) */ +int mp_lcm(mp_int *a, mp_int *b, mp_int *c); + +/* find the b'th root of a */ +int mp_n_root(mp_int *a, mp_digit b, mp_int *c); + +/* computes the jacobi c = (a | n) (or Legendre if n is prime) */ +int mp_jacobi(mp_int *a, mp_int *n, int *c); + +/* used to setup the Barrett reduction for a given modulus b */ +int mp_reduce_setup(mp_int *a, mp_int *b); + +/* Barrett Reduction, computes a (mod b) with a precomputed value c + * + * Assumes that 0 < a <= b^2, note if 0 > a > -(b^2) then you can merely + * compute the reduction as -1 * mp_reduce(mp_abs(a)) [pseudo code]. 
+ */ +int mp_reduce(mp_int *a, mp_int *b, mp_int *c); + +/* setups the montgomery reduction */ +int mp_montgomery_setup(mp_int *a, mp_digit *mp); + +/* computes xR^-1 == x (mod N) via Montgomery Reduction */ +int mp_montgomery_reduce(mp_int *a, mp_int *m, mp_digit mp); + +/* d = a^b (mod c) */ +int mp_exptmod(mp_int *a, mp_int *b, mp_int *c, mp_int *d); +\end{verbatim} + +\subsection{Radix Conversions} +To read or store integers in other formats there are the following functions. + +\begin{verbatim} +int mp_unsigned_bin_size(mp_int *a); +int mp_read_unsigned_bin(mp_int *a, unsigned char *b, int c); +int mp_to_unsigned_bin(mp_int *a, unsigned char *b); + +int mp_signed_bin_size(mp_int *a); +int mp_read_signed_bin(mp_int *a, unsigned char *b, int c); +int mp_to_signed_bin(mp_int *a, unsigned char *b); + +int mp_read_radix(mp_int *a, unsigned char *str, int radix); +int mp_toradix(mp_int *a, unsigned char *str, int radix); +int mp_radix_size(mp_int *a, int radix); +\end{verbatim} + +The integers are stored in big endian format as most libraries (and MPI) expect. The \textbf{mp\_read\_radix} and +\textbf{mp\_toradix} functions read and write (respectively) null terminated ASCII strings in a given radix. Valid values +for the radix are between 2 and 64 (inclusively). + +\section{Function Analysis} + +Throughout the function analysis the variable $N$ will denote the average size of an input to a function as measured +by the number of digits it has. The variable $W$ will denote the number of bits per word and $c$ will denote a small +constant amount of work. The big-oh notation will be abused slightly to consider numbers that do not grow to infinity. +That is we shall consider $O(N/2) \ne O(N)$ which is an abuse of the notation. + +\subsection{Digit Manipulation Functions} +The class of digit manipulation functions such as \textbf{mp\_rshd}, \textbf{mp\_lshd} and \textbf{mp\_mul\_2} are all +very simple functions to analyze. 
+ +\subsubsection{mp\_rshd(mp\_int *a, int b)} +Shifts $a$ by a given number of digits to the right and is equivalent to dividing by $\beta^b$. The work is performed +in-place which means the input and output are the same. If the shift count $b$ is less than or equal to zero +the function returns without doing any work. If the shift count is larger than the number of digits in $a$ +then $a$ is simply zeroed without shifting digits. + +This function requires no additional memory and $O(N)$ time. + +\subsubsection{mp\_lshd(mp\_int *a, int b)} +Shifts $a$ by a given number of digits to the left and is equivalent to multiplying by $\beta^b$. The work +is performed in-place which means the input and output are the same. If the shift count $b$ is less than or equal +to zero the function returns success without doing any work. + +This function requires $O(b)$ additional digits of memory and $O(N)$ time. + +\subsubsection{mp\_div\_2d(mp\_int *a, int b, mp\_int *c, mp\_int *d)} +Shifts $a$ by a given number of \textbf{bits} to the right and is equivalent to dividing by $2^b$. The shifted number is stored +in the $c$ parameter. The remainder of $a/2^b$ is optionally stored in $d$ (if it is not passed as NULL). +If the shift count $b$ is less than or equal to zero the function places $a$ in $c$ and returns success. + +This function requires $O(2 \cdot N)$ additional digits of memory and $O(2 \cdot N)$ time. + +\subsubsection{mp\_mul\_2d(mp\_int *a, int b, mp\_int *c)} +Shifts $a$ by a given number of bits to the left and is equivalent to multiplying by $2^b$. The shifted number +is placed in the $c$ parameter. If the shift count $b$ is less than or equal to zero the function places $a$ +in $c$ and returns success. + +This function requires $O(N)$ additional digits of memory and $O(2 \cdot N)$ time. + +\subsubsection{mp\_mod\_2d(mp\_int *a, int b, mp\_int *c)} +Performs the action of reducing $a$ modulo $2^b$ and stores the result in $c$. 
If the shift count $b$ is less than +or equal to zero the function places $a$ in $c$ and returns success. + +This function requires $O(N)$ additional digits of memory and $O(2 \cdot N)$ time. + +\subsection{Basic Arithmetic} + +\subsubsection{mp\_cmp(mp\_int *a, mp\_int *b)} +Performs a \textbf{signed} comparison between $a$ and $b$ returning \textbf{MP\_GT} if $a$ is larger than $b$. + +This function requires no additional memory and $O(N)$ time. + +\subsubsection{mp\_cmp\_mag(mp\_int *a, mp\_int *b)} +Performs an \textbf{unsigned} comparison between $a$ and $b$ returning \textbf{MP\_GT} if $a$ is larger than $b$. Note +that this comparison is unsigned which means it will report, for example, $-5 > 3$. By comparison mp\_cmp will +report $-5 < 3$. + +This function requires no additional memory and $O(N)$ time. + +\subsubsection{mp\_add(mp\_int *a, mp\_int *b, mp\_int *c)} +Computes $c = a + b$ using signed arithmetic. Handles the sign of the numbers which means it will subtract as +required, e.g. $a + -b$ turns into $a - b$. + +This function requires no additional memory and $O(N)$ time. + +\subsubsection{mp\_sub(mp\_int *a, mp\_int *b, mp\_int *c)} +Computes $c = a - b$ using signed arithmetic. Handles the sign of the numbers which means it will add as +required, e.g. $a - -b$ turns into $a + b$. + +This function requires no additional memory and $O(N)$ time. + +\subsubsection{mp\_mul(mp\_int *a, mp\_int *b, mp\_int *c)} +Computes $c = a \cdot b$ using signed arithmetic. Handles the sign of the numbers correctly which means it will +correct the sign of the product as required, e.g. $a \cdot -b$ turns into $-ab$. + +For relatively small inputs, that is less than 80 digits a standard baseline or comba-baseline multiplier is used. It +requires no additional memory and $O(N^2)$ time. The comba-baseline multiplier is only used if it can safely be used +without losing carry digits. 
The comba method is faster than the baseline method but cannot always be used which is why +both are provided. The code will automatically determine when it can be used. If the digit count is higher +than 80 for the inputs then a Karatsuba multiplier is used which requires approximately $O(6 \cdot N)$ memory and +$O(N^{lg(3)})$ time. + +\subsubsection{mp\_sqr(mp\_int *a, mp\_int *b)} +Computes $b = a^2$. +For relatively small inputs, that is less than 80 digits a modified squaring or comba-squaring algorithm is used. It +requires no additional memory and $O((N^2 + N)/2)$ time. The comba-squaring method is used only if it can be safely used +without losing carry digits. After 80 digits a Karatsuba squaring algorithm is used which requires approximately +$O(4 \cdot N)$ memory and $O(N^{lg(3)})$ time. + +\subsubsection{mp\_div(mp\_int *a, mp\_int *b, mp\_int *c, mp\_int *d)} +Computes $c = \lfloor a/b \rfloor$ and $d \equiv a \mbox{ (mod }b\mbox{)}$. The division is signed which means the sign +of the output is not always positive. The sign of the remainder equals the sign of $a$ while the sign of the +quotient equals the product of the ratios $(a/\vert a \vert) \cdot (b/\vert b \vert)$. Both $c$ and $d$ can be +optionally passed as NULL if the value is not desired. For example, if you want only the quotient of $x/y$ then +mp\_div(\&x, \&y, \&z, NULL) is acceptable. + +This function requires $O(4 \cdot N)$ memory and $O(3 \cdot N^2)$ time. + +\subsubsection{mp\_mod(mp\_int *a, mp\_int *b, mp\_int *c)} +Computes $c \equiv a \mbox{ (mod }b\mbox{)}$ but with the added condition that $0 \le c < b$. That is a normal +division is performed and if the remainder is negative $b$ is added to it. Since adding $b$ modulo $b$ is equivalent +to adding zero ($0 \equiv b \mbox{ (mod }b\mbox{)}$) the result is accurate. The results are undefined +when $b \le 0$, in theory the routine will still give a properly congruent answer but it will not always be positive. 
+ +This function requires $O(4 \cdot N)$ memory and $O(3 \cdot N^2)$ time. + +\subsection{Number Theoretic Functions} + +\subsubsection{mp\_addmod, mp\_submod, mp\_mulmod, mp\_sqrmod} +These functions take the time of their host function plus the time it takes to perform a division. For example, +mp\_addmod takes $O(N + 3 \cdot N^2)$ time. Note that if you are performing many modular operations in a row with +the same modulus you should consider Barrett reductions. + +Also note that these functions use mp\_mod which means the results are guaranteed to be positive. + +\subsubsection{mp\_invmod(mp\_int *a, mp\_int *b, mp\_int *c)} +This function will find $c = 1/a \mbox{ (mod }b\mbox{)}$ for any value of $a$ such that $(a, b) = 1$ and $b > 0$. When +$b$ is odd a ``fast'' variant is used which finds the inverse twice as fast. + +\subsubsection{mp\_gcd(mp\_int *a, mp\_int *b, mp\_int *c)} +Finds the greatest common divisor of both $a$ and $b$ and places the result in $c$. Will work with either positive +or negative inputs. + +This function requires no additional memory and approximately $O(N \cdot log(N))$ time. + +\subsubsection{mp\_lcm(mp\_int *a, mp\_int *b, mp\_int *c)} +Finds the least common multiple of both $a$ and $b$ and places the result in $c$. Will work with either positive +or negative inputs. This is calculated by dividing the product of $a$ and $b$ by the greatest common divisor of +both. + +This function requires no additional memory and approximately $O(4 \cdot N^2)$ time. + +\subsubsection{mp\_n\_root(mp\_int *a, mp\_digit b, mp\_int *c)} +Finds the $b$'th root of $a$ and stores it in $c$. The roots are found such that $\vert c \vert^b \le \vert a \vert$. +Uses the Newton approximation approach which means it converges in $O(log \beta^N)$ time to a final result. Each iteration +requires $b$ multiplications and one division for a total work of $O(6N^2 \cdot log \beta^N) = O(6N^3 \cdot log \beta)$. 
+ +If the input $a$ is negative and $b$ is even the function returns an error. Otherwise the function will return a root +that has a sign that agrees with the sign of $a$. + +\subsubsection{mp\_jacobi(mp\_int *a, mp\_int *n, int *c)} +Computes $c = \left ( {a \over n} \right )$ or the Jacobi function of $(a, n)$ and stores the result in an integer addressed +by $c$. Since the result of the Jacobi function $\left ( {a \over n} \right ) \in \lbrace -1, 0, 1 \rbrace$ it seemed +natural to store the result in a simple C style \textbf{int}. If $n$ is prime then the Jacobi function produces +the same results as the Legendre function\footnote{Source: Handbook of Applied Cryptography, pp. 73}. This means if +$n$ is prime then $\left ( {a \over n} \right )$ is equal to $1$ if $a$ is a quadratic residue modulo $n$ or $-1$ if +it is not. + +\subsubsection{mp\_exptmod(mp\_int *a, mp\_int *b, mp\_int *c, mp\_int *d)} +Computes $d = a^b \mbox{ (mod }c\mbox{)}$ using a sliding window $k$-ary exponentiation algorithm. For an $\alpha$-bit +exponent it performs $\alpha$ squarings and at most $\lfloor \alpha/k \rfloor + 2^{k-1}$ multiplications. The value of $k$ is +chosen to minimize the number of multiplications required for a given value of $\alpha$. Barrett or Montgomery +reductions are used to reduce the squared or multiplied temporary results modulo $c$. + +\subsection{Fast Modular Reductions} + +\subsubsection{mp\_reduce(mp\_int *a, mp\_int *b, mp\_int *c)} +Computes a Barrett reduction in-place of $a$ modulo $b$ with respect to $c$. In essence it computes +$a \equiv a \mbox{ (mod }b\mbox{)}$ provided $0 \le a \le b^2$. The value of $c$ is precomputed with the +function mp\_reduce\_setup(). The modulus $b$ must be larger than zero. + +The Barrett reduction function has been optimized to use partial multipliers which means compared to MPI it performs +half the number of single precision multiplications (\textit{provided they have the same size digits}). 
The partial +multipliers (\textit{one of which is shared with mp\_mul}) both have baseline and comba variants. Barrett reduction +can reduce a number modulo a $n-$digit modulus with approximately $2n^2$ single precision multiplications. + +\subsubsection{mp\_montgomery\_reduce(mp\_int *a, mp\_int *m, mp\_digit mp)} +Computes a Montgomery reduction in-place of $a$ modulo $b$ with respect to $mp$. If $b$ is some $n-$digit modulus then +$R = \beta^{n+1}$. The result of this function is $aR^{-1} \mbox{ (mod }b\mbox{)}$ provided that $0 \le a \le b^2$. +The value of $mp$ is precomputed with the function mp\_montgomery\_setup(). The modulus $b$ must be odd and larger +than zero. + +The Montgomery reduction comes in two variants. A standard baseline and a fast comba method. The baseline routine +is in fact slower than the Barrett reductions, however, the comba routine is much faster. Montgomery reduction can +reduce a number modulo a $n-$digit modulus with approximately $n^2 + n$ single precision multiplications. Compared +to Barrett reductions the Montgomery reduction requires half as many multiplications as $n \rightarrow \infty$. + +Note that the final result of a Montgomery reduction is not just the value reduced modulo $b$. You have to multiply +by $R$ modulo $b$ to get the real result. At first that may not seem like such a worthwhile routine, however, the +exptmod function can be made to take advantage of this such that only one normalization at the end is required. + +This stems from the fact that if $a \rightarrow aR^{-1}$ through Montgomery reduction and if $a = vR$ and $b = uR$ then +$a^2 \rightarrow v^2R^2R^{-1} \equiv v^2R$ and $ab \rightarrow uvRRR^{-1} \equiv uvR$. The next useful observation is +that through the reduction $a \rightarrow vRR^{-1} \equiv v$ which means given a final result it can be normalized with +a single reduction. 
Now a series of complicated modular operations can be optimized if all the variables are initially +multiplied by $R$ then the final result normalized by performing an extra reduction. + +If many variables are to be normalized the simplest method to set up the variables is to first compute $\hat x \equiv R^2 \mbox{ mod }m$. +Now all the variables in the system can be multiplied by $\hat x$ and reduced with Montgomery reduction. This means that +two long divisions would be required to set up $\hat x$ and a multiplication followed by reduction for each variable. + +A very useful observation is that multiplying by $R = \beta^n$ amounts to performing a left shift by $n$ positions which +requires no single precision multiplications. + +\section{Timing Analysis} +\subsection{Observed Timings} +A simple test program ``demo.c'' was developed which builds with either MPI or LibTomMath (without modification). The +test was conducted on an AMD Athlon XP processor with 266MHz DDR memory and the GCC 3.2 compiler\footnote{With build +options ``-O3 -fomit-frame-pointer -funroll-loops''}. The multiplications and squarings were repeated 100,000 times +each while the modular exponentiations (exptmod) were performed 50 times each. The ``inversions'' refers to multiplicative +inversions modulo an odd number of a given size. The RDTSC (Read Time Stamp Counter) instruction was used to measure the +time the entire iterations took and was divided by the number of iterations to get an average. The following results +were observed. 
+ +\begin{small} +\begin{center} +\begin{tabular}{c|c|c|c} +\hline \textbf{Operation} & \textbf{Size (bits)} & \textbf{Time with MPI (cycles)} & \textbf{Time with LibTomMath (cycles)} \\ +\hline +Inversion & 128 & 264,083 & 59,782 \\ +Inversion & 256 & 549,370 & 146,915 \\ +Inversion & 512 & 1,675,975 & 367,172 \\ +Inversion & 1024 & 5,237,957 & 1,054,158 \\ +Inversion & 2048 & 17,871,944 & 3,459,683 \\ +Inversion & 4096 & 66,610,468 & 11,834,556 \\ +\hline +Multiply & 128 & 1,426 & 451 \\ +Multiply & 256 & 2,551 & 958 \\ +Multiply & 512 & 7,913 & 2,476 \\ +Multiply & 1024 & 28,496 & 7,927 \\ +Multiply & 2048 & 109,897 & 28,224 \\ +Multiply & 4096 & 469,970 & 101,171 \\ +\hline +Square & 128 & 1,319 & 511 \\ +Square & 256 & 1,776 & 947 \\ +Square & 512 & 5,399 & 2,153 \\ +Square & 1024 & 18,991 & 5,733 \\ +Square & 2048 & 72,126 & 17,621 \\ +Square & 4096 & 306,269 & 67,576 \\ +\hline +Exptmod & 512 & 32,021,586 & 3,118,435 \\ +Exptmod & 768 & 97,595,492 & 8,493,633 \\ +Exptmod & 1024 & 223,302,532 & 17,715,899 \\ +Exptmod & 2048 & 1,682,223,369 & 114,936,361 \\ +Exptmod & 2560 & 3,268,615,571 & 229,402,426 \\ +Exptmod & 3072 & 5,597,240,141 & 367,403,840 \\ +Exptmod & 4096 & 13,347,270,891 & 779,058,433 + +\end{tabular} +\end{center} +\end{small} + +Note that the figures do fluctuate but their magnitudes are relatively intact. The purpose of the chart is not to +get an exact timing but to compare the two libraries. For example, in all of the tests the exact time for a 512-bit +squaring operation was not the same. The observed times were all approximately 2,500 cycles, more importantly they +were always faster than the timings observed with MPI by about the same magnitude. + +\subsection{Digit Size} +The first major contribution to the time savings is the fact that 28 bits are stored per digit instead of the MPI +default of 16. This means in many of the algorithms the savings can be considerable. Consider a baseline multiplier +with a 1024-bit input. 
With MPI the input would be 64 16-bit digits whereas in LibTomMath it would be 37 28-bit digits. +A savings of $64^2 - 37^2 = 2727$ single precision multiplications. + +\subsection{Multiplication Algorithms} +For most inputs a typical baseline $O(n^2)$ multiplier is used which is similar to that of MPI. There are two variants +of the baseline multiplier. The normal and the fast variants. The normal baseline multiplier is the exact same as the +algorithm from MPI. The fast baseline multiplier is optimized for cases where the number of input digits $N$ is less +than or equal to $2^{w}/\beta^2$, where $w$ is the number of bits in a \textbf{mp\_word}. By default a mp\_word is +64-bits which means $N \le 256$ is allowed which represents numbers up to $7168$ bits. + +The fast baseline multiplier is optimized by removing the carry operations from the inner loop. This is often referred +to as the ``comba'' method since it computes the products by columns first then figures out the carries. This has the +effect of making a very simple and parallelizable inner loop. + +For large inputs, typically 80 digits\footnote{By default that is 2240-bits or more.} or more the Karatsuba method is +used. This method has significant overhead but an asymptotic running time of $O(n^{1.584})$ which means for fairly large +inputs this method is faster. The Karatsuba implementation is recursive which means for extremely large inputs they +will benefit from the algorithm. + +MPI only implements the slower baseline multiplier where carries are dealt with in the inner loop. As a result even at +smaller numbers (below the Karatsuba cutoff) the LibTomMath multipliers are faster. + +\subsection{Squaring Algorithms} + +Similar to the multiplication algorithms there are two baseline squaring algorithms. Both have an asymptotic running +time of $O((t^2 + t)/2)$. The normal baseline squaring is the same from MPI and the fast is a ``comba'' squaring +algorithm. 
The comba method is used if the number of digits $N$ is less than $2^{w-1}/\beta^2$ which by default +covers numbers up to $3584$ bits. + +There is also a Karatsuba squaring method which achieves a running time of $O(n^{1.584})$ after considerably large +inputs. + +MPI only implements the slower baseline squaring algorithm. As a result LibTomMath is considerably faster at squaring +than MPI is. + +\subsection{Exponentiation Algorithms} + +LibTomMath implements a sliding window $k$-ary left to right exponentiation algorithm. For a given exponent size $L$ an +appropriate window size $k$ is chosen. There are always at most $L$ modular squarings and $\lfloor L/k \rfloor$ modular +multiplications. The $k$-ary method works by precomputing values $g(x) = b^x$ for $0 \le x < 2^k$ and a given base +$b$. Then the multiplications are grouped in windows of $k$ bits. The sliding window technique has the benefit +that it can skip multiplications if there are zero bits following or preceding a window. Consider the exponent +$e = 11110001_2$ if $k = 2$ then there will be two squarings, a multiplication of $g(3)$, two squarings, a multiplication +of $g(3)$, four squarings and a multiplication by $g(1)$. In total there are 8 squarings and 3 multiplications. + +MPI uses a binary square-multiply method. For the same exponent $e$ it would have had 8 squarings and 5 multiplications. +There is a precomputation phase for the method LibTomMath uses but it generally cuts down considerably on the number +of multiplications. Consider a 512-bit exponent. The worst case for the LibTomMath method results in 512 squarings and +124 multiplications. The MPI method would have 512 squarings and 512 multiplications. Randomly every $2k$ bits another +multiplication is saved via the sliding-window technique on top of the savings the $k$-ary method provides. + +Both LibTomMath and MPI use Barrett reduction instead of division to reduce the numbers modulo the modulus given. 
+However, LibTomMath can take advantage of the fact that the multiplications required within the Barrett reduction +do not have to give full precision. As a result the reduction step is much faster and just as accurate. The LibTomMath code +will automatically determine at run-time (e.g. when its called) whether the faster multiplier can be used. The +faster multipliers have also been optimized into the two variants (baseline and comba baseline). + +As a result of all these changes exponentiation in LibTomMath is much faster than compared to MPI. + + + +\end{document} diff --git a/changes.txt b/changes.txt index d302ee6..e78a2a3 100644 --- a/changes.txt +++ b/changes.txt @@ -1,70 +1,76 @@ -Jan 9th, 2003 -v0.10 -- Pekka Riikonen suggested fixes to the radix conversion code. - -- Added baseline montgomery and comba montgomery reductions, sped up exptmods - [to a point, see bn.h for MONTGOMERY_EXPT_CUTOFF] - -Jan 6th, 2003 -v0.09 -- Updated the manual to reflect recent changes. :-) - -- Added Jacobi function (mp_jacobi) to supplement the number theory side of the lib - -- Added a Mersenne prime finder demo in ./etc/mersenne.c - -Jan 2nd, 2003 -v0.08 -- Sped up the multipliers by moving the inner loop variables into a smaller scope - -- Corrected a bunch of small "warnings" - -- Added more comments - -- Made "mtest" be able to use /dev/random, /dev/urandom or stdin for RNG data - -- Corrected some bugs where error messages were potentially ignored - -- add etc/pprime.c program which makes numbers which are provably prime. - -Jan 1st, 2003 -v0.07 -- Removed alot of heap operations from core functions to speed them up - -- Added a root finding function [and mp_sqrt macro like from MPI] - -- Added more to manual - -Dec 31st, 2002 -v0.06 -- Sped up the s_mp_add, s_mp_sub which inturn sped up mp_invmod, mp_exptmod, etc... - -- Cleaned up the header a bit more - -Dec 30th, 2002 -v0.05 -- Builds with MSVC out of the box - -- Fixed a bug in mp_invmod w.r.t. 
even moduli - -- Made mp_toradix and mp_read_radix use char instead of unsigned char arrays - -- Fixed up exptmod to use fewer multiplications - -- Fixed up mp_init_size to use only one heap operation - -- Note there is a slight "off-by-one" bug in the library somewhere - without the padding (see the source for comment) the library - crashes in libtomcrypt. Anyways a reasonable workaround is to pad the - numbers which will always correct it since as the numbers grow the padding - will still be beyond the end of the number - -- Added more to the manual - -Dec 29th, 2002 -v0.04 -- Fixed a memory leak in mp_to_unsigned_bin - -- optimized invmod code - -- Fixed bug in mp_div - -- use exchange instead of copy for results - -- added a bit more to the manual - -Dec 27th, 2002 -v0.03 -- Sped up s_mp_mul_high_digs by not computing the carries of the lower digits - -- Fixed a bug where mp_set_int wouldn't zero the value first and set the used member. - -- fixed a bug in s_mp_mul_high_digs where the limit placed on the result digits was not calculated properly - -- fixed bugs in add/sub/mul/sqr_mod functions where if the modulus and dest were the same it wouldn't work - -- fixed a bug in mp_mod and mp_mod_d concerning negative inputs - -- mp_mul_d didn't preserve sign - -- Many many many many fixes - -- Works in LibTomCrypt now :-) - -- Added iterations to the timing demos... more accurate. - -- Tom needs a job. - -Dec 26th, 2002 -v0.02 -- Fixed a few "slips" in the manual. This is "LibTomMath" afterall :-) - -- Added mp_cmp_mag, mp_neg, mp_abs and mp_radix_size that were missing. - -- Sped up the fast [comba] multipliers more [yahoo!] - -Dec 25th,2002 -v0.01 -- Initial release. Gimme a break. - -- Todo list, - add details to manual [e.g. algorithms] - more comments in code - example programs \ No newline at end of file +Jan 15th, 2003 +v0.11 -- More subtle fixes + -- Moved to gentoo linux [hurrah!] 
so made *nix specific fixes to the make process + -- Sped up the montgomery reduction code quite a bit + -- fixed up demo so when building timing for the x86 it assumes ELF format now + +Jan 9th, 2003 +v0.10 -- Pekka Riikonen suggested fixes to the radix conversion code. + -- Added baseline montgomery and comba montgomery reductions, sped up exptmods + [to a point, see bn.h for MONTGOMERY_EXPT_CUTOFF] + +Jan 6th, 2003 +v0.09 -- Updated the manual to reflect recent changes. :-) + -- Added Jacobi function (mp_jacobi) to supplement the number theory side of the lib + -- Added a Mersenne prime finder demo in ./etc/mersenne.c + +Jan 2nd, 2003 +v0.08 -- Sped up the multipliers by moving the inner loop variables into a smaller scope + -- Corrected a bunch of small "warnings" + -- Added more comments + -- Made "mtest" be able to use /dev/random, /dev/urandom or stdin for RNG data + -- Corrected some bugs where error messages were potentially ignored + -- add etc/pprime.c program which makes numbers which are provably prime. + +Jan 1st, 2003 +v0.07 -- Removed alot of heap operations from core functions to speed them up + -- Added a root finding function [and mp_sqrt macro like from MPI] + -- Added more to manual + +Dec 31st, 2002 +v0.06 -- Sped up the s_mp_add, s_mp_sub which inturn sped up mp_invmod, mp_exptmod, etc... + -- Cleaned up the header a bit more + +Dec 30th, 2002 +v0.05 -- Builds with MSVC out of the box + -- Fixed a bug in mp_invmod w.r.t. even moduli + -- Made mp_toradix and mp_read_radix use char instead of unsigned char arrays + -- Fixed up exptmod to use fewer multiplications + -- Fixed up mp_init_size to use only one heap operation + -- Note there is a slight "off-by-one" bug in the library somewhere + without the padding (see the source for comment) the library + crashes in libtomcrypt. 
Anyways a reasonable workaround is to pad the + numbers which will always correct it since as the numbers grow the padding + will still be beyond the end of the number + -- Added more to the manual + +Dec 29th, 2002 +v0.04 -- Fixed a memory leak in mp_to_unsigned_bin + -- optimized invmod code + -- Fixed bug in mp_div + -- use exchange instead of copy for results + -- added a bit more to the manual + +Dec 27th, 2002 +v0.03 -- Sped up s_mp_mul_high_digs by not computing the carries of the lower digits + -- Fixed a bug where mp_set_int wouldn't zero the value first and set the used member. + -- fixed a bug in s_mp_mul_high_digs where the limit placed on the result digits was not calculated properly + -- fixed bugs in add/sub/mul/sqr_mod functions where if the modulus and dest were the same it wouldn't work + -- fixed a bug in mp_mod and mp_mod_d concerning negative inputs + -- mp_mul_d didn't preserve sign + -- Many many many many fixes + -- Works in LibTomCrypt now :-) + -- Added iterations to the timing demos... more accurate. + -- Tom needs a job. + +Dec 26th, 2002 +v0.02 -- Fixed a few "slips" in the manual. This is "LibTomMath" afterall :-) + -- Added mp_cmp_mag, mp_neg, mp_abs and mp_radix_size that were missing. + -- Sped up the fast [comba] multipliers more [yahoo!] + +Dec 25th,2002 +v0.01 -- Initial release. Gimme a break. + -- Todo list, + add details to manual [e.g. 
algorithms] + more comments in code + example programs diff --git a/demo.c b/demo.c index ab92707..f482120 100644 --- a/demo.c +++ b/demo.c @@ -19,8 +19,10 @@ #ifdef TIMER_X86 #define TIMER -extern ulong64 rdtsc(void); -extern void reset(void); +extern ulong64 _rdtsc(void); +extern void _reset(void); +ulong64 rdtsc(void) { return _rdtsc(); } +void reset(void) { _reset(); } #endif #ifdef TIMER @@ -85,7 +87,6 @@ int main(void) mp_int a, b, c, d, e, f; unsigned long expt_n, add_n, sub_n, mul_n, div_n, sqr_n, mul2d_n, div2d_n, gcd_n, lcm_n, inv_n; int rr; - mp_digit tom; #ifdef TIMER int n; @@ -99,42 +100,33 @@ int main(void) mp_init(&e); mp_init(&f); - mp_read_radix(&a, "59994534535345535344389423", 10); - mp_read_radix(&b, "49993453555234234565675534", 10); - mp_read_radix(&c, "62398923474472948723847281", 10); - - mp_mulmod(&a, &b, &c, &f); - - /* setup mont */ - mp_montgomery_setup(&c, &tom); - mp_mul(&a, &b, &a); - mp_montgomery_reduce(&a, &c, tom); - mp_montgomery_reduce(&a, &c, tom); - mp_lshd(&a, c.used*2); - mp_mod(&a, &c, &a); - - mp_toradix(&a, cmd, 10); - printf("%s\n\n", cmd); - mp_toradix(&f, cmd, 10); - printf("%s\n", cmd); - -/* return 0; */ - - - mp_read_radix(&a, "V//////////////////////////////////////////////////////////////////////////////////////", 64); - mp_reduce_setup(&b, &a); - printf("\n\n----\n\n"); - mp_toradix(&b, buf, 10); - printf("b == %s\n\n\n", buf); - - mp_read_radix(&b, "4982748972349724892742", 10); - mp_sub_d(&a, 1, &c); - mp_exptmod(&b, &c, &a, &d); - mp_toradix(&d, buf, 10); - printf("b^p-1 == %s\n", buf); - +#ifdef DEBUG + mp_read_radix(&a, "347743159439876626079252796797422223177535447388206607607181663903045907591201940478223621722118173270898487582987137708656414344685816179420855160986340457973820182883508387588163122354089264395604796675278966117567294812714812796820596564876450716066283126720010859041484786529056457896367683122960411136319", 10); + mp_read_radix(&b, 
"347743159439876626079252796797422223177535447388206607607181663903045907591201940478223621722118173270898487582987137708656414344685816179420855160986340457973820182883508387588163122354089264395604796675278966117567294812714812796820596564876450716066283126720010859041484786529056457896367683122960411136318", 10); + mp_set(&c, 1); + reset_timings(); + mp_exptmod(&c, &b, &a, &d); + mp_exptmod(&c, &b, &a, &d); + mp_exptmod(&c, &b, &a, &d); + mp_exptmod(&c, &b, &a, &d); + mp_exptmod(&c, &b, &a, &d); + mp_exptmod(&c, &b, &a, &d); + mp_exptmod(&c, &b, &a, &d); + mp_exptmod(&c, &b, &a, &d); + mp_exptmod(&c, &b, &a, &d); + mp_exptmod(&c, &b, &a, &d); + mp_exptmod(&c, &b, &a, &d); + mp_exptmod(&c, &b, &a, &d); + mp_exptmod(&c, &b, &a, &d); + mp_exptmod(&c, &b, &a, &d); + mp_exptmod(&c, &b, &a, &d); + mp_exptmod(&c, &b, &a, &d); + dump_timings(); + return 0; +#endif #ifdef TIMER +goto expt; mp_read_radix(&a, "340282366920938463463374607431768211455", 10); mp_read_radix(&b, "340282366920938463463574607431768211455", 10); while (a.used * DIGIT_BIT < 8192) { @@ -182,7 +174,7 @@ int main(void) printf("Multiplying %d-bit took %llu cycles\n", mp_count_bits(&a), tt / ((ulong64)100000)); mp_copy(&b, &a); } - +expt: { char *primes[] = { "17933601194860113372237070562165128350027320072176844226673287945873370751245439587792371960615073855669274087805055507977323024886880985062002853331424203", @@ -206,7 +198,7 @@ int main(void) mp_mod(&b, &c, &b); mp_set(&c, 3); reset(); - for (rr = 0; rr < 35; rr++) { + for (rr = 0; rr < 100; rr++) { mp_exptmod(&c, &b, &a, &d); } tt = rdtsc(); @@ -219,7 +211,7 @@ int main(void) draw(&d); exit(0); } - printf("Exponentiating %d-bit took %llu cycles\n", mp_count_bits(&a), tt / ((ulong64)35)); + printf("Exponentiating %d-bit took %llu cycles\n", mp_count_bits(&a), tt / ((ulong64)100)); } } diff --git a/makefile b/makefile index edaf773..7567d22 100644 --- a/makefile +++ b/makefile @@ -1,13 +1,13 @@ CC = gcc -CFLAGS += -Wall -W -Wshadow -ansi -O3 
-fomit-frame-pointer -funroll-loops +CFLAGS += -Wall -W -Wshadow -ansi -O3 -fomit-frame-pointer -funroll-loops -VERSION=0.10 +VERSION=0.11 default: test test: bn.o demo.o $(CC) bn.o demo.o -o demo - cd mtest ; gcc $(CFLAGS) mtest.c -o mtest.exe -s + cd mtest ; gcc $(CFLAGS) mtest.c -o mtest -s # builds the x86 demo test86: @@ -22,9 +22,9 @@ docs: docdvi rm -f bn.log bn.aux bn.dvi clean: - rm -f *.pdf *.o *.exe mtest/*.exe etc/*.exe bn.log bn.aux bn.dvi *.s + rm -f *.pdf *.o *.exe demo mtest/mtest mtest/*.exe etc/*.exe bn.log bn.aux bn.dvi *.log *.s etc/pprime etc/mersenne zipup: clean docs - chdir .. ; rm -rf ltm* libtommath-$(VERSION) ; mkdir libtommath-$(VERSION) ; \ + cd .. ; rm -rf ltm* libtommath-$(VERSION) ; mkdir libtommath-$(VERSION) ; \ cp -R ./libtommath/* ./libtommath-$(VERSION)/ ; tar -c libtommath-$(VERSION)/* > ltm-$(VERSION).tar ; \ - bzip2 -9vv ltm-$(VERSION).tar ; zip -9 -r ltm-$(VERSION).zip libtommath-$(VERSION)/* \ No newline at end of file + bzip2 -9vv ltm-$(VERSION).tar ; zip -9 -r ltm-$(VERSION).zip libtommath-$(VERSION)/* diff --git a/timer.asm b/timer.asm index 2393250..b317e3e 100644 --- a/timer.asm +++ b/timer.asm @@ -1,34 +1,34 @@ -; Simple RDTSC reader for NASM -; -; build with "nasm -f ___ timer.asm" where ___ is coff or elf [or whatever] -; -; Most *nix installs use elf so it would be "nasm -f elf timer.asm" -; -; Tom St Denis -[bits 32] -[section .data] -timer dd 0, 0 -[section .text] - -[global _gettsc] -_gettsc: - rdtsc - ret - -[global _rdtsc] -_rdtsc: - rdtsc - sub eax,[timer] - sbb edx,[timer+4] - ret - -[global _reset] -_reset: - push eax - push edx - rdtsc - mov [timer],eax - mov [timer+4],edx - pop edx - pop eax - ret \ No newline at end of file +; Simple RDTSC reader for NASM +; +; build with "nasm -f ___ timer.asm" where ___ is coff or elf [or whatever] +; +; Most *nix installs use elf so it would be "nasm -f elf timer.asm" +; +; Tom St Denis +[bits 32] +[section .data] +timer dd 0, 0 +[section .text] + +[global _gettsc] 
+_gettsc: + rdtsc + ret + +[global _rdtsc] +_rdtsc: + rdtsc + sub eax,[timer] + sbb edx,[timer+4] + ret + +[global _reset] +_reset: + push eax + push edx + rdtsc + mov [timer],eax + mov [timer+4],edx + pop edx + pop eax + ret